From df1db2535aeb654b408834f45e6e45b1223953ae Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 29 Apr 2022 16:53:02 +0000 Subject: [PATCH 1/9] =?UTF-8?q?=E8=BF=81=E7=A7=BBtransformers=20ver.4.11.3?= =?UTF-8?q?=E7=9A=84bert=20bart=20roberta=20gpt2=E5=92=8Ccpt=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/utils/dummy_class.py | 5 +- fastNLP/transformers/__init__.py | 1 + fastNLP/transformers/torch/__init__.py | 9 + fastNLP/transformers/torch/activations.py | 125 + .../transformers/torch/configuration_utils.py | 777 ++++ fastNLP/transformers/torch/deepspeed.py | 388 ++ .../torch/dependency_versions_check.py | 20 + .../torch/dependency_versions_table.py | 76 + fastNLP/transformers/torch/file_utils.py | 934 +++++ .../torch/generation_beam_search.py | 393 ++ .../torch/generation_logits_process.py | 618 +++ .../torch/generation_stopping_criteria.py | 128 + .../transformers/torch/generation_utils.py | 2579 +++++++++++++ .../transformers/torch/modeling_outputs.py | 816 ++++ fastNLP/transformers/torch/modeling_utils.py | 1888 ++++++++++ fastNLP/transformers/torch/models/__init__.py | 5 + .../torch/models/auto/configuration_auto.py | 541 +++ .../torch/models/auto/tokenization_auto.py | 199 + .../torch/models/bart/__init__.py | 20 + .../torch/models/bart/configuration_bart.py | 177 + .../torch/models/bart/modeling_bart.py | 1834 +++++++++ .../torch/models/bart/tokenization_bart.py | 65 + .../torch/models/bert/__init__.py | 27 + .../torch/models/bert/configuration_bert.py | 158 + .../torch/models/bert/modeling_bert.py | 1806 +++++++++ .../torch/models/bert/tokenization_bert.py | 558 +++ .../transformers/torch/models/cpt/__init__.py | 12 + .../torch/models/cpt/modeling_cpt.py | 1489 ++++++++ .../torch/models/gpt2/__init__.py | 19 + .../torch/models/gpt2/configuration_gpt2.py | 184 + .../torch/models/gpt2/modeling_gpt2.py | 1393 +++++++ .../torch/models/gpt2/tokenization_gpt2.py | 308 ++ .../torch/models/roberta/__init__.py | 21 + .../models/roberta/configuration_roberta.py | 65 + .../torch/models/roberta/modeling_roberta.py | 1584 ++++++++ .../models/roberta/tokenization_roberta.py | 254 ++ .../transformers/torch/tokenization_utils.py | 915 +++++ .../torch/tokenization_utils_base.py | 3351 +++++++++++++++++ fastNLP/transformers/torch/utils/__init__.py | 0 .../torch/utils/model_parallel_utils.py | 54 + fastNLP/transformers/torch/utils/versions.py | 120 + 41 files changed, 23914 insertions(+), 2 deletions(-) create mode 100644 fastNLP/transformers/__init__.py create mode 100644 fastNLP/transformers/torch/__init__.py create mode 100644 fastNLP/transformers/torch/activations.py create mode 100644 fastNLP/transformers/torch/configuration_utils.py create mode 100644 fastNLP/transformers/torch/deepspeed.py create mode 100644 fastNLP/transformers/torch/dependency_versions_check.py create mode 100644 fastNLP/transformers/torch/dependency_versions_table.py create mode 100644 fastNLP/transformers/torch/file_utils.py create mode 100644 fastNLP/transformers/torch/generation_beam_search.py create mode 100644 fastNLP/transformers/torch/generation_logits_process.py create mode 100644 fastNLP/transformers/torch/generation_stopping_criteria.py create mode 100644 fastNLP/transformers/torch/generation_utils.py create mode 100644 fastNLP/transformers/torch/modeling_outputs.py create mode 100644 fastNLP/transformers/torch/modeling_utils.py create mode 100644 fastNLP/transformers/torch/models/__init__.py 
create mode 100644 fastNLP/transformers/torch/models/auto/configuration_auto.py create mode 100644 fastNLP/transformers/torch/models/auto/tokenization_auto.py create mode 100644 fastNLP/transformers/torch/models/bart/__init__.py create mode 100644 fastNLP/transformers/torch/models/bart/configuration_bart.py create mode 100644 fastNLP/transformers/torch/models/bart/modeling_bart.py create mode 100644 fastNLP/transformers/torch/models/bart/tokenization_bart.py create mode 100644 fastNLP/transformers/torch/models/bert/__init__.py create mode 100644 fastNLP/transformers/torch/models/bert/configuration_bert.py create mode 100644 fastNLP/transformers/torch/models/bert/modeling_bert.py create mode 100644 fastNLP/transformers/torch/models/bert/tokenization_bert.py create mode 100644 fastNLP/transformers/torch/models/cpt/__init__.py create mode 100644 fastNLP/transformers/torch/models/cpt/modeling_cpt.py create mode 100644 fastNLP/transformers/torch/models/gpt2/__init__.py create mode 100644 fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py create mode 100644 fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py create mode 100644 fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py create mode 100644 fastNLP/transformers/torch/models/roberta/__init__.py create mode 100644 fastNLP/transformers/torch/models/roberta/configuration_roberta.py create mode 100644 fastNLP/transformers/torch/models/roberta/modeling_roberta.py create mode 100644 fastNLP/transformers/torch/models/roberta/tokenization_roberta.py create mode 100644 fastNLP/transformers/torch/tokenization_utils.py create mode 100644 fastNLP/transformers/torch/tokenization_utils_base.py create mode 100644 fastNLP/transformers/torch/utils/__init__.py create mode 100644 fastNLP/transformers/torch/utils/model_parallel_utils.py create mode 100644 fastNLP/transformers/torch/utils/versions.py diff --git a/fastNLP/core/utils/dummy_class.py b/fastNLP/core/utils/dummy_class.py index 2e97c3e4..2856b656 100644 --- a/fastNLP/core/utils/dummy_class.py +++ b/fastNLP/core/utils/dummy_class.py @@ -1,4 +1,5 @@ - +import functools class DummyClass: - pass \ No newline at end of file + def __call__(self, *args, **kwargs): + return diff --git a/fastNLP/transformers/__init__.py b/fastNLP/transformers/__init__.py new file mode 100644 index 00000000..6403f6b9 --- /dev/null +++ b/fastNLP/transformers/__init__.py @@ -0,0 +1 @@ +"""基于 transformers-4.11.3 版本迁移""" \ No newline at end of file diff --git a/fastNLP/transformers/torch/__init__.py b/fastNLP/transformers/torch/__init__.py new file mode 100644 index 00000000..9ce4fb10 --- /dev/null +++ b/fastNLP/transformers/torch/__init__.py @@ -0,0 +1,9 @@ +""" +为了防止因 https://github.com/huggingface/transformers 版本变化导致代码不兼容,当前 folder 以及子 folder +都复制自 https://github.com/huggingface/transformers 的4.11.3版本。 +In order to avoid the code change of https://github.com/huggingface/transformers to cause version +mismatch, we copy code from https://github.com/huggingface/transformers(version:4.11.3) in this +folder and its subfolder. +""" +__version__ = "4.11.3" +from .models import * \ No newline at end of file diff --git a/fastNLP/transformers/torch/activations.py b/fastNLP/transformers/torch/activations.py new file mode 100644 index 00000000..cf01f2bf --- /dev/null +++ b/fastNLP/transformers/torch/activations.py @@ -0,0 +1,125 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from packaging import version + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + from torch import nn, tanh, sigmoid + from torch.nn.functional import relu +else: + from fastNLP.core.utils.dummy_class import ( + DummyClass as relu, + DummyClass as tanh, + DummyClass as sigmoid, +) + + +def _gelu_python(x): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.4"): + gelu = _gelu_python + else: + gelu = nn.functional.gelu +else: + from fastNLP.core.utils.dummy_class import DummyClass as gelu + +def gelu_fast(x): + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) + + +def quick_gelu(x): + return x * torch.sigmoid(1.702 * x) + + +def _silu_python(x): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + return x * torch.sigmoid(x) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.7"): + silu = _silu_python + else: + silu = nn.functional.silu +else: + from fastNLP.core.utils.dummy_class import DummyClass as silu + + +def _mish_python(x): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). 
Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + return x * torch.tanh(nn.functional.softplus(x)) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.9"): + mish = _mish_python + else: + mish = nn.functional.mish +else: + from fastNLP.core.utils.dummy_class import DummyClass as mish + + +def linear_act(x): + return x + + +ACT2FN = { + "relu": relu, + "silu": silu, + "swish": silu, + "gelu": gelu, + "tanh": tanh, + "gelu_new": gelu_new, + "gelu_fast": gelu_fast, + "quick_gelu": quick_gelu, + "mish": mish, + "linear": linear_act, + "sigmoid": sigmoid, +} + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") diff --git a/fastNLP/transformers/torch/configuration_utils.py b/fastNLP/transformers/torch/configuration_utils.py new file mode 100644 index 00000000..9c17f336 --- /dev/null +++ b/fastNLP/transformers/torch/configuration_utils.py @@ -0,0 +1,777 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + + +import copy +import json +import os +from typing import Any, Dict, Tuple, Union + +from . import __version__ +from .file_utils import ( + CONFIG_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, + is_remote_url, +) +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + + +class PretrainedConfig: + r""" + Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as + methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to + initialize a model does **not** load the model weights. It only affects the model's configuration. + + Class attributes (overridden by derived classes) + + - **model_type** (:obj:`str`) -- An identifier for the model type, serialized into the JSON file, and used to + recreate the correct object in :class:`~transformers.AutoConfig`. + - **is_composition** (:obj:`bool`) -- Whether the config class is composed of multiple sub-configs. In this + case the config has to be initialized from two or more configs of type + :class:`~transformers.PretrainedConfig` like: :class:`~transformers.EncoderDecoderConfig` or + :class:`~RagConfig`. + - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at + dictionary outputs of the model during inference. + - **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model specific attribute names to the + standardized naming of attributes. 
+ + Common attributes (present in all subclasses) + + - **vocab_size** (:obj:`int`) -- The number of tokens in the vocabulary, which is also the first dimension of + the embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT). + - **hidden_size** (:obj:`int`) -- The hidden size of the model. + - **num_attention_heads** (:obj:`int`) -- The number of attention heads used in the multi-head attention layers + of the model. + - **num_hidden_layers** (:obj:`int`) -- The number of blocks in the model. + + Args: + name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`): + Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or + :func:`~transformers.TFPreTrainedModel.from_pretrained` as ``pretrained_model_name_or_path`` if the + configuration was created with such a method. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the model should return all hidden-states. + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the model should returns all attentions. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain + tuple. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model is used as an encoder/decoder or not. + is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model is used as decoder or not (in which case it's used as an encoder). + add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether cross-attention layers should be added to the model. Note, this option is only relevant for models + that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which + consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``. + tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder + and decoder model to have the exact same parameter names. + prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`): + Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of + heads to prune in said layer. + + For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`): + The chunk size of all feed forward layers in the residual attention blocks. A chunk size of :obj:`0` means + that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes + :obj:`n` < sequence_length embeddings at a time. For more information on feed forward chunking, see `How + does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . + + Parameters for sequence generation + + - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by default in the + :obj:`generate` method of the model. + - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by default in the + :obj:`generate` method of the model. + - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in the + :obj:`generate` method of the model. 
Whether or not to use sampling ; use greedy decoding otherwise. + - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default + in the :obj:`generate` method of the model. Whether to stop the beam search when at least ``num_beams`` + sentences are finished per batch or not. + - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be used by + default in the :obj:`generate` method of the model. 1 means no beam search. + - **num_beam_groups** (:obj:`int`, `optional`, defaults to 1) -- Number of groups to divide :obj:`num_beams` + into in order to ensure diversity among different groups of beams that will be used by default in the + :obj:`generate` method of the model. 1 means no group beam search. + - **diversity_penalty** (:obj:`float`, `optional`, defaults to 0.0) -- Value to control diversity for group + beam search. that will be used by default in the :obj:`generate` method of the model. 0 means no diversity + penalty. The higher the penalty, the more diverse are the outputs. + - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to module the next token + probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly + positive. + - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to keep + for top-k-filtering that will be used by default in the :obj:`generate` method of the model. + - **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the + :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens with + probabilities that add up to ``top_p`` or higher are kept for generation. + - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty that + will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty. + - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that will + be used by default in the :obj:`generate` method of the model. + - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default in the + :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of that size + can only occur once. + - **encoder_no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by + default in the :obj:`generate` method of the model for ``encoder_no_repeat_ngram_size``. If set to int > 0, + all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the ``decoder_input_ids``. + - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be generated + that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the + words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, + add_prefix_space=True)`. + - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed returned + sequences for each element in the batch that will be used by default in the :obj:`generate` method of the + model. 
+ - **output_scores** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should return the + logits when used for generation + - **return_dict_in_generate** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should + return a :class:`~transformers.file_utils.ModelOutput` instead of a :obj:`torch.LongTensor` + - **forced_bos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the first generated token + after the :obj:`decoder_start_token_id`. Useful for multilingual models like :doc:`mBART + <../model_doc/mbart>` where the first generated token needs to be the target language token. + - **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token + when :obj:`max_length` is reached. + - **remove_invalid_values** (:obj:`bool`, `optional`) -- Whether to remove possible `nan` and `inf` outputs of + the model to prevent the generation method to crash. Note that using ``remove_invalid_values`` can slow down + generation. + + + Parameters for fine-tuning tasks + + - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the model + pretrained weights. + - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be + used when converting from an original (TensorFlow or PyTorch) checkpoint. + - **id2label** (:obj:`Dict[int, str]`, `optional`) -- A map from index (for instance prediction index, or + target index) to label. + - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model. + - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model, + typically for a classification task. + - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for the + current task. + - **problem_type** (:obj:`str`, `optional`) -- Problem type for :obj:`XxxForSequenceClassification` models. Can + be one of (:obj:`"regression"`, :obj:`"single_label_classification"`, :obj:`"multi_label_classification"`). + Please note that this parameter is only available in the following models: `AlbertForSequenceClassification`, + `BertForSequenceClassification`, `BigBirdForSequenceClassification`, `ConvBertForSequenceClassification`, + `DistilBertForSequenceClassification`, `ElectraForSequenceClassification`, `FunnelForSequenceClassification`, + `LongformerForSequenceClassification`, `MobileBertForSequenceClassification`, + `ReformerForSequenceClassification`, `RobertaForSequenceClassification`, + `SqueezeBertForSequenceClassification`, `XLMForSequenceClassification` and `XLNetForSequenceClassification`. + + Parameters linked to the tokenizer + + - **tokenizer_class** (:obj:`str`, `optional`) -- The name of the associated tokenizer class to use (if none is + set, will use the tokenizer associated to the model by default). + - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each text + before calling the model. + - **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token. + - **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token. + - **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token. + - **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with a + different token than `bos`, the id of that token. 
+ - **sep_token_id** (:obj:`int`, `optional`)) -- The id of the `separation` token. + + PyTorch specific parameters + + - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be + used with Torchscript. + - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and + output word embeddings should be tied. Note that this is only relevant if the model has a output word + embedding layer. + - **torch_dtype** (:obj:`str`, `optional`) -- The :obj:`dtype` of the weights. This attribute can be used to + initialize the model to a non-default ``dtype`` (which is normally ``float32``) and thus allow for optimal + storage allocation. For example, if the saved model is ``float16``, ideally we want to load it back using the + minimal amount of memory needed to load ``float16`` weights. Since the config object is stored in plain text, + this attribute contains just the floating type string without the ``torch.`` prefix. For example, for + ``torch.float16`` ``torch_dtype`` is the ``"float16"`` string. + + This attribute is currently not being used during model loading time, but this may change in the future + versions. But we can already start preparing for the future by saving the dtype with save_pretrained. + + TensorFlow specific parameters + + - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use + BFloat16 scalars (only used by some TensorFlow models). + """ + model_type: str = "" + is_composition: bool = False + attribute_map: Dict[str, str] = {} + + def __setattr__(self, key, value): + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) + + def __getattribute__(self, key): + if key != "attribute_map" and key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + return super().__getattribute__(key) + + def __init__(self, **kwargs): + # Attributes with defaults + self.return_dict = kwargs.pop("return_dict", True) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_attentions = kwargs.pop("output_attentions", False) + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.torch_dtype = kwargs.pop("torch_dtype", None) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) + self.tie_word_embeddings = kwargs.pop( + "tie_word_embeddings", True + ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models. 
+ + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder + self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) + self.is_decoder = kwargs.pop("is_decoder", False) + self.add_cross_attention = kwargs.pop("add_cross_attention", False) + self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False) + + # Parameters for sequence generation + self.max_length = kwargs.pop("max_length", 20) + self.min_length = kwargs.pop("min_length", 0) + self.do_sample = kwargs.pop("do_sample", False) + self.early_stopping = kwargs.pop("early_stopping", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) + self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) + self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0) + self.bad_words_ids = kwargs.pop("bad_words_ids", None) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) + self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0) + self.output_scores = kwargs.pop("output_scores", False) + self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False) + self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None) + self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None) + self.remove_invalid_values = kwargs.pop("remove_invalid_values", False) + + # Fine-tuning task arguments + self.architectures = kwargs.pop("architectures", None) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.id2label = kwargs.pop("id2label", None) + self.label2id = kwargs.pop("label2id", None) + if self.id2label is not None: + kwargs.pop("num_labels", None) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + # Keys are always strings in JSON so convert ids to int here. 
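            # For illustration (hypothetical labels): a config.json carrying `id2label={"0": "negative", "1": "positive"}`
            # arrives with string keys and is normalized to `{0: "negative", 1: "positive"}` here; `num_labels` is then
            # derived from `len(id2label)` by the `num_labels` property below, and any explicit `num_labels` kwarg is
            # dropped in this branch so the two cannot disagree.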
+ else: + self.num_labels = kwargs.pop("num_labels", 2) + + if self.torch_dtype is not None and isinstance(self.torch_dtype, str): + # we will start using self.torch_dtype in v5, but to be consistent with + # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object + if _NEED_IMPORT_TORCH: + import torch + + self.torch_dtype = getattr(torch, self.torch_dtype) + + # Tokenizer arguments TODO: eventually tokenizer and models should share the same config + self.tokenizer_class = kwargs.pop("tokenizer_class", None) + self.prefix = kwargs.pop("prefix", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + self.sep_token_id = kwargs.pop("sep_token_id", None) + + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + + # task specific arguments + self.task_specific_params = kwargs.pop("task_specific_params", None) + + # regression / multi-label classification + self.problem_type = kwargs.pop("problem_type", None) + allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification") + if self.problem_type is not None and self.problem_type not in allowed_problem_types: + raise ValueError( + f"The config parameter `problem_type` was not understood: received {self.problem_type}" + "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." + ) + + # TPU arguments + if kwargs.pop("xla_device", None) is not None: + logger.warning( + "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can " + "safely remove it from your `config.json` file." + ) + + # Name or path to the pretrained checkpoint + self._name_or_path = str(kwargs.pop("name_or_path", "")) + + # Drop the transformers version info + self.transformers_version = kwargs.pop("transformers_version", None) + + # Deal with gradient checkpointing + if kwargs.get("gradient_checkpointing", False): + logger.warn( + "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 " + "Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the " + "`Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`." + ) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + @property + def name_or_path(self) -> str: + return self._name_or_path + + @name_or_path.setter + def name_or_path(self, value): + self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + + @property + def use_return_dict(self) -> bool: + """ + :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples. + """ + # If torchscript is set, force `return_dict=False` to avoid jit errors + return self.return_dict and not self.torchscript + + @property + def num_labels(self) -> int: + """ + :obj:`int`: The number of labels for classification models. 
+ """ + return len(self.id2label) + + @num_labels.setter + def num_labels(self, num_labels: int): + if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels: + self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the + :func:`~transformers.PretrainedConfig.from_pretrained` class method. + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. + + kwargs: + Additional key word arguments passed along to the + :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + r""" + Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model + configuration. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + - a string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the + :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g., + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if + they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
+ use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final configuration object. + + If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` + is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., + the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. + kwargs (:obj:`Dict[str, Any]`, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the ``return_unused_kwargs`` keyword parameter. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model. + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) + assert config.output_attentions == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, + foo=False, return_unused_kwargs=True) + assert config.output_attentions == True + assert unused_kwargs == {'foo': False} + + """ + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warn( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + :class:`~transformers.PretrainedConfig` using ``from_dict``. + + + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
+ + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + else: + config_file = hf_bucket_url( + pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + # Load config dict + config_dict = cls._dict_from_json_file(resolved_config_file) + + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + except (json.JSONDecodeError, UnicodeDecodeError): + msg = ( + f"Couldn't reach server at '{config_file}' to download configuration file or " + "configuration file is not a valid JSON file. " + f"Please check network or file content here: {resolved_config_file}." + ) + raise EnvironmentError(msg) + + if resolved_config_file == config_file: + logger.info(f"loading configuration file {config_file}") + else: + logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + + return config_dict, kwargs + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": + """ + Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + :func:`~transformers.PretrainedConfig.get_config_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from those parameters. 
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + if key != "torch_dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Model config {config}") + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig": + """ + Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters. + + Args: + json_file (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from that JSON file. + + """ + config_dict = cls._dict_from_json_file(json_file) + return cls(**config_dict) + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = PretrainedConfig().to_dict() + + # get class specific config dict + class_config_dict = self.__class__().to_dict() if not self.is_composition else {} + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if ( + key not in default_config_dict + or key == "transformers_version" + or value != default_config_dict[key] + or (key in class_config_dict and value != class_config_dict[key]) + ): + serializable_config_dict[key] = value + + self.dict_torch_dtype_to_str(serializable_config_dict) + + return serializable_config_dict + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + + # Transformers version when serializing the model + output["transformers_version"] = __version__ + + self.dict_torch_dtype_to_str(output) + + return output + + def to_json_string(self, use_diff: bool = True) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, only the difference between the config instance and the default + ``PretrainedConfig()`` is serialized to JSON string. + + Returns: + :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format. 
+ """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + """ + Save this instance to a JSON file. + + Args: + json_file_path (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, only the difference between the config instance and the default + ``PretrainedConfig()`` is serialized to JSON file. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def update(self, config_dict: Dict[str, Any]): + """ + Updates attributes of this class with attributes from ``config_dict``. + + Args: + config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + def update_from_string(self, update_str: str): + """ + Updates attributes of this class with attributes from ``update_str``. + + The expected format is ints, floats and strings as is, and for booleans use ``true`` or ``false``. For example: + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + + The keys to change have to already exist in the config object. + + Args: + update_str (:obj:`str`): String with attributes that should be updated for this class. + + """ + + d = dict(x.split("=") for x in update_str.split(",")) + for k, v in d.items(): + if not hasattr(self, k): + raise ValueError(f"key {k} isn't in the original config dict") + + old_v = getattr(self, k) + if isinstance(old_v, bool): + if v.lower() in ["true", "1", "y", "yes"]: + v = True + elif v.lower() in ["false", "0", "n", "no"]: + v = False + else: + raise ValueError(f"can't derive true or false from {v} (key {k})") + elif isinstance(old_v, int): + v = int(v) + elif isinstance(old_v, float): + v = float(v) + elif not isinstance(old_v, str): + raise ValueError( + f"You can only update int, float, bool or string values in the config, got {v} for key {k}" + ) + + setattr(self, k, v) + + def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: + """ + Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a + string of just the type. For example, :obj:`torch.float32` get converted into `"float32"` string, which can + then be stored in the json format. + """ + if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): + d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] \ No newline at end of file diff --git a/fastNLP/transformers/torch/deepspeed.py b/fastNLP/transformers/torch/deepspeed.py new file mode 100644 index 00000000..fc3fcc7c --- /dev/null +++ b/fastNLP/transformers/torch/deepspeed.py @@ -0,0 +1,388 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Integration with Deepspeed +""" + +import importlib.util +import io +import json +import weakref +from copy import deepcopy +from functools import partialmethod + +from .dependency_versions_check import dep_version_check +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + + +def is_deepspeed_available(): + return importlib.util.find_spec("deepspeed") is not None + + +class HfDeepSpeedConfig: + """ + This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. + + A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where + things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``). + Therefore it's important that this object remains alive while the program is still running. + + :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to + sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder + values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way. + + Args: + config_file_or_dict (:obj:`Union[str, Dict]`): path to DeepSpeed config file or dict. + + """ + + def __init__(self, config_file_or_dict): + # set global weakref object + set_hf_deepspeed_config(self) + + dep_version_check("deepspeed") + + if isinstance(config_file_or_dict, dict): + # Don't modify user's data should they want to reuse it (e.g. in tests), because once we + # modified it, it will not be accepted here again, since `auto` values would have been overridden + config = deepcopy(config_file_or_dict) + elif isinstance(config_file_or_dict, str): + with io.open(config_file_or_dict, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") + self.config = config + + # zero stage - this is done as early as possible, before model is created, to allow + # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object + # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc. + self._stage = self.get_value("zero_optimization.stage", -1) + + # offload + self._offload = False + if self.is_zero2() or self.is_zero3(): + offload_devices_valid = set(["cpu", "nvme"]) + offload_devices = set( + [ + self.get_value("zero_optimization.offload_optimizer.device"), + self.get_value("zero_optimization.offload_param.device"), + ] + ) + if len(offload_devices & offload_devices_valid) > 0: + self._offload = True + + def find_config_node(self, ds_key_long): + config = self.config + + # find the config node of interest if it exists + nodes = ds_key_long.split(".") + ds_key = nodes.pop() + for node in nodes: + config = config.get(node) + if config is None: + return None, ds_key + + return config, ds_key + + def get_value(self, ds_key_long, default=None): + """ + Returns the set value or ``default`` if no value is set + """ + config, ds_key = self.find_config_node(ds_key_long) + if config is None: + return default + return config.get(ds_key, default) + + def is_true(self, ds_key_long): + """ + Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. 
So use this method to + ask the very specific question of whether the value is set to :obj:`True` (and it's not set to :obj:`False` or + isn't set). + + """ + value = self.get_value(ds_key_long) + return False if value is None else bool(value) + + def is_false(self, ds_key_long): + """ + Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to + ask the very specific question of whether the value is set to :obj:`False` (and it's not set to :obj:`True` or + isn't set). + """ + value = self.get_value(ds_key_long) + return False if value is None else not bool(value) + + def is_zero2(self): + return self._stage == 2 + + def is_zero3(self): + return self._stage == 3 + + def is_offload(self): + return self._offload + + +class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): + """ + The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has + the same lifespan as the latter. + """ + + def __init__(self, config_file_or_dict): + super().__init__(config_file_or_dict) + self._dtype = torch.float16 + self.mismatches = [] + + def dtype(self): + return self._dtype + + def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): + """ + A utility method that massages the config file and can optionally verify that the values match. + + 1. Replace "auto" values with ``TrainingArguments`` value. + + 2. If it wasn't "auto" and ``must_match`` is true, then check that DS config matches Trainer + config values and if mismatched add the entry to ``self.mismatched`` - will assert during + ``trainer_config_finalize`` for one or more mismatches. + + """ + config, ds_key = self.find_config_node(ds_key_long) + if config is None: + return + + if config.get(ds_key) == "auto": + config[ds_key] = hf_val + return + + if not must_match: + return + + ds_val = config.get(ds_key) + if ds_val is not None and ds_val != hf_val: + self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}") + + fill_only = partialmethod(fill_match, must_match=False) + + def trainer_config_process(self, args): + """ + Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object + creation. 
+ """ + # DeepSpeed does: + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + self.fill_match( + "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size" + ) + self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps") + self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)") + self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") + + self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") + self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2") + self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") + self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") + + self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg + self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate") + # total_num_steps - will get set in trainer_config_finalize + + # fp16 + if args.fp16: + fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" + else: + fp16_backend = None + + # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set + # any here unless the user did the work + self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16+fp16_backend(amp)") + + # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any + # ZeRO features + self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") + self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") + + # only if we have an explicit fp16.enabled = False then it's fp32, if it's True or this + # whole config section is missing then the fallback is fp16 + if self.is_false("fp16.enabled"): + self._dtype = torch.float32 + # later there will be other dtypes besides just fp16 and fp32 + # also not quite sure what dtype should be under apex, defaulting to fp16 for now + + def trainer_config_finalize(self, args, model, num_training_steps): + """ + This stage is run after we have the model and know num_training_steps. + + Now we we can complete the configuration process. + """ + # zero + if self.is_zero3(): + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size) + self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size) + + # scheduler + self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)") + self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps") + + if len(self.mismatches) > 0: + mismatches = "\n".join(self.mismatches) + raise ValueError( + f"Please correct the following DeepSpeed config values that mismatch TrainingArguments values:\n{mismatches}\n" + "The easiest method is to set these DeepSpeed config values to 'auto'." 
+ ) + + +# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle +_hf_deepspeed_config_weak_ref = None + + +def set_hf_deepspeed_config(hf_deepspeed_config_obj): + # this is a special weakref global object to allow us to get to Deepspeed config from APIs + # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. + global _hf_deepspeed_config_weak_ref + # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) + _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) + + +def is_deepspeed_zero3_enabled(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().is_zero3() + else: + return False + + +def deepspeed_config(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().config + else: + return None + + +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): + """ + Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. + + If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. + + Args: + trainer: Trainer object + num_training_steps: per single gpu + resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + + Returns: model, optimizer, lr_scheduler + + """ + import deepspeed + from deepspeed.utils import logger as ds_logger + + model = trainer.model + args = trainer.args + + hf_deepspeed_config = args.hf_deepspeed_config + hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = hf_deepspeed_config.config + + # Optimizer + Scheduler + # Currently supported combos: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: Yes + # 3. DS scheduler + HF optimizer: Yes + # 4. HF scheduler + DS optimizer: Yes + # + # Unless Offload is enabled in which case it's: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: Mostly* + # 3. DS scheduler + HF optimizer: Mostly* + # 4. HF scheduler + DS optimizer: Yes + # + # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) + + optimizer = None + if "optimizer" in config: + if args.adafactor: + raise ValueError( + "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " + "Only one optimizer can be configured." + ) + else: + if hf_deepspeed_config.is_offload(): + logger.info( + "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)" + ) + + # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. + # But trainer uses AdamW by default. 
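# Illustrative only, not part of the patch: the kind of DeepSpeed config dict that
# deepspeed_init() consumes. The key names follow DeepSpeed's documented schema, but the
# concrete values are hypothetical; every "auto" entry is one that
# trainer_config_process()/trainer_config_finalize() above fill in from TrainingArguments.
ds_config_example = {
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "fp16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW",
                  "params": {"lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto"}},
    "scheduler": {"type": "WarmupLR",
                  "params": {"warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto"}},
    "zero_optimization": {"stage": 2, "offload_optimizer": {"device": "cpu"}},
}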
+ optimizer = trainer.create_optimizer() + # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` + config["zero_allow_untested_optimizer"] = True + + def _lr_scheduler_callable(optimizer): + return trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + lr_scheduler = None + if "scheduler" not in config: + if optimizer is None: + # Optimizer is not available, so use callable to defer lr_scheduler creation to DS init + lr_scheduler = _lr_scheduler_callable + else: + lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + # keep for quick debug: + # from pprint import pprint; pprint(config) + + # set the Deepspeed log level consistent with the trainer + ds_logger.setLevel(args.get_process_log_level()) + + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + model_parameters=model_parameters, + config_params=config, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + if resume_from_checkpoint is not None: + + # it's possible that the user is trying to resume from model_path, which doesn't necessarily + # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's + # a resume from a checkpoint and not just a local pretrained weight. So we check here if the + # path contains what looks like a deepspeed checkpoint + import glob + + deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) + + if len(deepspeed_checkpoint_dirs) > 0: + logger.info(f"Attempting to resume from {resume_from_checkpoint}") + # this magically updates self.optimizer and self.lr_scheduler + load_path, _ = model.load_checkpoint( + resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True + ) + if load_path is None: + raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") + else: + logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") + + return model, optimizer, lr_scheduler diff --git a/fastNLP/transformers/torch/dependency_versions_check.py b/fastNLP/transformers/torch/dependency_versions_check.py new file mode 100644 index 00000000..30e8f448 --- /dev/null +++ b/fastNLP/transformers/torch/dependency_versions_check.py @@ -0,0 +1,20 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from .dependency_versions_table import deps +from .utils.versions import require_version + +def dep_version_check(pkg, hint=None): + require_version(deps[pkg], hint) diff --git a/fastNLP/transformers/torch/dependency_versions_table.py b/fastNLP/transformers/torch/dependency_versions_table.py new file mode 100644 index 00000000..ef396637 --- /dev/null +++ b/fastNLP/transformers/torch/dependency_versions_table.py @@ -0,0 +1,76 @@ +# THIS FILE HAS BEEN AUTOGENERATED. To update: +# 1. 
modify the `_deps` dict in setup.py +# 2. run `make deps_table_update`` +deps = { + "Pillow": "Pillow", + "black": "black==21.4b0", + "codecarbon": "codecarbon==1.2.0", + "cookiecutter": "cookiecutter==1.7.2", + "dataclasses": "dataclasses", + "datasets": "datasets", + "deepspeed": "deepspeed>=0.5.3", + "docutils": "docutils==0.16.0", + "fairscale": "fairscale>0.3", + "faiss-cpu": "faiss-cpu", + "fastapi": "fastapi", + "filelock": "filelock", + "flake8": "flake8>=3.8.3", + "flax": "flax>=0.3.4", + "fugashi": "fugashi>=1.0", + "GitPython": "GitPython<3.1.19", + "huggingface-hub": "huggingface-hub>=0.0.17", + "importlib_metadata": "importlib_metadata", + "ipadic": "ipadic>=1.0.0,<2.0", + "isort": "isort>=5.5.4", + "jax": "jax>=0.2.8", + "jaxlib": "jaxlib>=0.1.65", + "jieba": "jieba", + "keras2onnx": "keras2onnx", + "nltk": "nltk", + "numpy": "numpy>=1.17", + "onnxconverter-common": "onnxconverter-common", + "onnxruntime-tools": "onnxruntime-tools>=1.4.2", + "onnxruntime": "onnxruntime>=1.4.0", + "optuna": "optuna", + "optax": "optax>=0.0.8", + "packaging": "packaging>=20.0", + "parameterized": "parameterized", + "protobuf": "protobuf", + "psutil": "psutil", + "pyyaml": "pyyaml>=5.1", + "pydantic": "pydantic", + "pytest": "pytest", + "pytest-timeout": "pytest-timeout", + "pytest-xdist": "pytest-xdist", + "python": "python>=3.6.0", + "ray[tune]": "ray[tune]", + "recommonmark": "recommonmark", + "regex": "regex!=2019.12.17", + "requests": "requests", + "rouge-score": "rouge-score", + "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", + "sacremoses": "sacremoses", + "sagemaker": "sagemaker>=2.31.0", + "scikit-learn": "scikit-learn", + "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", + "sigopt": "sigopt", + "soundfile": "soundfile", + "sphinx-copybutton": "sphinx-copybutton", + "sphinx-markdown-tables": "sphinx-markdown-tables", + "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", + "sphinx": "sphinx==3.2.1", + "sphinxext-opengraph": "sphinxext-opengraph==0.4.1", + "sphinx-intl": "sphinx-intl", + "starlette": "starlette", + "tensorflow-cpu": "tensorflow-cpu>=2.3", + "tensorflow": "tensorflow>=2.3", + "timeout-decorator": "timeout-decorator", + "timm": "timm", + "tokenizers": "tokenizers>=0.10.1,<0.11", + "torch": "torch>=1.0", + "torchaudio": "torchaudio", + "tqdm": "tqdm>=4.27", + "unidic": "unidic>=1.0.2", + "unidic_lite": "unidic_lite>=1.0.7", + "uvicorn": "uvicorn", +} diff --git a/fastNLP/transformers/torch/file_utils.py b/fastNLP/transformers/torch/file_utils.py new file mode 100644 index 00000000..2b606b33 --- /dev/null +++ b/fastNLP/transformers/torch/file_utils.py @@ -0,0 +1,934 @@ +import copy +import fnmatch +import importlib.util +import io +import json +import os +import re +import shutil +import sys +import tarfile +import tempfile +import operator +from collections import OrderedDict, UserDict +from contextlib import contextmanager +from dataclasses import fields +from enum import Enum +from functools import partial +from hashlib import sha256 +from pathlib import Path +from typing import Any, BinaryIO, Dict, Optional, Tuple, Union +from urllib.parse import urlparse +from uuid import uuid4 +from zipfile import ZipFile, is_zipfile + +import numpy as np +# from tqdm.auto import tqdm + +import requests + +from . 
import __version__ +from .utils.versions import importlib_metadata +from fastNLP.envs.imports import _NEED_IMPORT_TORCH, _TORCH_GREATER_EQUAL_1_8 +from fastNLP.envs.utils import _compare_version +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + _torch_version = importlib_metadata.version("torch") + +hf_cache_home = os.path.expanduser( + os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) +) +default_cache_path = os.path.join(hf_cache_home, "transformers") + +PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) +PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) +TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) +SESSION_ID = uuid4().hex + +ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} + +DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES + +WEIGHTS_NAME = "pytorch_model.bin" +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + +_staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES +_default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co" + +HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", _default_endpoint) +HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}" + +CONFIG_NAME = "config.json" + +_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False + +@contextmanager +def filelock(path): + try: + import fcntl + open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC + fd = os.open(path, open_mode) + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except: + pass + + yield + + try: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + except: + pass + +def is_offline_mode(): + return _is_offline_mode + +def is_training_run_on_sagemaker(): + return "SAGEMAKER_JOB_NAME" in os.environ + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_model_forward(*docstr): + def docstring_decorator(fn): + class_name = f":class:`~transformers.{fn.__qualname__.split('.')[0]}`" + intro = f" The {class_name} forward method, overrides the :func:`__call__` special method." + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within this function, one should call the + :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post + processing steps while the latter silently ignores them. + """ + fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn + + return docstring_decorator + +PT_RETURN_INTRODUCTION = r""" + Returns: + :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` or a tuple of + :obj:`torch.FloatTensor` (if ``return_dict=False`` is passed or when ``config.return_dict=False``) comprising + various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs. 
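# A minimal sketch, not part of the patch, of what the docstring decorators defined
# above do; ``shared_intro`` and ``forward`` are hypothetical names.
from fastNLP.transformers.torch.file_utils import add_start_docstrings

shared_intro = "Shared intro paragraph.\n\n"

@add_start_docstrings(shared_intro)
def forward(x):
    """Function-specific details."""
    return x

assert forward.__doc__ == "Shared intro paragraph.\n\nFunction-specific details."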
+ +""" + +def _get_indent(t): + """Returns the indentation in the first line of t""" + search = re.search(r"^(\s*)\S", t) + return "" if search is None else search.groups()[0] + + +def _convert_output_args_doc(output_args_doc): + """Convert output_args_doc to display properly.""" + # Split output_arg_doc in blocks argument/description + indent = _get_indent(output_args_doc) + blocks = [] + current_block = "" + for line in output_args_doc.split("\n"): + # If the indent is the same as the beginning, the line is the name of new arg. + if _get_indent(line) == indent: + if len(current_block) > 0: + blocks.append(current_block[:-1]) + current_block = f"{line}\n" + else: + # Otherwise it's part of the description of the current arg. + # We need to remove 2 spaces to the indentation. + current_block += f"{line[2:]}\n" + blocks.append(current_block[:-1]) + + # Format each block for proper rendering + for i in range(len(blocks)): + blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i]) + blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i]) + + return "\n".join(blocks) + +def _prepare_output_docstrings(output_type, config_class): + """ + Prepares the return part of the docstring using `output_type`. + """ + docstrings = output_type.__doc__ + + # Remove the head of the docstring to keep the list of args only + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + docstrings = "\n".join(lines[(i + 1) :]) + docstrings = _convert_output_args_doc(docstrings) + + # Add the return introduction + full_output_type = f"{output_type.__module__}.{output_type.__name__}" + intro = PT_RETURN_INTRODUCTION + intro = intro.format(full_output_type=full_output_type, config_class=config_class) + return intro + docstrings + +PT_TOKEN_CLASSIFICATION_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 + + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_QUESTION_ANSWERING_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + >>> inputs = tokenizer(question, text, return_tensors='pt') + >>> start_positions = torch.tensor([1]) + >>> end_positions = torch.tensor([3]) + + >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions) + >>> loss = outputs.loss + >>> start_scores = outputs.start_logits + >>> end_scores = outputs.end_logits +""" + +PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + >>> outputs = model(**inputs, labels=labels) + >>> loss = 
outputs.loss + >>> logits = outputs.logits +""" + +PT_MASKED_LM_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") + >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] + + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_BASE_MODEL_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state +""" + +PT_MULTIPLE_CHOICE_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> choice0 = "It is eaten with a fork and a knife." + >>> choice1 = "It is eaten while held in the hand." + >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 + + >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True) + >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 + + >>> # the linear classifier still needs to be trained + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_CAUSAL_LM_SAMPLE = r""" + Example:: + + >>> import torch + >>> from transformers import {tokenizer_class}, {model_class} + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs, labels=inputs["input_ids"]) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_SAMPLE_DOCSTRINGS = { + "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE, + "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE, + "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE, + "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE, + "MaskedLM": PT_MASKED_LM_SAMPLE, + "LMHead": PT_CAUSAL_LM_SAMPLE, + "BaseModel": PT_BASE_MODEL_SAMPLE, +} + +def add_code_sample_docstrings( + *docstr, tokenizer_class=None, checkpoint=None, output_type=None, config_class=None, mask=None, model_cls=None +): + def docstring_decorator(fn): + # model_class defaults to function's class if not specified otherwise + model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls + + sample_docstrings = PT_SAMPLE_DOCSTRINGS + + doc_kwargs = dict(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint) + + if "SequenceClassification" in model_class: + code_sample = sample_docstrings["SequenceClassification"] + elif "QuestionAnswering" in model_class: + code_sample = sample_docstrings["QuestionAnswering"] + elif "TokenClassification" in model_class: + code_sample = sample_docstrings["TokenClassification"] + elif 
"MultipleChoice" in model_class: + code_sample = sample_docstrings["MultipleChoice"] + elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]: + doc_kwargs["mask"] = "[MASK]" if mask is None else mask + code_sample = sample_docstrings["MaskedLM"] + elif "LMHead" in model_class or "CausalLM" in model_class: + code_sample = sample_docstrings["LMHead"] + elif "Model" in model_class or "Encoder" in model_class: + code_sample = sample_docstrings["BaseModel"] + else: + raise ValueError(f"Docstring can't be built for model {model_class}") + + output_doc = _prepare_output_docstrings(output_type, config_class) if output_type is not None else "" + built_doc = code_sample.format(**doc_kwargs) + fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + output_doc + built_doc + return fn + + return docstring_decorator + +def replace_return_docstrings(output_type=None, config_class=None): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + lines[i] = _prepare_output_docstrings(output_type, config_class) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https") + +def hf_bucket_url( + model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None +) -> str: + """ + Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting + to Cloudfront (a Content Delivery Network, or CDN) for large files. + + Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our + bandwidth costs). + + Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here + because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront + in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache + can't ever be stale. + + In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is: + its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0 + are not shared with those new files, because the cached file's name contains a hash of the url (which changed). + """ + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + if mirror: + if mirror in ["tuna", "bfsu"]: + raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.") + legacy_format = "/" not in model_id + if legacy_format: + return f"{mirror}/{model_id}-{filename}" + else: + return f"{mirror}/{model_id}/{filename}" + + if revision is None: + revision = "main" + return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename) + +def url_to_filename(url: str, etag: Optional[str] = None) -> str: + """ + Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, + delimited by a period. 
If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can + identify it as a HDF5 file (see + https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode("utf-8") + filename = sha256(url_bytes).hexdigest() + + if etag: + etag_bytes = etag.encode("utf-8") + filename += "." + sha256(etag_bytes).hexdigest() + + if url.endswith(".h5"): + filename += ".h5" + + return filename + +def cached_path( + url_or_filename, + cache_dir=None, + force_download=False, + proxies=None, + resume_download=False, + user_agent: Union[Dict, str, None] = None, + extract_compressed_file=False, + force_extract=False, + use_auth_token: Union[bool, str, None] = None, + local_files_only=False, +) -> Optional[str]: + """ + Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file + and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and + then return the path + + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-download the file even if it's already cached in the cache dir. + resume_download: if True, resume the download if incompletely received file is found. + user_agent: Optional string or dict that will be appended to the user-agent on remote requests. + use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True, + will get token from ~/.huggingface. + extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed + file in a folder along the archive. + force_extract: if True when extract_compressed_file is True and the archive was already extracted, + re-extract the archive and override the folder where it was extracted. + + Return: + Local path (string) of file or if networking is off, last version of file cached on disk. + + Raises: + In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + if is_remote_url(url_or_filename): + # URL, so get it from the cache (downloading if necessary) + output_path = get_from_cache( + url_or_filename, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + user_agent=user_agent, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + ) + elif os.path.exists(url_or_filename): + # File, and it exists. + output_path = url_or_filename + elif urlparse(url_or_filename).scheme == "": + # File, but it doesn't exist. + raise EnvironmentError(f"file {url_or_filename} not found") + else: + # Something unknown + raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path") + + if extract_compressed_file: + if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): + return output_path + + # Path where we extract compressed archives + # We avoid '.' 
in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" + output_dir, output_file = os.path.split(output_path) + output_extract_dir_name = output_file.replace(".", "-") + "-extracted" + output_path_extracted = os.path.join(output_dir, output_extract_dir_name) + + if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: + return output_path_extracted + + # Prevent parallel extractions + lock_path = output_path + ".lock" + with filelock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted) + if is_zipfile(output_path): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + else: + raise EnvironmentError(f"Archive format of {output_path} could not be identified") + + return output_path_extracted + + return output_path + +def define_sagemaker_information(): + try: + instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json() + dlc_container_used = instance_data["Image"] + dlc_tag = instance_data["Image"].split(":")[1] + except Exception: + dlc_container_used = None + dlc_tag = None + + sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}")) + runs_distributed_training = True if "sagemaker_distributed_dataparallel_enabled" in sagemaker_params else False + account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None + + sagemaker_object = { + "sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None), + "sm_region": os.getenv("AWS_REGION", None), + "sm_number_gpu": os.getenv("SM_NUM_GPUS", 0), + "sm_number_cpu": os.getenv("SM_NUM_CPUS", 0), + "sm_distributed_training": runs_distributed_training, + "sm_deep_learning_container": dlc_container_used, + "sm_deep_learning_container_tag": dlc_tag, + "sm_account_id": account_id, + } + return sagemaker_object + +def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: + """ + Formats a user-agent string with basic info about a request. + """ + ua = f"transformers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}" + if _NEED_IMPORT_TORCH: + ua += f"; torch/{_torch_version}" + if DISABLE_TELEMETRY: + return ua + "; telemetry/off" + if is_training_run_on_sagemaker(): + ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items()) + # CI will set this value to True + if os.environ.get("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES: + ua += "; is_ci/true" + if isinstance(user_agent, dict): + ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + return ua + +def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None): + """ + Download remote file. Do not gobble up errors. 
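# Hypothetical usage, not part of the patch, of ``hf_bucket_url`` and ``cached_path``
# defined above (the remote case needs network access; the model id is just an example).
from fastNLP.transformers.torch.file_utils import cached_path, hf_bucket_url

url = hf_bucket_url("bert-base-uncased", "config.json")   # .../resolve/main/config.json
local_path = cached_path(url)          # downloaded into TRANSFORMERS_CACHE, local path returned
same_path = cached_path(local_path)    # an existing local path is returned unchanged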
+ """ + headers = copy.deepcopy(headers) + if resume_size > 0: + headers["Range"] = f"bytes={resume_size}-" + r = requests.get(url, stream=True, proxies=proxies, headers=headers) + r.raise_for_status() + content_length = r.headers.get("Content-Length") + total = resume_size + int(content_length) if content_length is not None else None + # progress = tqdm( + # unit="B", + # unit_scale=True, + # unit_divisor=1024, + # total=total, + # initial=resume_size, + # desc="Downloading", + # disable=bool(logging.get_verbosity() == logging.NOTSET), + # ) + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + # progress.update(len(chunk)) + temp_file.write(chunk) + # progress.close() + +def get_from_cache( + url: str, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=10, + resume_download=False, + user_agent: Union[Dict, str, None] = None, + use_auth_token: Union[bool, str, None] = None, + local_files_only=False, +) -> Optional[str]: + """ + Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the + path to the cached file. + + Return: + Local path (string) of file or if networking is off, last version of file cached on disk. + + Raises: + In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + headers = {"user-agent": http_user_agent(user_agent)} + if isinstance(use_auth_token, str): + headers["authorization"] = f"Bearer {use_auth_token}" + elif use_auth_token: + raise RuntimeError("`use_auth_token=True` is not supported in FastNLP now") + # token = HfFolder.get_token() + # if token is None: + # raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.") + # headers["authorization"] = f"Bearer {token}" + + url_to_download = url + etag = None + if not local_files_only: + try: + r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout) + r.raise_for_status() + etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag") + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + # If we don't have any of those, raise an error. + if etag is None: + raise OSError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + # In case of a redirect, + # save an extra redirect on the request.get call, + # and ensure we download the exact atomic version even if it changed + # between the HEAD and the GET (unlikely, but hey). + if 300 <= r.status_code <= 399: + url_to_download = r.headers["Location"] + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + # Otherwise, our Internet connection is down. + # etag is None + pass + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # etag is None == we don't have a connection or we passed local_files_only. 
+ # try to get the last downloaded one + if etag is None: + if os.path.exists(cache_path): + return cache_path + else: + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*") + if not file.endswith(".json") and not file.endswith(".lock") + ] + if len(matching_files) > 0: + return os.path.join(cache_dir, matching_files[-1]) + else: + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise FileNotFoundError( + "Cannot find the requested files in the cached path and outgoing traffic has been" + " disabled. To enable model look-ups and downloads online, set 'local_files_only'" + " to False." + ) + else: + raise ValueError( + "Connection error, and we cannot find the requested files in the cached path." + " Please try again or make sure your Internet connection is on." + ) + + # From now on, etag is not None. + if os.path.exists(cache_path) and not force_download: + return cache_path + + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + ".lock" + with filelock(lock_path): + + # If the download just completed while the lock was activated. + if os.path.exists(cache_path) and not force_download: + # Even if returning early like here, the lock will be released. + return cache_path + + if resume_download: + incomplete_path = cache_path + ".incomplete" + + @contextmanager + def _resumable_file_manager() -> "io.BufferedWriter": + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}") + + http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers) + + logger.info(f"storing {url} in cache at {cache_path}") + os.replace(temp_file.name, cache_path) + + # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it. + umask = os.umask(0o666) + os.umask(umask) + os.chmod(cache_path, 0o666 & ~umask) + + logger.info(f"creating metadata file for {cache_path}") + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) + + return cache_path + +def is_torch_fx_available(): + return _TORCH_GREATER_EQUAL_1_8 and _compare_version("torch", operator.lt, "1.9.0") + +def is_torch_fx_proxy(x): + if is_torch_fx_available(): + import torch.fx + + return isinstance(x, torch.fx.Proxy) + return False + +def is_sentencepiece_available(): + return importlib.util.find_spec("sentencepiece") is not None + +def is_tokenizers_available(): + return importlib.util.find_spec("tokenizers") is not None + +def is_tensor(x): + """ + Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor`, obj:`jaxlib.xla_extension.DeviceArray` or + :obj:`np.ndarray`. 
+ """ + if is_torch_fx_proxy(x): + return True + + if isinstance(x, torch.Tensor): + return True + + return isinstance(x, np.ndarray) + +def to_py_obj(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [to_py_obj(o) for o in obj] + elif _NEED_IMPORT_TORCH and _is_torch(obj): + return obj.detach().cpu().tolist() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj + +def _is_numpy(x): + return isinstance(x, np.ndarray) + +def _is_torch(x): + import torch + + return isinstance(x, torch.Tensor) + + +def _is_torch_device(x): + import torch + + return isinstance(x, torch.device) + +class ModelOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like + a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular + python dictionary. + + .. warning:: + You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple` + method to convert it to a tuple before. + """ + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + assert len(class_fields), f"{self.__class__.__name__} has no fields." + assert all( + field.default is None for field in class_fields[1:] + ), f"{self.__class__.__name__} should not have more than one required field." + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not is_tensor(first_field): + if isinstance(first_field, dict): + iterator = first_field.items() + first_field_iterator = True + else: + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, 
value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple[Any]: + """ + Convert self to a tuple containing all the attributes/keys that are not ``None``. + """ + return tuple(self[k] for k in self.keys()) + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class PaddingStrategy(ExplicitEnum): + """ + Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion + in an IDE. + """ + + LONGEST = "longest" + MAX_LENGTH = "max_length" + DO_NOT_PAD = "do_not_pad" + + +class TensorType(ExplicitEnum): + """ + Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for + tab-completion in an IDE. + """ + + PYTORCH = "pt" + NUMPY = "np" diff --git a/fastNLP/transformers/torch/generation_beam_search.py b/fastNLP/transformers/torch/generation_beam_search.py new file mode 100644 index 00000000..117d9a38 --- /dev/null +++ b/fastNLP/transformers/torch/generation_beam_search.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import UserDict +from typing import Optional, Tuple + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + + +PROCESS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. 
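# A minimal sketch, not part of the patch, of how the ``ModelOutput`` base class defined
# above behaves; ``ToyOutput`` is a hypothetical subclass.
from dataclasses import dataclass
from typing import Optional
import torch
from fastNLP.transformers.torch.file_utils import ModelOutput

@dataclass
class ToyOutput(ModelOutput):
    loss: Optional["torch.FloatTensor"] = None
    logits: Optional["torch.FloatTensor"] = None

out = ToyOutput(loss=torch.tensor(0.5), logits=torch.ones(2, 3))
out["loss"]        # dict-style access
out.logits         # attribute access
out.to_tuple()     # (loss, logits); fields left as None would be skipped
out[1]             # integer indexing goes through to_tuple()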
+ + Return: + :obj:`UserDict`: A dictionary composed of the fields as defined above: + + - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated + scores of all non-finished beams. + - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens + to be added to the non-finished beam_hypotheses. + - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices + indicating to which beam the next tokens shall be added. + +""" + +FINALIZE_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The final scores of all non-finished beams. + final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The last tokens to be added to the non-finished beam_hypotheses. + final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated + sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all + batches finished early due to the :obj:`eos_token_id`. + +""" + + +class BeamScorer(ABC): + """ + Abstract base class for all beam scorers that are used for :meth:`~transformers.PreTrainedModel.beam_search` and + :meth:`~transformers.PreTrainedModel.beam_sample`. + """ + + @abstractmethod + @add_start_docstrings(PROCESS_INPUTS_DOCSTRING) + def process( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + **kwargs + ) -> Tuple["torch.Tensor"]: + raise NotImplementedError("This is an abstract method.") + + @abstractmethod + @add_start_docstrings(FINALIZE_INPUTS_DOCSTRING) + def finalize( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + max_length: int, + **kwargs + ) -> "torch.LongTensor": + raise NotImplementedError("This is an abstract method.") + + +class BeamSearchScorer(BeamScorer): + r""" + :class:`transformers.BeamScorer` implementing standard beam search decoding. + + Adapted in part from `Facebook's XLM beam search code + `__. + + Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation + `__ + + Args: + batch_size (:obj:`int`): + Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel. + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + num_beams (:obj:`int`): + Number of beams for beam search. 
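# Hypothetical instantiation, not part of the patch, of the scorer documented here.
import torch
from fastNLP.transformers.torch.generation_beam_search import BeamSearchScorer

scorer = BeamSearchScorer(batch_size=2, num_beams=4, device=torch.device("cpu"))
# Each call to scorer.process() receives the top 2 * num_beams candidates per batch
# entry and returns the num_beams continuations to keep; scorer.finalize() then picks
# the best finished hypotheses.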
+ device (:obj:`torch.device`): + Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of + :obj:`BeamSearchScorer` will be allocated. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): + The number of beam hypotheses that shall be returned upon calling + :meth:`~transformer.BeamSearchScorer.finalize`. + num_beam_groups (:obj:`int`): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. See `this paper `__ for more details. + """ + + def __init__( + self, + batch_size: int, + num_beams: int, + device: "torch.device", + length_penalty: Optional[float] = 1.0, + do_early_stopping: Optional[bool] = False, + num_beam_hyps_to_keep: Optional[int] = 1, + num_beam_groups: Optional[int] = 1, + **kwargs, + ): + self.num_beams = num_beams + self.device = device + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + self.num_beam_groups = num_beam_groups + self.group_size = self.num_beams // self.num_beam_groups + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, + length_penalty=self.length_penalty, + early_stopping=self.do_early_stopping, + ) + for _ in range(batch_size) + ] + self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device) + + if not isinstance(num_beams, int) or num_beams <= 1: + raise ValueError( + f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1, one should make use of `greedy_search` instead." + ) + + if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): + raise ValueError( + f"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` " + f"has to be divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." + ) + + if "max_length" in kwargs: + logger.warn( + "Passing `max_length` to BeamSearchScorer is deprecated and has no effect." + "`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`" + ",or `group_beam_search(...)`." 
+ ) + + @property + def is_done(self) -> bool: + return self._done.all() + + def process( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + ) -> Tuple["torch.Tensor"]: + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.group_size) + + device = input_ids.device + next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) + next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) + next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + assert ( + len(beam_hyp) >= self.num_beams + ), f"Batch can only be done if at least {self.num_beams} beams have been generated" + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, next_index) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) + ): + batch_beam_idx = batch_idx * self.group_size + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + input_ids[batch_beam_idx].clone(), + next_score.item(), + ) + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.group_size: + break + + if beam_idx < self.group_size: + raise ValueError( + f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." 
+ ) + + # Check if we are done so that we can save a pad step if all(done) + self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( + next_scores[batch_idx].max().item(), cur_len + ) + + return UserDict( + { + "next_beam_scores": next_beam_scores.view(-1), + "next_beam_tokens": next_beam_tokens.view(-1), + "next_beam_indices": next_beam_indices.view(-1), + } + ) + + def finalize( + self, + input_ids: "torch.LongTensor", + final_beam_scores: "torch.FloatTensor", + final_beam_tokens: "torch.LongTensor", + final_beam_indices: "torch.LongTensor", + max_length: int, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + ) -> Tuple["torch.LongTensor"]: + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + continue + + # all open beam hypotheses are added to the beam hypothesis + # beam hypothesis class automatically keeps the best beams + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add(final_tokens, final_score) + + # select the best hypotheses + sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) + best = [] + best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_hyp_tuple = sorted_hyps.pop() + best_score = best_hyp_tuple[0] + best_hyp = best_hyp_tuple[1] + sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) + + # append to lists + best.append(best_hyp) + best_scores[i * self.num_beam_hyps_to_keep + j] = best_score + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded: "torch.LongTensor" = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`pad_token_id` has to be defined" + decoded.fill_(pad_token_id) + + # fill with hypotheses and eos_token_id if the latter fits in + for i, hypo in enumerate(best): + decoded[i, : sent_lengths[i]] = hypo + if sent_lengths[i] < max_length: + decoded[i, sent_lengths[i]] = eos_token_id + return UserDict( + { + "sequences": decoded, + "sequence_scores": best_scores, + } + ) + + +class BeamHypotheses: + def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool): + """ + Initialize n-best list of hypotheses. + """ + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp: "torch.LongTensor", sum_logprobs: float): + """ + Add a new hypothesis to the list. 
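# Worked example, not part of the patch, of the hypothesis score computed in ``add``
# below; the numbers are made up.
sum_logprobs = -6.0          # total log-probability of a 5-token hypothesis
length = 5
length_penalty = 1.0
score = sum_logprobs / (length ** length_penalty)   # -1.2; higher (less negative) is better
# A length_penalty > 1.0 divides by a larger factor and so favours longer sequences;
# a value < 1.0 favours shorter ones.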
+ """ + score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/fastNLP/transformers/torch/generation_logits_process.py b/fastNLP/transformers/torch/generation_logits_process.py new file mode 100644 index 00000000..e97b62be --- /dev/null +++ b/fastNLP/transformers/torch/generation_logits_process.py @@ -0,0 +1,618 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from abc import ABC +from typing import Callable, Iterable, List, Optional + +import numpy as np + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +LOGITS_PROCESSOR_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam + search or log softmax for each vocabulary token when using beam search + kwargs: + Additional logits processor specific kwargs. + + Return: + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores. + +""" + + +class LogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + """Torch method for processing logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." 
+ ) + + +class LogitsWarper(ABC): + """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + """Torch method for warping logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." + ) + + +class LogitsProcessorList(list): + """ + This class can be used to create a list of :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from + list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to the inputs. + """ + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> "torch.FloatTensor": + for processor in self: + function_args = inspect.signature(processor.__call__).parameters + if len(function_args) > 2: + assert all( + arg in kwargs for arg in list(function_args.keys())[2:] + ), f"Make sure that all the required parameters: {list(function_args.keys())} for {processor.__class__} are passed to the logits processor." + scores = processor(input_ids, scores, **kwargs) + else: + scores = processor(input_ids, scores) + return scores + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (:obj:`int`): + The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. + """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}") + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + scores[:, self.eos_token_id] = -float("inf") + return scores + + +class TemperatureLogitsWarper(LogitsWarper): + r""" + :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution). + + Args: + temperature (:obj:`float`): + The value used to module the logits distribution. + """ + + def __init__(self, temperature: float): + if not isinstance(temperature, float) or not (temperature > 0): + raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}") + + self.temperature = temperature + + def __call__(self, input_ids: "torch.Tensor", scores: "torch.Tensor") -> "torch.FloatTensor": + scores = scores / self.temperature + return scores + + +class RepetitionPenaltyLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences. + + Args: + repetition_penalty (:obj:`float`): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. 
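+
+        A minimal sketch (the token ids, penalty and vocabulary size are illustrative only)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_logits_process import RepetitionPenaltyLogitsProcessor
+            >>> processor = RepetitionPenaltyLogitsProcessor(penalty=1.2)
+            >>> input_ids = torch.tensor([[5, 7]])       # tokens generated so far
+            >>> scores = torch.ones(1, 10)               # fake logits over a vocabulary of size 10
+            >>> scores = processor(input_ids, scores)    # logits of tokens 5 and 7 drop from 1.0 to 1.0 / 1.2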
+ """ + + def __init__(self, penalty: float): + if not isinstance(penalty, float) or not (penalty > 0): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + score = torch.gather(scores, 1, input_ids) + + # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + + scores.scatter_(1, input_ids, score) + return scores + + +class TopPLogitsWarper(LogitsWarper): + """ + :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= + prob_cut_off. + + Args: + top_p (:obj:`float`): + If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are + kept for generation. + filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimum number of tokens that cannot be filtered. + """ + + def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + top_p = float(top_p) + if top_p < 0 or top_p > 1.0: + raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}") + + self.top_p = top_p + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > self.top_p + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +class TopKLogitsWarper(LogitsWarper): + r""" + :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements. + + Args: + top_k (:obj:`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimum number of tokens that cannot be filtered. 
+ """ + + def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}") + + self.top_k = top_k + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None] + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +def _get_ngrams(ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int): + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + return generated_ngrams + + +def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - ngram_size + ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) + return banned_ngrams.get(ngram_idx, []) + + +def _calc_banned_ngram_tokens( + ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int, cur_len: int +) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + + generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) + + banned_tokens = [ + _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) + for hypo_idx in range(num_hypos) + ] + return banned_tokens + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq + `__. + + Args: + ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") + self.ngram_size = ngram_size + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores + + +class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids. + See `ParlAI `__. + + Args: + encoder_ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur within the encoder input ids. + encoder_input_ids (:obj:`int`): + The encoder_input_ids that should not be repeated within the decoder ids. 
+ """ + + def __init__(self, encoder_ngram_size: int, encoder_input_ids: "torch.LongTensor"): + if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0: + raise ValueError( + f"`encoder_ngram_size` has to be a strictly positive integer, but is {encoder_ngram_size}" + ) + self.ngram_size = encoder_ngram_size + if len(encoder_input_ids.shape) == 1: + encoder_input_ids = encoder_input_ids.unsqueeze(0) + self.batch_size = encoder_input_ids.shape[0] + self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size) + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + # B x num_beams + num_hypos = scores.shape[0] + num_beams = num_hypos // self.batch_size + cur_len = input_ids.shape[-1] + banned_batch_tokens = [ + _get_generated_ngrams( + self.generated_ngrams[hypo_idx // num_beams], input_ids[hypo_idx], self.ngram_size, cur_len + ) + for hypo_idx in range(num_hypos) + ] + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores + + +class NoBadWordsLogitsProcessor(LogitsProcessor): + """ + :class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled. + + Args: + bad_words_ids (:obj:`List[List[int]]`): + List of list of token ids that are not allowed to be generated. In order to get the tokens of the words + that should not appear in the generated text, use :obj:`tokenizer(bad_word, + add_prefix_space=True).input_ids`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. + """ + + def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): + + if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: + raise ValueError(f"`bad_words_ids` has to be a non-emtpy list, but is {bad_words_ids}.") + if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): + raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) + for bad_word_ids in bad_words_ids + ): + raise ValueError( + f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." 
+ ) + + bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) + self.bad_words_id_length_1 = [] + self.bad_words_id_length_greater_than_1 = [] + for word in bad_words_ids: + if len(word) == 1: + self.bad_words_id_length_1.append(word[0]) + else: + self.bad_words_id_length_greater_than_1.append(word) + + self.static_bad_words_mask: Optional[torch.LongTensor] = None + + for banned_token_seq in self.bad_words_id_length_greater_than_1: + assert len(banned_token_seq) > 0, f"Banned words token sequences {bad_words_ids} cannot have an empty list" + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: + self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) + + dynamic_banned_tokens = self._calc_banned_bad_words_ids(input_ids.tolist()) + scores = self._set_scores_to_inf_for_banned_tokens(scores, dynamic_banned_tokens) + + return scores + + def _calc_static_bad_word_mask(self, scores: "torch.FloatTensor") -> "torch.BoolTensor": + static_bad_words_mask = torch.zeros(scores.shape[1]) + static_bad_words_mask[self.bad_words_id_length_1] = 1 + return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() + + def _tokens_match(self, prev_tokens: List[int], tokens: List[int]) -> bool: + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + elif len(tokens) > len(prev_tokens): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + else: + return prev_tokens[-len(tokens) :] == tokens + + def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterable[int]: + banned_tokens = [] + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + for banned_token_seq in self.bad_words_id_length_greater_than_1: + if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]): + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def _set_scores_to_inf_for_banned_tokens( + self, scores: "torch.Tensor", banned_tokens: List[List[int]] + ) -> "torch.Tensor": + """ + Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a + list of list of banned tokens to ban in the format [[batch index, vocabulary position],... + + Args: + scores: logits distribution of shape (batch size, vocabulary size) + banned_tokens: list of list of tokens to ban of length (batch_size) + """ + banned_mask_list = [] + for idx, batch_banned_tokens in enumerate(banned_tokens): + for token in batch_banned_tokens: + # Eliminates invalid bad word IDs that are over the vocabulary size. + if token <= scores.shape[1]: + banned_mask_list.append([idx, token]) + else: + logger.error( + f"An invalid bad word ID is defined: {token}. This ID is not contained in the" + f"vocabulary, and is therefore ignored." + ) + if not banned_mask_list and self.static_bad_words_mask is None: + return scores + + else: + if banned_mask_list: + banned_mask = torch.LongTensor(banned_mask_list) + indices = torch.ones(len(banned_mask)) + # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. 
A conversion to dense tensor generates: + # [ 0 1 1 ] + # [ 0 0 0 ] + # [ 1 0 0 ] + + banned_mask = ( + torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) + .to(scores.device) + .to_dense() + .bool() + ) + + if self.static_bad_words_mask is not None: + banned_mask = torch.bitwise_or(banned_mask, self.static_bad_words_mask) + else: + banned_mask = self.static_bad_words_mask + + scores = scores.masked_fill(banned_mask, -float("inf")) + return scores + + +class PrefixConstrainedLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned + constrained generation. See `Autoregressive Entity Retrieval `__ for more + information. + + Args: + prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`): + This function constraints the beam search to allowed tokens only at each step. This function takes 2 + arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed + tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and + the batch ID :obj:`batch_id`. + """ + + def __init__(self, prefix_allowed_tokens_fn: Callable[[int, "torch.Tensor"], List[int]], num_beams: int): + self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn + self._num_beams = num_beams + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + mask = torch.full_like(scores, -math.inf) + for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])): + for beam_id, sent in enumerate(beam_sent): + mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0 + + return scores + mask + + +class HammingDiversityLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only + effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse + Solutions from Neural Sequence Models `__ for more details. + + Args: + diversity_penalty (:obj:`float`): + This value is subtracted from a beam's score if it generates a token same as any beam from other group at a + particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled. + num_beams (:obj:`int`): + Number of beams used for group beam search. See `this paper `__ for + more details. + num_beam_groups (:obj:`int`): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. See `this paper `__ for more details. 
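+
+        A minimal sketch of a single group-beam-search step with batch size 1, ``num_beams=4`` and
+        ``num_beam_groups=2`` (all token ids and scores are illustrative only)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_logits_process import HammingDiversityLogitsProcessor
+            >>> processor = HammingDiversityLogitsProcessor(diversity_penalty=1.0, num_beams=4, num_beam_groups=2)
+            >>> input_ids = torch.tensor([[2, 3], [2, 3]])     # the two beams of the second group
+            >>> scores = torch.zeros(2, 10)                    # scores for the second group only
+            >>> current_tokens = torch.tensor([7, 9, 0, 0])    # the first group already picked tokens 7 and 9
+            >>> scores = processor(input_ids, scores, current_tokens=current_tokens, beam_group_idx=1)
+            >>> # scores[:, 7] and scores[:, 9] are now -1.0, discouraging the second group from repeating them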
+ """ + + def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): + if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): + raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") + self._diversity_penalty = diversity_penalty + if not isinstance(num_beams, int) or num_beams < 2: + raise ValueError("`num_beams` should be an integer strictly larger than 1.") + self._num_beams = num_beams + if not isinstance(num_beam_groups, int) or num_beam_groups < 2: + raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") + if num_beam_groups > num_beams: + raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") + self._num_sub_beams = num_beams // num_beam_groups + + def __call__( + self, + input_ids: "torch.LongTensor", + scores: "torch.FloatTensor", + current_tokens: "torch.LongTensor", + beam_group_idx: int, + ) -> "torch.FloatTensor": + # hamming diversity: penalise using same token in current group which was used in previous groups at + # the same time step + batch_size = current_tokens.shape[0] // self._num_beams + group_start_idx = beam_group_idx * self._num_sub_beams + group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) + group_size = group_end_idx - group_start_idx + vocab_size = scores.shape[-1] + + if group_start_idx == 0: + return scores + + for batch_idx in range(batch_size): + # predicted tokens of last time step of previous groups + previous_group_tokens = current_tokens[ + batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx + ] + token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) + scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_penalty * token_frequency + + return scores + + +class ForcedBOSTokenLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token. + + Args: + bos_token_id (:obj:`int`): + The id of the token to force as the first generated token. + """ + + def __init__(self, bos_token_id: int): + self.bos_token_id = bos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len == 1: + num_tokens = scores.shape[1] + scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf") + scores[:, self.bos_token_id] = 0 + return scores + + +class ForcedEOSTokenLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when + :obj:`max_length` is reached. + + Args: + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + eos_token_id (:obj:`int`): + The id of the token to force as the last generated token when :obj:`max_length` is reached. 
+ """ + + def __init__(self, max_length: int, eos_token_id: int): + self.max_length = max_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len == self.max_length - 1: + num_tokens = scores.shape[1] + scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf") + scores[:, self.eos_token_id] = 0 + return scores + + +class InfNanRemoveLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to avoid the generation + method to fail. Note that using the logits processor should only be used if necessary since it can slow down the + generation method. :obj:`max_length` is reached. + """ + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + # set all nan values to 0.0 + scores[scores != scores] = 0.0 + + # set all inf values to max possible value + scores[scores == float("inf")] = torch.finfo(scores.dtype).max + + return scores diff --git a/fastNLP/transformers/torch/generation_stopping_criteria.py b/fastNLP/transformers/torch/generation_stopping_criteria.py new file mode 100644 index 00000000..179bf7c1 --- /dev/null +++ b/fastNLP/transformers/torch/generation_stopping_criteria.py @@ -0,0 +1,128 @@ +import time +from abc import ABC +from copy import deepcopy +from typing import Optional + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax + or scores for each vocabulary token after SoftMax. + kwargs: + Additional stopping criteria specific kwargs. + + Return: + :obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop. + +""" + + +class StoppingCriteria(ABC): + """Abstract base class for all stopping criteria that can be applied during generation.""" + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + raise NotImplementedError("StoppingCriteria needs to be subclassed") + + +class MaxLengthCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`. + Keep in mind for decoder-only type of transformers, this will include the initial prompted tokens. + + Args: + max_length (:obj:`int`): + The maximum length that the output sequence can have in number of tokens. 
+ """ + + def __init__(self, max_length: int): + self.max_length = max_length + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return input_ids.shape[-1] >= self.max_length + + +class MaxNewTokensCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`. + Keep in mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is + very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens. + + Args: + start_length (:obj:`int`): + The number of initial tokens. + max_new_tokens (:obj:`int`): + The maximum number of tokens to generate. + """ + + def __init__(self, start_length: int, max_new_tokens: int): + self.start_length = start_length + self.max_new_tokens = max_new_tokens + self.max_length = start_length + max_new_tokens + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return input_ids.shape[-1] >= self.max_length + + +class MaxTimeCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the + time will start being counted when you initialize this function. You can override this by passing an + :obj:`initial_time`. + + Args: + max_time (:obj:`float`): + The maximum allowed time in seconds for the generation. + initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`): + The start of the generation allowed time. + """ + + def __init__(self, max_time: float, initial_timestamp: Optional[float] = None): + self.max_time = max_time + self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return time.time() - self.initial_timestamp > self.max_time + + +class StoppingCriteriaList(list): + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return any(criteria(input_ids, scores) for criteria in self) + + @property + def max_length(self) -> Optional[int]: + for stopping_criterium in self: + if isinstance(stopping_criterium, MaxLengthCriteria): + return stopping_criterium.max_length + elif isinstance(stopping_criterium, MaxNewTokensCriteria): + return stopping_criterium.max_length + return None + + +def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList: + stopping_max_length = stopping_criteria.max_length + new_stopping_criteria = deepcopy(stopping_criteria) + if stopping_max_length is not None and stopping_max_length != max_length: + logger.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning) + elif stopping_max_length is None: + new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) + return new_stopping_criteria diff --git a/fastNLP/transformers/torch/generation_utils.py b/fastNLP/transformers/torch/generation_utils.py new file mode 100644 index 00000000..cfc2108c --- /dev/null +++ b/fastNLP/transformers/torch/generation_utils.py @@ -0,0 +1,2579 @@ +# coding=utf-8 +# Copyright 2020 The Google AI 
Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +from .file_utils import ModelOutput +from .generation_beam_search import BeamScorer, BeamSearchScorer +from .generation_logits_process import ( + EncoderNoRepeatNGramLogitsProcessor, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + InfNanRemoveLogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + PrefixConstrainedLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, +) +from .generation_stopping_criteria import ( + MaxLengthCriteria, + MaxNewTokensCriteria, + MaxTimeCriteria, + StoppingCriteriaList, + validate_stopping_criteria, +) + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.distributed as dist + from torch import nn, no_grad +else: + from fastNLP.core.utils.dummy_class import DummyClass as no_grad + + +@dataclass +class GreedySearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using greedy search. + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` + with each tensor of shape :obj:`(batch_size, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`. 
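+
+        A minimal sketch of reading such an output (the tensors are illustrative only; in practice an object
+        of this type is returned by ``greedy_search`` or by ``generate`` when ``return_dict_in_generate=True``)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_utils import GreedySearchDecoderOnlyOutput
+            >>> out = GreedySearchDecoderOnlyOutput(
+            ...     sequences=torch.tensor([[0, 5, 7, 2]]),
+            ...     scores=(torch.zeros(1, 10), torch.zeros(1, 10), torch.zeros(1, 10)),
+            ... )
+            >>> out.sequences.shape
+            torch.Size([1, 4])
+            >>> out["sequences"] is out.sequences    # ModelOutput also allows dict-style access
+            True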
+ """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class GreedySearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor + of shape :obj:`(batch_size, config.vocab_size)`). + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`. 
+ """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class SampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using sampling. + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` + with each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length, + sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class SampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of + the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor + of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`). 
+ encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape + :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length, + sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam search. + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of + shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`). 
+ attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length, + hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights + of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape + :obj:`(batch_size*num_beams, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. 
+ decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads, + generated_length, sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length, + hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam sample. + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of + shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. 
+ hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_beams, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape + :obj:`(batch_size*num_beams, config.vocab_size)`). + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. 
+ decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] +SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput] +BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] +BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput] + + +class GenerationMixin: + """ + A class containing all of the functions supporting generation, to be used as a mixin in + :class:`~transformers.PreTrainedModel`. + """ + + def prepare_inputs_for_generation(self, input_ids: "torch.LongTensor", **kwargs) -> Dict[str, Any]: + """ + Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the + generate method. + """ + return {"input_ids": input_ids} + + def adjust_logits_during_generation(self, logits: "torch.FloatTensor", **kwargs) -> "torch.FloatTensor": + """ + Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in + the generate method. 
+ """ + return logits + + def _prepare_input_ids_for_generation( + self, bos_token_id: Optional[int], encoder_outputs: Optional[ModelOutput] + ) -> "torch.LongTensor": + if self.config.is_encoder_decoder and encoder_outputs is not None: + # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding + shape = encoder_outputs.last_hidden_state.size()[:-1] + return torch.ones(shape, dtype=torch.long, device=self.device) * -100 + + if bos_token_id is None: + raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.") + return torch.ones((1, 1), dtype=torch.long, device=self.device) * bos_token_id + + def _prepare_attention_mask_for_generation( + self, input_ids: "torch.Tensor", pad_token_id: int, eos_token_id: int + ) -> "torch.LongTensor": + is_pad_token_in_inputs_ids = (pad_token_id is not None) and (pad_token_id in input_ids) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + return input_ids.ne(pad_token_id).long() + return input_ids.new_ones(input_ids.shape, dtype=torch.long) + + def _prepare_encoder_decoder_kwargs_for_generation( + self, input_ids: "torch.LongTensor", model_kwargs + ) -> Dict[str, Any]: + if "encoder_outputs" not in model_kwargs: + # retrieve encoder hidden states + encoder = self.get_encoder() + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not (argument.startswith("decoder_") or argument.startswith("cross_attn")) + } + model_kwargs["encoder_outputs"]: ModelOutput = encoder(input_ids, return_dict=True, **encoder_kwargs) + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( + self, input_ids: "torch.LongTensor", decoder_start_token_id: int = None, bos_token_id: int = None + ) -> "torch.LongTensor": + decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + decoder_input_ids = ( + torch.ones((input_ids.shape[0], 1), dtype=torch.long, device=input_ids.device) * decoder_start_token_id + ) + return decoder_input_ids + + def _get_pad_token_id(self, pad_token_id: int = None, eos_token_id: int = None) -> int: + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") + pad_token_id = eos_token_id + return pad_token_id + + def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + + if decoder_start_token_id is not None: + return decoder_start_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "decoder_start_token_id") + and self.config.decoder.decoder_start_token_id is not None + ): + return self.config.decoder.decoder_start_token_id + elif bos_token_id is not None: + return bos_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "bos_token_id") + and self.config.decoder.bos_token_id is not None + ): + return self.config.decoder.bos_token_id + raise ValueError( + "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." 
+ ) + + @staticmethod + def _expand_inputs_for_generation( + input_ids: "torch.LongTensor", + expand_size: int = 1, + is_encoder_decoder: bool = False, + attention_mask: "torch.LongTensor" = None, + encoder_outputs: ModelOutput = None, + **model_kwargs, + ) -> Tuple["torch.LongTensor", Dict[str, Any]]: + expanded_return_idx = ( + torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device) + ) + input_ids = input_ids.index_select(0, expanded_return_idx) + + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx) + + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) + + if is_encoder_decoder: + assert encoder_outputs is not None + encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( + 0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device) + ) + model_kwargs["encoder_outputs"] = encoder_outputs + return input_ids, model_kwargs + + @staticmethod + def _update_model_kwargs_for_generation( + outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False + ) -> Dict[str, Any]: + # update past + if "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + else: + model_kwargs["past"] = None + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) + + # update attention mask + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return model_kwargs + + def _reorder_cache(self, past, beam_idx): + raise NotImplementedError( + f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to enable beam search for {self.__class__}" + ) + + def _get_logits_warper( + self, top_k: int = None, top_p: float = None, temperature: float = None, num_beams: int = None + ) -> LogitsProcessorList: + """ + This class returns a :obj:`~transformers.LogitsProcessorList` list object that contains all relevant + :obj:`~transformers.LogitsWarper` instances used for multinomial sampling. 
+ """ + + # init warp parameters + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + temperature = temperature if temperature is not None else self.config.temperature + # instantiate warpers list + warpers = LogitsProcessorList() + + # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files + # all samplers can be found in `generation_utils_samplers.py` + if temperature is not None and temperature != 1.0: + warpers.append(TemperatureLogitsWarper(temperature)) + if top_k is not None and top_k != 0: + warpers.append(TopKLogitsWarper(top_k=top_k, min_tokens_to_keep=(2 if num_beams > 1 else 1))) + if top_p is not None and top_p < 1.0: + warpers.append(TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))) + return warpers + + def _get_logits_processor( + self, + repetition_penalty: float, + no_repeat_ngram_size: int, + encoder_no_repeat_ngram_size: int, + encoder_input_ids: "torch.LongTensor", + bad_words_ids: List[List[int]], + min_length: int, + max_length: int, + eos_token_id: int, + forced_bos_token_id: int, + forced_eos_token_id: int, + prefix_allowed_tokens_fn: Callable[[int, "torch.Tensor"], List[int]], + num_beams: int, + num_beam_groups: int, + diversity_penalty: float, + remove_invalid_values: bool, + ) -> LogitsProcessorList: + """ + This class returns a :obj:`~transformers.LogitsProcessorList` list object that contains all relevant + :obj:`~transformers.LogitsProcessor` instances used to modify the scores of the language model head. + """ + processors = LogitsProcessorList() + + # init warp parameters + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + encoder_no_repeat_ngram_size = ( + encoder_no_repeat_ngram_size + if encoder_no_repeat_ngram_size is not None + else self.config.encoder_no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + min_length = min_length if min_length is not None else self.config.min_length + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + diversity_penalty = diversity_penalty if diversity_penalty is not None else self.config.diversity_penalty + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id + ) + remove_invalid_values = ( + remove_invalid_values if remove_invalid_values is not None else self.config.remove_invalid_values + ) + # instantiate processors list + + # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files + # all samplers can be found in `generation_utils_samplers.py` + if diversity_penalty is not None and diversity_penalty > 0.0: + processors.append( + HammingDiversityLogitsProcessor( + diversity_penalty=diversity_penalty, num_beams=num_beams, num_beam_groups=num_beam_groups + ) + ) + if repetition_penalty is not None and repetition_penalty != 1.0: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: + 
processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) + if encoder_no_repeat_ngram_size is not None and encoder_no_repeat_ngram_size > 0: + if self.config.is_encoder_decoder: + processors.append(EncoderNoRepeatNGramLogitsProcessor(encoder_no_repeat_ngram_size, encoder_input_ids)) + else: + raise ValueError( + "It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture" + ) + if bad_words_ids is not None: + processors.append(NoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) + if min_length is not None and eos_token_id is not None and min_length > -1: + processors.append(MinLengthLogitsProcessor(min_length, eos_token_id)) + if prefix_allowed_tokens_fn is not None: + processors.append(PrefixConstrainedLogitsProcessor(prefix_allowed_tokens_fn, num_beams // num_beam_groups)) + if forced_bos_token_id is not None: + processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) + if forced_eos_token_id is not None: + processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) + if remove_invalid_values is True: + processors.append(InfNanRemoveLogitsProcessor()) + return processors + + def _get_stopping_criteria( + self, max_length: Optional[int], max_time: Optional[float], max_new_tokens: Optional[int], start_length: int + ) -> StoppingCriteriaList: + stopping_criteria = StoppingCriteriaList() + if max_length is not None: + stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) + if max_time is not None: + stopping_criteria.append(MaxTimeCriteria(max_time=max_time)) + if max_new_tokens is not None: + stopping_criteria.append(MaxNewTokensCriteria(start_length=start_length, max_new_tokens=max_new_tokens)) + return stopping_criteria + + @no_grad() + def generate( + self, + input_ids: Optional["torch.LongTensor"] = None, + max_length: Optional[int] = None, + min_length: Optional[int] = None, + do_sample: Optional[bool] = None, + early_stopping: Optional[bool] = None, + num_beams: Optional[int] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + repetition_penalty: Optional[float] = None, + bad_words_ids: Optional[Iterable[int]] = None, + bos_token_id: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + length_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + encoder_no_repeat_ngram_size: Optional[int] = None, + num_return_sequences: Optional[int] = None, + max_time: Optional[float] = None, + max_new_tokens: Optional[int] = None, + decoder_start_token_id: Optional[int] = None, + use_cache: Optional[bool] = None, + num_beam_groups: Optional[int] = None, + diversity_penalty: Optional[float] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, "torch.Tensor"], List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + forced_bos_token_id: Optional[int] = None, + forced_eos_token_id: Optional[int] = None, + remove_invalid_values: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head. The method currently supports greedy decoding, + multinomial sampling, beam-search decoding, and beam-search multinomial sampling. 
+ + Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the + attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values + indicated are the default values of those config. + + Most of these parameters are explained in more detail in `this blog post + `__. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + The sequence used as a prompt for the generation. If :obj:`None` the method initializes it with + :obj:`bos_token_id` and a batch size of 1. + max_length (:obj:`int`, `optional`, defaults to :obj:`model.config.max_length`): + The maximum length of the sequence to be generated. + max_new_tokens (:obj:`int`, `optional`, defaults to None): + The maximum numbers of tokens to generate, ignore the current number of tokens. Use either + :obj:`max_new_tokens` or :obj:`max_length` but not both, they serve the same purpose. + min_length (:obj:`int`, `optional`, defaults to 10): + The minimum length of the sequence to be generated. + do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use sampling ; use greedy decoding otherwise. + early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beams (:obj:`int`, `optional`, defaults to 1): + Number of beams for beam search. 1 means no beam search. + temperature (:obj:`float`, `optional`, defaults to 1.0): + The value used to module the next token probabilities. + top_k (:obj:`int`, `optional`, defaults to 50): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (:obj:`float`, `optional`, defaults to 1.0): + If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or + higher are kept for generation. + repetition_penalty (:obj:`float`, `optional`, defaults to 1.0): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + bos_token_id (:obj:`int`, `optional`): + The id of the `beginning-of-sequence` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): + If set to int > 0, all ngrams of that size can only occur once. + encoder_no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): + If set to int > 0, all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the + ``decoder_input_ids``. + bad_words_ids(:obj:`List[List[int]]`, `optional`): + List of token ids that are not allowed to be generated. In order to get the tokens of the words that + should not appear in the generated text, use :obj:`tokenizer(bad_word, + add_prefix_space=True).input_ids`. + num_return_sequences(:obj:`int`, `optional`, defaults to 1): + The number of independently computed returned sequences for each element in the batch. + max_time(:obj:`float`, `optional`, defaults to None): + The maximum amount of time you allow the computation to run for in seconds. 
generation will still + finish the current pass after allocated time has been passed. + attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for + tokens that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same + shape as :obj:`input_ids` that masks the pad token. `What are attention masks? + <../glossary.html#attention-mask>`__ + decoder_start_token_id (:obj:`int`, `optional`): + If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. + use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should use the past last key/values attentions (if applicable to the model) to + speed up decoding. + num_beam_groups (:obj:`int`, `optional`, defaults to 1): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. `this paper `__ for more details. + diversity_penalty (:obj:`float`, `optional`, defaults to 0.0): + This value is subtracted from a beam's score if it generates a token same as any beam from other group + at a particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is + enabled. + prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`, `optional`): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID :obj:`batch_id` and + :obj:`input_ids`. It has to return a list with the allowed tokens for the next generation step + conditioned on the batch ID :obj:`batch_id` and the previously generated tokens :obj:`inputs_ids`. This + argument is useful for constrained generation conditioned on the prefix, as described in + `Autoregressive Entity Retrieval `__. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + forced_bos_token_id (:obj:`int`, `optional`): + The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`. + Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token + needs to be the target language token. + forced_eos_token_id (:obj:`int`, `optional`): + The id of the token to force as the last generated token when :obj:`max_length` is reached. + remove_invalid_values (:obj:`bool`, `optional`): + Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to + crash. Note that using ``remove_invalid_values`` can slow down generation. 
+ synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the + model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific + kwargs should be prefixed with `decoder_`. + + Return: + :class:`~transformers.file_utils.ModelOutput` or :obj:`torch.LongTensor`: A + :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when + ``config.return_dict_in_generate=True``) or a :obj:`torch.FloatTensor`. + + If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the + possible :class:`~transformers.file_utils.ModelOutput` types are: + + - :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` + + If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible + :class:`~transformers.file_utils.ModelOutput` types are: + + - :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.SampleEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` + + Examples:: + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") + >>> # do greedy decoding without providing a prompt + >>> outputs = model.generate(max_length=40) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> document = ( + ... "at least two people were killed in a suspected bomb attack on a passenger bus " + ... "in the strife-torn southern philippines on monday , the military said." + ... ) + >>> # encode input context + >>> input_ids = tokenizer(document, return_tensors="pt").input_ids + >>> # generate 3 independent sequences using beam search decoding (5 beams) + >>> # with T5 encoder-decoder model conditioned on short news article. 
+ >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3) + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") + >>> input_context = "The dog" + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> # generate 3 candidates using sampling + >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True) + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("ctrl") + >>> model = AutoModelForCausalLM.from_pretrained("ctrl") + >>> # "Legal" is one of the control codes for ctrl + >>> input_context = "Legal My neighbor is" + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> input_context = "My cute dog" + >>> # get tokens of words that should not be generated + >>> bad_words_ids = [tokenizer(bad_word, add_prefix_space=True).input_ids for bad_word in ["idiot", "stupid", "shut up"]] + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> # generate sequences without allowing bad_words to be generated + >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + """ + + # set init values + if max_length is None and max_new_tokens is None: + # Both are None, default + max_length = self.config.max_length + elif max_length is not None and max_new_tokens is not None: + # Both are set, this is odd, raise a warning + logger.warn( + "Both `max_length` and `max_new_tokens` have been set but they serve the same purpose.", UserWarning + ) + + max_length = max_length if max_length is not None else self.config.max_length + num_beams = num_beams if num_beams is not None else self.config.num_beams + num_beam_groups = num_beam_groups if num_beam_groups is not None else self.config.num_beam_groups + do_sample = do_sample if do_sample is not None else self.config.do_sample + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + model_kwargs["output_attentions"] = output_attentions + model_kwargs["output_hidden_states"] = 
output_hidden_states + + if input_ids is None and "inputs_embeds" not in model_kwargs: + # init `input_ids` with bos_token_id + input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs")) + + if model_kwargs.get("attention_mask", None) is None: + # init `attention_mask` depending on `pad_token_id` + model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( + input_ids, pad_token_id, eos_token_id + ) + + # special case if pad_token_id is not defined + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") + pad_token_id = eos_token_id + + # Storing encoder_input_ids for logits_processor that could use them + encoder_input_ids = input_ids if self.config.is_encoder_decoder else None + + if self.config.is_encoder_decoder: + # add encoder_outputs to model_kwargs + model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs) + + # set input_ids as decoder_input_ids + if "decoder_input_ids" in model_kwargs: + input_ids = model_kwargs.pop("decoder_input_ids") + else: + input_ids = self._prepare_decoder_input_ids_for_generation( + input_ids, decoder_start_token_id=decoder_start_token_id, bos_token_id=bos_token_id + ) + + if "encoder_outputs" not in model_kwargs or not isinstance(model_kwargs["encoder_outputs"], ModelOutput): + raise ValueError("Make sure that `model_kwargs` include `encoder_outputs` of type `ModelOutput`.") + + if input_ids.shape[-1] >= max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids.shape[-1]}, but ``max_length`` is set to {max_length}." + "This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``." + ) + + # determine generation mode + is_greedy_gen_mode = (num_beams == 1) and (num_beam_groups == 1) and do_sample is False + is_sample_gen_mode = (num_beams == 1) and (num_beam_groups == 1) and do_sample is True + is_beam_gen_mode = (num_beams > 1) and (num_beam_groups == 1) and do_sample is False + is_beam_sample_gen_mode = (num_beams > 1) and (num_beam_groups == 1) and do_sample is True + is_group_beam_gen_mode = (num_beams > 1) and (num_beam_groups > 1) + if num_beam_groups > num_beams: + raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`") + if is_group_beam_gen_mode and do_sample is True: + raise ValueError( + "Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`." 
+ ) + + # set model_kwargs + model_kwargs["use_cache"] = use_cache + + # get distribution pre_processing samplers + logits_processor = self._get_logits_processor( + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, + encoder_input_ids=encoder_input_ids, + bad_words_ids=bad_words_ids, + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_penalty=diversity_penalty, + remove_invalid_values=remove_invalid_values, + ) + + cur_len = input_ids.shape[-1] + stopping_criteria = self._get_stopping_criteria( + max_length=max_length, max_time=max_time, max_new_tokens=max_new_tokens, start_length=cur_len + ) + + if is_greedy_gen_mode: + if num_return_sequences > 1: + raise ValueError( + f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." + ) + + # greedy search + return self.greedy_search( + input_ids, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_sample_gen_mode: + # get probability distribution warper + logits_warper = self._get_logits_warper( + top_k=top_k, top_p=top_p, temperature=temperature, num_beams=num_beams + ) + + # expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + # sample + return self.sample( + input_ids, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_beam_gen_mode: + batch_size = input_ids.shape[0] + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + if num_return_sequences > num_beams: + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") + + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + ) + # interleave with `num_beams` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs + ) + return self.beam_search( + input_ids, + beam_scorer, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_beam_sample_gen_mode: + logits_warper = 
self._get_logits_warper( + top_k=top_k, top_p=top_p, temperature=temperature, num_beams=num_beams + ) + + batch_size = input_ids.shape[0] * num_return_sequences + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + ) + + # interleave with `num_beams * num_return_sequences` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_beams * num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + return self.beam_sample( + input_ids, + beam_scorer, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_group_beam_gen_mode: + batch_size = input_ids.shape[0] + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + if num_return_sequences > num_beams: + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") + + if num_beams % num_beam_groups != 0: + raise ValueError("`num_beams` should be divisible by `num_beam_groups` for group beam search.") + + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + max_length=stopping_criteria.max_length, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=num_beam_groups, + ) + # interleave with `num_beams` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs + ) + return self.group_beam_search( + input_ids, + diverse_beam_scorer, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + def greedy_search( + self, + input_ids: "torch.LongTensor", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using greedy decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. 
+ logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the + model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + + def sample( + self, + input_ids: "torch.LongTensor", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[SampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using 
multinomial sampling. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + logits_warper (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language + modeling head applied before multinomial sampling at each generation step. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`, + :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.SampleDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... 
) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... ]) + >>> # instantiate logits processors + >>> logits_warper = LogitsProcessorList([ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ]) + + >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + this_peer_finished = False # used by synced_gpus only + # auto-regressive generation + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
+ # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." 
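+                # `unfinished_sequences` holds 1 for rows that are still generating and 0 for rows that
+                # already produced `eos_token_id`; the masking below therefore keeps the sampled token for
+                # active rows and forces finished rows to keep emitting `pad_token_id`.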
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return SampleEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return SampleDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + + def beam_search( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[BeamSearchOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using beam search decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. 
+ output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utilsBeamSearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + logger.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + assert ( + num_beams * batch_size == batch_beam_size + ), f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. + next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + next_token_scores = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = (next_tokens / vocab_size).long() + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + 
else: + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + def beam_sample( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[BeamSampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using beam search with multinomial sampling. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + A derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + logits_warper (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language + modeling head applied before multinomial sampling at each generation step. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
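The index arithmetic inside the beam-search loop above (fold beams into the vocab axis, take ``2 * num_beams`` candidates, then recover the beam index and the token id) is easy to misread; a standalone sketch with toy sizes, assuming only ``torch``::

    import torch

    batch_size, num_beams, vocab_size = 2, 3, 5
    scores = torch.randn(batch_size * num_beams, vocab_size)    # log-probs plus accumulated beam scores

    flat = scores.view(batch_size, num_beams * vocab_size)      # fold beams into the vocab axis

    # keep twice as many candidates as beams so hypotheses that end in eos
    # can be set aside without starving the next step
    top_scores, top_ids = torch.topk(flat, 2 * num_beams, dim=1, largest=True, sorted=True)

    next_indices = torch.div(top_ids, vocab_size, rounding_mode="floor")   # which beam it came from
    next_tokens = top_ids % vocab_size                                     # which token to append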
+ synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id) + ... ]) + >>> # instantiate logits processors + >>> logits_warper = LogitsProcessorList([ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ]) + + >>> outputs = model.beam_sample( + ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs + ... 
) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. 
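The ``synced_gpus`` handshake used by all of these decoding loops follows one pattern: every rank keeps calling ``forward`` until all ranks report they are done, which ZeRO stage 3 requires. A schematic sketch of just that handshake; it assumes an already initialised process group, and ``step`` is a placeholder callable that runs one decoding step and returns True once this rank has finished its own sequences::

    import torch
    import torch.distributed as dist

    def decode_with_synced_gpus(step, max_steps, device):
        this_peer_finished = False
        for _ in range(max_steps):
            # send 1.0 while this rank still has work, 0.0 once it is done
            flag = torch.tensor(0.0 if this_peer_finished else 1.0, device=device)
            dist.all_reduce(flag, op=dist.ReduceOp.SUM)
            if flag.item() == 0.0:
                break                                  # every rank reported "finished"
            finished_here = step()                     # still run forward so ZeRO stage 3 stays in sync
            this_peer_finished = this_peer_finished or finished_here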
+ next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + next_token_scores = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + probs = nn.functional.softmax(next_token_scores, dim=-1) + + next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) + next_token_scores = torch.gather(next_token_scores, -1, next_tokens) + + next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, _indices) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSampleEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return BeamSampleDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + def group_beam_search( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + 
logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ): + r""" + Generates sequences for models with a language modeling head using beam search decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + + model_kwargs: + Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. 
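``beam_sample`` differs from ``beam_search`` only in how the ``2 * num_beams`` candidates per batch entry are chosen: they are drawn from the warped distribution rather than taken by top-k, then re-sorted by score so the beam scorer sees them in the usual order. A toy sketch of that selection step, values random, shapes only::

    import torch
    import torch.nn.functional as F

    batch_size, num_beams, vocab_size = 2, 3, 5
    next_token_scores = torch.randn(batch_size, num_beams * vocab_size)   # already flattened log-scores

    probs = F.softmax(next_token_scores, dim=-1)
    next_tokens = torch.multinomial(probs, num_samples=2 * num_beams)     # sample candidate ids
    next_scores = torch.gather(next_token_scores, -1, next_tokens)        # fetch their scores

    next_scores, order = torch.sort(next_scores, descending=True, dim=1)  # keep them score-ordered
    next_tokens = torch.gather(next_tokens, -1, order)

    next_indices = next_tokens // vocab_size    # originating beam
    next_tokens = next_tokens % vocab_size      # token id within the vocabulary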
+ + Return: + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... HammingDiversityLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run diverse beam search using 6 beams + >>> num_beams = 6 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... num_beam_groups=3 + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3), + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + num_beam_groups = beam_scorer.num_beam_groups + num_sub_beams = num_beams // num_beam_groups + device = input_ids.device + + batch_beam_size, cur_len = input_ids.shape + + assert ( + num_beams * batch_size == batch_beam_size + ), f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + + beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) + # initialise score of first beam of each group with 0 and the rest with 1e-9. This ensures that the beams in + # the same group don't produce same tokens everytime. + beam_scores[:, ::num_sub_beams] = 0 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # predicted tokens in cur_len step + current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) + + # indices which will form the beams in the next time step + reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device) + + # do one decoder step on all beams of all sentences in batch + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + if output_scores: + processed_score = torch.zeros_like(outputs.logits[:, -1, :]) + + for beam_group_idx in range(num_beam_groups): + group_start_idx = beam_group_idx * num_sub_beams + group_end_idx = min(group_start_idx + num_sub_beams, num_beams) + group_size = group_end_idx - group_start_idx + + # indices of beams of current group among all sentences in batch + batch_group_indices = [] + + for batch_idx in range(batch_size): + batch_group_indices.extend( + [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] + ) + group_input_ids = input_ids[batch_group_indices] + + # select outputs of beams of current group only + next_token_logits = outputs.logits[batch_group_indices, -1, :] + + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. + next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * group_size, vocab_size) + vocab_size = next_token_scores.shape[-1] + + next_token_scores = logits_processor( + group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx + ) + next_token_scores = next_token_scores + beam_scores[batch_group_indices].unsqueeze(-1).expand_as( + next_token_scores + ) + + if output_scores: + processed_score[batch_group_indices] = next_token_scores + + # reshape for beam search + next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size) + + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True + ) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + group_input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids[batch_group_indices] = group_input_ids[beam_idx] + group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + current_tokens[batch_group_indices] = group_input_ids[:, -1] + + # (beam_idx // group_size) -> batch_idx + # (beam_idx % group_size) -> offset of idx inside the group + reordering_indices[batch_group_indices] = ( + num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size) + ) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if 
output_scores: + scores += (processed_score,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], reordering_indices) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + +def top_k_top_p_filtering( + logits: "torch.FloatTensor", + top_k: int = 0, + top_p: float = 1.0, + filter_value: float = -float("Inf"), + min_tokens_to_keep: int = 1, +) -> "torch.FloatTensor": + """ + Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + + Args: + logits: logits distribution shape (batch size, vocabulary size) + top_k (:obj:`int`, `optional`, defaults to 0): + If > 0, only keep the top k tokens with highest probability (top-k filtering) + top_p (:obj:`float`, `optional`, defaults to 1.0): + If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus + filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimumber of tokens we keep per batch example in the output. + + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + logits = TopKLogitsWarper(top_k=top_k, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)( + None, logits + ) + + if 0 <= top_p <= 1.0: + logits = TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=min_tokens_to_keep)(None, logits) + + return logits diff --git a/fastNLP/transformers/torch/modeling_outputs.py b/fastNLP/transformers/torch/modeling_outputs.py new file mode 100644 index 00000000..ae972a94 --- /dev/null +++ b/fastNLP/transformers/torch/modeling_outputs.py @@ -0,0 +1,816 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
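For reference, ``top_k_top_p_filtering`` defined at the end of ``generation_utils.py`` above simply chains the two warpers. A usage sketch; the import path assumes the new ``fastNLP.transformers.torch`` layout this patch introduces::

    import torch
    from fastNLP.transformers.torch.generation_utils import top_k_top_p_filtering

    logits = torch.randn(1, 50257)                         # (batch, vocab) scores from an LM head
    filtered = top_k_top_p_filtering(logits, top_k=50, top_p=0.9)

    probs = torch.softmax(filtered, dim=-1)                # filtered entries were set to -inf
    next_token = torch.multinomial(probs, num_samples=1)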
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple + +from .file_utils import ModelOutput +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + + +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
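All of the dataclasses in this new file inherit ``ModelOutput`` from the copied ``file_utils.py``, which behaves as both a dataclass and a tuple: fields left as ``None`` are skipped when indexing. A small sketch, assuming the copy preserves the upstream 4.11.3 semantics::

    import torch
    from fastNLP.transformers.torch.modeling_outputs import BaseModelOutput

    out = BaseModelOutput(last_hidden_state=torch.zeros(2, 4, 8))   # hidden_states / attentions stay None

    out.last_hidden_state.shape    # attribute access -> torch.Size([2, 4, 8])
    out["last_hidden_state"]       # dict-style access
    out[0]                         # tuple-style indexing, None fields are skipped
    out.to_tuple()                 # -> (tensor,) with the None fields dropped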
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + pooler_output: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + """ + + last_hidden_state: "torch.FloatTensor" = None + pooler_output: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
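The ``past_key_values`` field documented in these causal-LM outputs is what makes incremental decoding cheap: each step feeds only the newest token plus the cached keys and values back into the model. A sketch with the GPT-2 port bundled in this patch; the checkpoint name and exact import path are illustrative, not confirmed by the diff::

    import torch
    from fastNLP.transformers.torch.models.gpt2 import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
    out = model(input_ids, use_cache=True)               # logits + past_key_values
    past = out.past_key_values                           # one (key, value) pair per layer

    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    out = model(next_token, past_key_values=past)        # only the new token is re-encoded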
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`torch.FloatTensor` tuples of length :obj:`config.n_layers`, with each tuple containing the + cached key, value states of the self-attention and the cross-attention layers if model is used in + encoder-decoder setting. Only relevant if ``config.is_decoder = True``. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Masked language modeling (MLM) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. 
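[Editor's note] These output classes all inherit `ModelOutput`, so the documented fields can be read by attribute, by key, or by position, with `None` fields skipped in the tuple view. A minimal sketch, assuming `MaskedLMOutput` is importable from the migrated `fastNLP.transformers.torch.modeling_outputs` module:

    import torch
    from fastNLP.transformers.torch.modeling_outputs import MaskedLMOutput  # assumed module path

    out = MaskedLMOutput(logits=torch.randn(2, 5, 100))   # loss/hidden_states/attentions stay None
    print(out.logits.shape)       # attribute access
    print(out["logits"].shape)    # dict-style access
    print(out[0].shape)           # positional access; None fields are dropped, so index 0 is logits
    print(out.loss)               # None -- no labels were provided
    print(len(out.to_tuple()))    # 1 -- only the non-None fields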
+ decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class NextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): + Next sequence prediction (classification) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. 
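[Editor's note] As stated above, the `loss` field only appears when `labels` are passed. A minimal sketch with a toy BERT classifier, assuming `BertConfig` and `BertForSequenceClassification` are re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    import torch
    from fastNLP.transformers.torch.models.bert import BertConfig, BertForSequenceClassification  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64, num_labels=3)
    model = BertForSequenceClassification(config).eval()
    input_ids = torch.randint(0, 100, (2, 8))

    out = model(input_ids=input_ids, return_dict=True)                # SequenceClassifierOutput
    print(out.logits.shape, out.loss)                                 # torch.Size([2, 3]) None

    out = model(input_ids=input_ids, labels=torch.tensor([0, 2]), return_dict=True)
    print(out.loss)                                                   # cross-entropy scalar, present because labels were passed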
+ decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + start_logits: "torch.FloatTensor" = None + end_logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
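[Editor's note] `start_logits` and `end_logits` are per-token span scores; the predicted span is simply their argmax. A minimal sketch, assuming `BertForQuestionAnswering` is re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    import torch
    from fastNLP.transformers.torch.models.bert import BertConfig, BertForQuestionAnswering  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64)
    model = BertForQuestionAnswering(config).eval()
    input_ids = torch.randint(0, 100, (1, 12))

    out = model(input_ids=input_ids, return_dict=True)    # QuestionAnsweringModelOutput
    print(out.start_logits.shape, out.end_logits.shape)   # torch.Size([1, 12]) torch.Size([1, 12])
    span = (out.start_logits.argmax(-1).item(), out.end_logits.argmax(-1).item())
    print(span)   # predicted (start, end) token positions -- meaningless here, the model is untrained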
+ encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + start_logits: "torch.FloatTensor" = None + end_logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None diff --git a/fastNLP/transformers/torch/modeling_utils.py b/fastNLP/transformers/torch/modeling_utils.py new file mode 100644 index 00000000..d1d5c2f3 --- /dev/null +++ b/fastNLP/transformers/torch/modeling_utils.py @@ -0,0 +1,1888 @@ +import inspect +import os +import re +from contextlib import contextmanager +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +from .activations import get_activation +from .configuration_utils import PretrainedConfig +from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled +from .utils.versions import require_version_core +from .file_utils import ( + DUMMY_INPUTS, + WEIGHTS_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, + is_remote_url, +) +from .generation_utils import GenerationMixin +from fastNLP.core.log import logger +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + from torch import Tensor, device, nn, save as torch_save + from torch.nn import Module + try: + from torch.nn import Identity + except ImportError: + # Older PyTorch compatibility + class Identity(nn.Module): + r"""A placeholder identity operator that is argument-insensitive.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, input): + return input +else: + from fastNLP.core.utils.dummy_class import( + DummyClass as Module, + DummyClass as torch_save, + ) + +_init_weights = True + +@contextmanager +def no_init_weights(_enable=True): + """ + Context manager to globally disable weight initialization to speed up loading large models. + + TODO(Patrick): Delete safety argument `_enable=True` at next major version. . + """ + global _init_weights + if _enable: + _init_weights = False + try: + yield + finally: + _init_weights = True + +def find_pruneable_heads_and_indices( + heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] +) -> Tuple[Set[int], "torch.LongTensor"]: + """ + Finds the heads and their indices taking :obj:`already_pruned_heads` into account. 
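[Editor's note] The bookkeeping done by `find_pruneable_heads_and_indices` is easiest to see on concrete numbers. A minimal sketch, assuming the function is importable from the new `fastNLP.transformers.torch.modeling_utils` module added by this patch:

    import torch
    from fastNLP.transformers.torch.modeling_utils import find_pruneable_heads_and_indices  # assumed import path

    # 12 heads of size 64; head 0 was pruned in an earlier call, now heads {0, 2} are requested again
    heads, index = find_pruneable_heads_and_indices(
        heads=[0, 2], n_heads=12, head_size=64, already_pruned_heads={0})
    print(heads)         # {2} -- head 0 is dropped from the request because it is already gone
    print(index.shape)   # torch.Size([704]) -- flattened dimensions of the 11 surviving heads (11 * 64)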
+ + Args: + heads (:obj:`List[int]`): List of the indices of heads to prune. + n_heads (:obj:`int`): The number of heads in the model. + head_size (:obj:`int`): The size of each head. + already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads. + + Returns: + :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. + """ + mask = torch.ones(n_heads, head_size) + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in already_pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index: torch.LongTensor = torch.arange(len(mask))[mask].long() + return heads, index + +def get_parameter_device(parameter: Union[Module, GenerationMixin, "ModuleUtilsMixin"]): + try: + return next(parameter.parameters()).device + except StopIteration: + # For nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: Union["nn.Module", GenerationMixin, "ModuleUtilsMixin"]): + try: + return next(parameter.parameters()).dtype + except StopIteration: + # For nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].dtype + +class ModuleUtilsMixin: + """ + A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin. + """ + + @staticmethod + def _hook_rss_memory_pre_forward(module, *args, **kwargs): + try: + import psutil + except (ImportError): + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_pre_forward = mem.rss + return None + + @staticmethod + def _hook_rss_memory_post_forward(module, *args, **kwargs): + try: + import psutil + except (ImportError): + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_post_forward = mem.rss + mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward + module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) + return None + + def add_memory_hooks(self): + """ + Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. + + Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to + zero with :obj:`model.reset_memory_hooks_state()`. 
+ """ + for module in self.modules(): + module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) + module.register_forward_hook(self._hook_rss_memory_post_forward) + self.reset_memory_hooks_state() + + def reset_memory_hooks_state(self): + """ + Reset the :obj:`mem_rss_diff` attribute of each module (see + :func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`). + """ + for module in self.modules(): + module.mem_rss_diff = 0 + module.mem_rss_post_forward = 0 + module.mem_rss_pre_forward = 0 + + @property + def device(self) -> "device": + """ + :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> "torch.dtype": + """ + :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). + """ + return get_parameter_dtype(self) + + def invert_attention_mask(self, encoder_attention_mask: "Tensor") -> "Tensor": + """ + Invert an attention mask (e.g., switches 0. and 1.). + + Args: + encoder_attention_mask (:obj:`torch.Tensor`): An attention mask. + + Returns: + :obj:`torch.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + if self.dtype == torch.float16: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + elif self.dtype == torch.float32: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 + else: + raise ValueError( + f"{self.dtype} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`" + ) + + return encoder_extended_attention_mask + + def get_extended_attention_mask(self, attention_mask: "Tensor", input_shape: Tuple[int], device: "device") -> "Tensor": + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def get_head_mask( + self, head_mask: Optional["Tensor"], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> "Tensor": + """ + Prepare the head mask if needed. + + Args: + head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (:obj:`int`): + The number of hidden layers in the model. + is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the attentions scores are computed by chunks or not. + + Returns: + :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or + list with :obj:`[None]` for each layer. 
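[Editor's note] The decoder branch above combines the causal triangle with the padding mask before converting to the additive -10000.0 form. The same arithmetic in isolation, plain torch only:

    import torch

    batch_size, seq_length = 1, 4
    attention_mask = torch.tensor([[1, 1, 1, 0]])          # last position is padding
    seq_ids = torch.arange(seq_length)
    causal = (seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]).float()
    extended = causal[:, None, :, :] * attention_mask[:, None, None, :].float()
    extended = (1.0 - extended) * -10000.0
    print(extended[0, 0])
    # row i holds the bias for query position i: 0.0 for visible keys, -10000.0 for future keys and padding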
+ """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (optionally, trainable or non-embeddings) parameters in the module. + + Args: + only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of trainable parameters + + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of non-embeddings parameters + + Returns: + :obj:`int`: The number of parameters. + """ + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) + ] + non_embedding_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable) + else: + return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable) + + def estimate_tokens(self, input_dict: Dict[str, Union["torch.Tensor", Any]]) -> int: + """ + Helper function to estimate the total number of tokens from the model inputs. + + Args: + inputs (:obj:`dict`): The model inputs. + + Returns: + :obj:`int`: The total number of tokens. + """ + token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key] + if token_inputs: + return sum([token_input.numel() for token_input in token_inputs]) + else: + logger.warn( + "Could not estimate the number of tokens of the input, floating-point operations will not be computed" + ) + return 0 + + def floating_point_ops( + self, input_dict: Dict[str, Union["torch.Tensor", Any]], exclude_embeddings: bool = True + ) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a + batch with this transformer model. Default approximation neglects the quadratic dependency on the number of + tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper + `__ section 2.1. Should be overridden for transformers with parameter + re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths. + + Args: + batch_size (:obj:`int`): + The batch size for the forward pass. + + sequence_length (:obj:`int`): + The number of tokens in each line of the batch. + + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to count embedding and softmax operations. + + Returns: + :obj:`int`: The number of floating-point operations. 
+ """ + + return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) + + +class PreTrainedModel(Module, ModuleUtilsMixin, GenerationMixin): + r""" + Base class for all models. + + :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods + for loading, downloading and saving models as well as a few methods common to all models to: + + * resize the input embeddings, + * prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + + - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of + :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. + - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch + model, taking as arguments: + + - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the + TensorFlow checkpoint. + - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to + the model. + - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint. + + - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in + derived classes of the same architecture adding modules on top of the base model. + - **is_parallelizable** (:obj:`bool`) -- A flag indicating whether this model supports model parallelization. + """ + config_class = None + base_model_prefix = "" + # a list of re pattern of tensor names to ignore from the model when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_missing = None + # a list of re pattern of tensor names to ignore from the weights when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_unexpected = None + # a list of of tensor names to ignore when saving the model (useful for keys that aren't + # trained, but which are deterministic, or tied variables) + _keys_to_ignore_on_save = None + + is_parallelizable = False + supports_gradient_checkpointing = False + + @property + def dummy_inputs(self) -> Dict[str, "torch.Tensor"]: + """ + :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network. + """ + return {"input_ids": torch.tensor(DUMMY_INPUTS)} + + def __init__(self, config: PretrainedConfig, *inputs, **kwargs): + super().__init__() + if not isinstance(config, PretrainedConfig): + raise ValueError( + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " + "`PretrainedConfig`. To create a model from a pretrained model use " + f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + # Save config and origin of the pretrained weights if given in model + self.config = config + self.name_or_path = config.name_or_path + if getattr(self.config, "gradient_checkpointing", False): + self.gradient_checkpointing_enable() + # Remove the attribute now that is has been consumed, so it's no saved in the config. + delattr(self.config, "gradient_checkpointing") + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + + Args: + torch_dtype (:obj:`torch.dtype`, `optional`): + Override the default ``torch.dtype`` and load the model under this dtype. 
+ """ + torch_dtype = kwargs.pop("torch_dtype", None) + + # override default dtype if needed + dtype_orig = None + if torch_dtype is not None: + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + if is_deepspeed_zero3_enabled(): + import deepspeed + + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()): + model = cls(config, **kwargs) + else: + model = cls(config, **kwargs) + + # restore default dtype if it was modified + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + return model + + @classmethod + def _set_default_torch_dtype(cls, dtype: "torch.dtype") -> "torch.dtype": + """ + Change the default dtype and return the previous one. This is needed when wanting to instantiate the model + under specific dtype. + + Args: + dtype (:obj:`torch.dtype`): + a floating dtype to set to. + + Returns: + :obj:`torch.dtype`: the original ``dtype`` that can be used to restore ``torch.set_default_dtype(dtype)`` + if it was modified. If it wasn't, returns :obj:`None`. + + Note ``set_default_dtype`` currently only works with floating-point types and asserts if for example, + ``torch.int64`` is passed. So if a non-float ``dtype`` is passed this functions will throw an exception. + """ + if not dtype.is_floating_point: + raise ValueError( + f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype" + ) + + logger.info(f"Instantiating {cls.__name__} model under default dtype {dtype}.") + dtype_orig = torch.get_default_dtype() + torch.set_default_dtype(dtype) + return dtype_orig + + @property + def base_model(self) -> "nn.Module": + """ + :obj:`torch.nn.Module`: The main body of the model. + """ + return getattr(self, self.base_model_prefix, self) + + def get_input_embeddings(self) -> "nn.Module": + """ + Returns the model's input embeddings. + + Returns: + :obj:`nn.Module`: A torch module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + else: + raise NotImplementedError + + def set_input_embeddings(self, value: "nn.Module"): + """ + Set model's input embeddings. + + Args: + value (:obj:`nn.Module`): A module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + base_model.set_input_embeddings(value) + else: + raise NotImplementedError + + def get_output_embeddings(self) -> "nn.Module": + """ + Returns the model's output embeddings. + + Returns: + :obj:`nn.Module`: A torch module mapping hidden states to vocabulary. + """ + return None # Overwrite for models with output embeddings + + def _init_weights(self, module): + """ + Initialize the weights. This method should be overridden by derived class. + """ + raise NotImplementedError(f"Make sure `_init_weigths` is implemented for {self.__class__}") + + def tie_weights(self): + """ + Tie the weights between the input embeddings and the output embeddings. + + If the :obj:`torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning + the weights instead. 
+ """ + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None and self.config.tie_word_embeddings: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + if self.config.is_encoder_decoder and self.config.tie_encoder_decoder: + if hasattr(self, self.base_model_prefix): + self = getattr(self, self.base_model_prefix) + self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) + + for module in self.modules(): + if hasattr(module, "_tie_weights"): + module._tie_weights() + + @staticmethod + def _tie_encoder_decoder_weights(encoder: "nn.Module", decoder: "nn.Module", base_model_prefix: str): + uninitialized_encoder_weights: List[str] = [] + if decoder.__class__ != encoder.__class__: + logger.info( + f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized." + ) + + def tie_encoder_to_decoder_recursively( + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + depth=0, + ): + assert isinstance(decoder_pointer, nn.Module) and isinstance( + encoder_pointer, nn.Module + ), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module" + if hasattr(decoder_pointer, "weight"): + assert hasattr(encoder_pointer, "weight") + encoder_pointer.weight = decoder_pointer.weight + if hasattr(decoder_pointer, "bias"): + assert hasattr(encoder_pointer, "bias") + encoder_pointer.bias = decoder_pointer.bias + return + + encoder_modules = encoder_pointer._modules + decoder_modules = decoder_pointer._modules + if len(decoder_modules) > 0: + assert ( + len(encoder_modules) > 0 + ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" + + all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()]) + encoder_layer_pos = 0 + for name, module in decoder_modules.items(): + if name.isdigit(): + encoder_name = str(int(name) + encoder_layer_pos) + decoder_name = name + if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len( + encoder_modules + ) != len(decoder_modules): + # this can happen if the name corresponds to the position in a list module list of layers + # in this case the decoder has added a cross-attention that the encoder does not have + # thus skip this step and subtract one layer pos from encoder + encoder_layer_pos -= 1 + continue + elif name not in encoder_modules: + continue + elif depth > 500: + raise ValueError( + "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model." 
+ ) + else: + decoder_name = encoder_name = name + tie_encoder_to_decoder_recursively( + decoder_modules[decoder_name], + encoder_modules[encoder_name], + module_name + "/" + name, + uninitialized_encoder_weights, + depth=depth + 1, + ) + all_encoder_weights.remove(module_name + "/" + encoder_name) + + uninitialized_encoder_weights += list(all_encoder_weights) + + # tie weights recursively + tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights) + if len(uninitialized_encoder_weights) > 0: + logger.warning( + f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" + ) + + def _tie_or_clone_weights(self, output_embeddings, input_embeddings): + """Tie or clone module weights depending of whether we are using TorchScript or not""" + if self.config.torchscript: + output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) + else: + output_embeddings.weight = input_embeddings.weight + + if getattr(output_embeddings, "bias", None) is not None: + output_embeddings.bias.data = nn.functional.pad( + output_embeddings.bias.data, + ( + 0, + output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0], + ), + "constant", + 0, + ) + if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): + output_embeddings.out_features = input_embeddings.num_embeddings + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> "nn.Embedding": + """ + Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method. + + Arguments: + new_num_tokens (:obj:`int`, `optional`): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`, + just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing + anything. + + Return: + :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. + """ + model_embeds = self._resize_token_embeddings(new_num_tokens) + if new_num_tokens is None: + return model_embeds + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + self.vocab_size = new_num_tokens + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.get_input_embeddings() + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.set_input_embeddings(new_embeddings) + + # if word embeddings are not tied, make sure that lm head is resized as well + if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: + old_lm_head = self.get_output_embeddings() + new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) + self.set_output_embeddings(new_lm_head) + + return self.get_input_embeddings() + + def _get_resized_embeddings( + self, old_embeddings: "nn.Embedding", new_num_tokens: Optional[int] = None + ) -> "nn.Embedding": + """ + Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly + initialized vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_embeddings (:obj:`torch.nn.Embedding`): + Old embeddings to be resized. 
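[Editor's note] `resize_token_embeddings` updates `config.vocab_size` and re-ties any output embeddings to the new matrix. A minimal sketch, assuming `BertForMaskedLM` is re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    from fastNLP.transformers.torch.models.bert import BertConfig, BertForMaskedLM  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64)
    model = BertForMaskedLM(config)

    new_emb = model.resize_token_embeddings(108)         # e.g. after adding 8 new tokens to the tokenizer
    print(new_emb.weight.shape)                          # torch.Size([108, 32])
    print(model.config.vocab_size)                       # 108
    print(model.get_output_embeddings().weight.shape)    # torch.Size([108, 32]) -- re-tied by tie_weights()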
+ new_num_tokens (:obj:`int`, `optional`): + New number of tokens in the embedding matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens + :obj:`torch.nn.Embedding`` module of the model without doing anything. + + Return: + :obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if + :obj:`new_num_tokens` is :obj:`None` + """ + if new_num_tokens is None: + return old_embeddings + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=None): + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + else: + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + + if old_num_tokens == new_num_tokens: + return old_embeddings + + if not isinstance(old_embeddings, nn.Embedding): + raise TypeError( + f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}." + f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Embedding}." + ) + + # Build new embeddings + new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim).to( + self.device, dtype=old_embeddings.weight.dtype + ) + + # initialize all new embeddings (in particular added tokens) + self._init_weights(new_embeddings) + + # Copy token embeddings from the previous weights + + # numbers of tokens to copy + n = min(old_num_tokens, new_num_tokens) + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + else: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + + return new_embeddings + + def _get_resized_lm_head( + self, old_lm_head: "nn.Linear", new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False + ) -> "nn.Linear": + """ + Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_lm_head (:obj:`torch.nn.Linear`): + Old lm head liner layer to be resized. + new_num_tokens (:obj:`int`, `optional`): + New number of tokens in the linear matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens + :obj:`torch.nn.Linear`` module of the model without doing anything. + transposed (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether ``old_lm_head`` is transposed or not. If True ``old_lm_head.size()`` is ``lm_head_dim, + vocab_size`` else ``vocab_size, lm_head_dim``. 
+ + Return: + :obj:`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if + :obj:`new_num_tokens` is :obj:`None` + """ + if new_num_tokens is None: + return old_lm_head + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=None): + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + else: + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + + if old_num_tokens == new_num_tokens: + return old_lm_head + + if not isinstance(old_lm_head, nn.Linear): + raise TypeError( + f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}." + f"You should either use a different resize function or make sure that `old_lm_head` are an instance of {nn.Linear}." + ) + + # Build new lm head + new_lm_head_shape = (old_lm_head_dim, new_num_tokens) if not transposed else (new_num_tokens, old_lm_head_dim) + has_new_lm_head_bias = old_lm_head.bias is not None + new_lm_head = nn.Linear(*new_lm_head_shape, bias=has_new_lm_head_bias).to(self.device) + + # initialize new lm head (in particular added tokens) + self._init_weights(new_lm_head) + + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + + # XXX: put the long block of code in a wrapper + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[ + :num_tokens_to_copy, : + ] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[ + :, :num_tokens_to_copy + ] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + else: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + + return new_lm_head + + def resize_position_embeddings(self, new_num_position_embeddings: int): + raise NotImplementedError( + f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def get_position_embeddings(self) -> Union["nn.Embedding", Tuple["nn.Embedding"]]: + raise NotImplementedError( + f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def init_weights(self): + """ + If needed prunes and maybe initializes weights. + """ + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + + if _init_weights: + # Initialize weights + self.apply(self._init_weights) + + # Tie weights should be skipped when not initializing all weights + # since from_pretrained(...) 
calls tie weights anyways + self.tie_weights() + + def prune_heads(self, heads_to_prune: Dict[int, List[int]]): + """ + Prunes heads of the base model. + + Arguments: + heads_to_prune (:obj:`Dict[int, List[int]]`): + Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of + heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads + 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + """ + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads + for layer, heads in heads_to_prune.items(): + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + self.base_model._prune_heads(heads_to_prune) + + def gradient_checkpointing_enable(self, flag: bool = True): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self, flag: bool = True): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + save_config: bool = True, + state_dict: Optional[dict] = None, + save_function: Callable = torch_save, + **kwargs, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. + + Arguments: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + save_config (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to save the config of the model. Useful when in distributed training like TPUs and need + to call this function on all processes. In this case, set :obj:`save_config=True` only on the main + process to avoid race conditions. + state_dict (nested dictionary of :obj:`torch.Tensor`): + The state dictionary of the model to save. Will default to :obj:`self.state_dict()`, but can be used to + only save parts of the model or if special precautions need to be taken when recovering the state + dictionary of a model (like when using model parallelism). + save_function (:obj:`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace :obj:`torch.save` by another method. + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. 
+ + kwargs: + Additional key word arguments passed along to the + :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method. + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + # Only save the model itself if we are using distributed training + model_to_save = unwrap_model(self) + + # save the string version of dtype to the config, e.g. convert torch.float32 => "float32" + # we currently don't use this setting automatically, but may start to use with v5 + dtype = get_parameter_dtype(model_to_save) + model_to_save.config.torch_dtype = str(dtype).split(".")[1] + + # Attach architecture to the config + model_to_save.config.architectures = [model_to_save.__class__.__name__] + + # Save the config + if save_config: + model_to_save.config.save_pretrained(save_directory) + + # Save the model + if state_dict is None: + state_dict = model_to_save.state_dict() + + # Handle the case where some state_dict keys shouldn't be saved + if self._keys_to_ignore_on_save is not None: + state_dict = {k: v for k, v in state_dict.items() if k not in self._keys_to_ignore_on_save} + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + save_function(state_dict, output_model_file) + + logger.info(f"Model weights saved in {output_model_file}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + r""" + Instantiate a pretrained pytorch model from a pre-trained model configuration. + + The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To + train the model, you should first set it back in training mode with ``model.train()``. + + The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, `optional`): + Can be either: + + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing model weights saved using + :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. + - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In + this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided + as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in + a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a `flax checkpoint file` in `.msgpack` format (e.g, + ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set + to :obj:`True`. + - :obj:`None` if you are both providing the configuration and state dictionary (resp. 
with keyword + arguments ``config`` and ``state_dict``). + model_args (sequence of positional arguments, `optional`): + All remaining positional arguments will be passed to the underlying model's ``__init__`` method. + config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`): + Can be either: + + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, + - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). + - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded + by supplying the save directory. + - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a + configuration JSON file named `config.json` is found in the directory. + state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using + :func:`~transformers.PreTrainedModel.save_pretrained` and + :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + cache_dir (:obj:`Union[str, os.PathLike]`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + ``pretrained_model_name_or_path`` argument). + from_flax (:obj:`bool`, `optional`, defaults to :obj:`False`): + Load the model weights from a Flax checkpoint save file (see docstring of + ``pretrained_model_name_or_path`` argument). + ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to raise an error if some of the weights from the checkpoint do not have the same size + as the weights of the model (if for instance, you are instantiating a model with 10 labels from a + checkpoint with 3 labels). + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str], `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. 
If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + mirror(:obj:`str`, `optional`): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + _fast_init(:obj:`bool`, `optional`, defaults to `:obj:`True`): + Whether or not to disable fast initialization. + low_cpu_mem_usage(:obj:`bool`, `optional`, defaults to `:obj:`False`): + Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + This is an experimental feature and a subject to change at any moment. + torch_dtype (:obj:`str` or :obj:`torch.dtype`, `optional`): + Override the default ``torch.dtype`` and load the model under this dtype. If ``"auto"`` is passed the + dtype will be automatically derived from the model's weights. + + .. warning:: + + One should only disable `_fast_init` to ensure backwards compatibility with + ``transformers.__version__ < 4.6.0`` for seeded model initialization. This argument will be removed + at the next major version. See `pull request 11471 + `__ for more information. + + kwargs (remaining dictionary of keyword arguments, `optional`): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or + automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the + underlying model's ``__init__`` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class + initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of + ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute + with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration + attribute will be passed to the underlying model's ``__init__`` function. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + .. note:: + + Activate the special `"offline-mode" + `__ to use this method in a firewalled + environment. + + Examples:: + + >>> from transformers import BertConfig, BertModel + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BertModel.from_pretrained('bert-base-uncased') + >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). + >>> model = BertModel.from_pretrained('./test/saved_model/') + >>> # Update configuration during loading. + >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True) + >>> assert model.config.output_attentions == True + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). 
+ >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) + >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower) + >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True) + + """ + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + mirror = kwargs.pop("mirror", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + _fast_init = kwargs.pop("_fast_init", True) + torch_dtype = kwargs.pop("torch_dtype", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) + + user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {[WEIGHTS_NAME]} found in " + f"directory {pretrained_model_name_or_path}" + ) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + raise ValueError( + f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, which is not supported" + ) + else: + # set correct filename + filename = WEIGHTS_NAME + + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=filename, + revision=revision, + mirror=mirror, + ) + + try: + # Load from URL or cache if already cached + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + except EnvironmentError as err: + 
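+ # Log the original download/cache error, then re-raise it below as an
+ # EnvironmentError whose message lists the likely causes: a wrong model id,
+ # a wrong local path, or (when given) an invalid `revision`.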
logger.error(err) + msg = ( + f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + if resolved_archive_file == archive_file: + logger.info(f"loading weights file {archive_file}") + else: + logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # load pt weights early so that we know which dtype to init the model under + if state_dict is None: + try: + state_dict = torch.load(resolved_archive_file, map_location="cpu") + except Exception as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' " + f"at '{resolved_archive_file}'" + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " + ) + + # set dtype to instantiate the model under: + # 1. If torch_dtype is not None, we use that dtype + # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first + # weights entry - we assume all weights are of the same dtype + # we also may have config.torch_dtype available, but we won't rely on it till v5 + dtype_orig = None + if torch_dtype is not None: + if isinstance(torch_dtype, str): + if torch_dtype == "auto": + torch_dtype = next(iter(state_dict.values())).dtype + else: + raise ValueError( + f"`torch_dtype` can be either a `torch.dtype` or `auto`, but received {torch_dtype}" + ) + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + if low_cpu_mem_usage: + # save the keys + loaded_state_dict_keys = [k for k in state_dict.keys()] + del state_dict # free CPU memory - will reload again later + + config.name_or_path = pretrained_model_name_or_path + + # Instantiate model. 
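+ # Under DeepSpeed ZeRO-3 the constructor runs inside `deepspeed.zero.Init` so that
+ # parameters are partitioned across devices as they are created; in both branches
+ # `no_init_weights(_enable=_fast_init)` skips random initialization of weights that
+ # the checkpoint is about to overwrite.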
+ if is_deepspeed_zero3_enabled(): + import deepspeed + + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()): + with no_init_weights(_enable=_fast_init): + model = cls(config, *model_args, **model_kwargs) + else: + with no_init_weights(_enable=_fast_init): + model = cls(config, *model_args, **model_kwargs) + + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + if low_cpu_mem_usage: + cls._load_state_dict_into_model_low_mem(model, loaded_state_dict_keys, resolved_archive_file) + else: + model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_state_dict_into_model( + model, + state_dict, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _fast_init=_fast_init, + ) + + # make sure token embedding weights are still tied if needed + model.tie_weights() + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + + if output_loading_info: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + return model, loading_info + + return model + + @classmethod + def _load_state_dict_into_model( + cls, model, state_dict, pretrained_model_name_or_path, ignore_mismatched_sizes=False, _fast_init=True + ): + + # Convert old format to new format if needed from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + # Retrieve missing & unexpected_keys + model_state_dict = model.state_dict() + expected_keys = list(model_state_dict.keys()) + loaded_keys = list(state_dict.keys()) + prefix = model.base_model_prefix + + has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) + expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) + + # key re-naming operations are never done on the keys + # that are loaded, but always on the keys of the newly initialized model + remove_prefix = not has_prefix_module and expects_prefix_module + add_prefix = has_prefix_module and not expects_prefix_module + + if remove_prefix: + expected_keys_not_prefixed = [s for s in expected_keys if not s.startswith(prefix)] + expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys] + elif add_prefix: + expected_keys = [".".join([prefix, s]) for s in expected_keys] + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not + # matching the weights in the model. 
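+ # They are only collected when `ignore_mismatched_sizes=True`; the offending entries
+ # are removed from `state_dict` so the corresponding model weights keep their fresh
+ # initialization instead of failing to load.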
+ mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + model_key = checkpoint_key + if remove_prefix and checkpoint_key.startswith(prefix): + model_key = ".".join(checkpoint_key.split(".")[1:]) + elif add_prefix: + model_key = f"{prefix}.{checkpoint_key}" + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + + # Some models may have keys that are not in the state by design, removing them before needlessly warning + # the user. + if cls._keys_to_ignore_on_load_missing is not None: + for pat in cls._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if _fast_init: + # retrieve unintialized modules and initialize + uninitialized_modules = model.retrieve_modules_from_names( + missing_keys, add_prefix=add_prefix, remove_prefix=remove_prefix + ) + for module in uninitialized_modules: + model._init_weights(module) + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + error_msgs = [] + + # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants + # so we need to apply the function recursively. + def load(module: nn.Module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + if is_deepspeed_zero3_enabled(): + import deepspeed + + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + # Make sure we are able to load base models as well as derived models (with heads) + start_prefix = "" + model_to_load = model + if not hasattr(model, cls.base_model_prefix) and has_prefix_module: + start_prefix = cls.base_model_prefix + "." + if hasattr(model, cls.base_model_prefix) and not has_prefix_module: + model_to_load = getattr(model, cls.base_model_prefix) + if any(key in expected_keys_not_prefixed for key in loaded_keys): + raise ValueError( + "The state dictionary of the model you are training to load is corrupted. Are you sure it was " + "properly saved?" 
+ ) + + load(model_to_load, prefix=start_prefix) + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " + f"initializing {model.__class__.__name__}: {unexpected_keys}\n" + f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task " + f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n" + f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect " + f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized: {missing_keys}\n" + f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" + f"If your task is similar to the task the model of the checkpoint was trained on, " + f"you can already use {model.__class__.__name__} for predictions without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized because the shapes did not match:\n{mismatched_warning}\n" + f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + + def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False): + module_keys = set([".".join(key.split(".")[:-1]) for key in names]) + + # torch.nn.ParameterList is a special case where two parameter keywords + # are appended to the module name, *e.g.* bert.special_embeddings.0 + module_keys = module_keys.union(set([".".join(key.split(".")[:-2]) for key in names if key[-1].isdigit()])) + + retrieved_modules = [] + # retrieve all modules that has at least one missing weight name + for name, module in self.named_modules(): + if remove_prefix: + name = ".".join(name.split(".")[1:]) if name.startswith(self.base_model_prefix) else name + elif add_prefix: + name = ".".join([self.base_model_prefix, name]) if len(name) > 0 else self.base_model_prefix + + if name in module_keys: + retrieved_modules.append(module) + + return retrieved_modules + + @classmethod + def _load_state_dict_into_model_low_mem(cls, model, loaded_state_dict_keys, resolved_archive_file): + """ + This is an experimental function that loads the model using ~1.x model size CPU memory + + Before it gets called we do: + + 1. 
save which state_dict keys we have + 2. drop state_dict before model is created, since the latter takes 1x model size memory + + Here then we continue: + + 3. switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict + 4. load state_dict 2nd time + 5. replace the params/buffers from the state_dict + + Currently, it doesn't handle missing_keys, unexpected_keys, mismatched_keys. It can't handle deepspeed. + """ + + require_version_core("torch>=1.9") + if is_deepspeed_zero3_enabled(): + raise ValueError("low_cpu_mem_usage arg cannot be used with DeepSpeed ZeRO-3") + + # a helper util to find the last sub-module and the param/buffer name + def find_submodule_and_param_name(model, long_key): + split_key = long_key.split(".") + submodule = model + while len(split_key) > 1: + if hasattr(submodule, split_key[0]): + submodule = getattr(submodule, split_key[0]) + del split_key[0] + else: + submodule = None + break + return submodule, split_key[0] + + # dematerialize param storage for keys that are going to be replaced by state_dict, by + # putting those on the meta device + for k in loaded_state_dict_keys: + submodule, param_name = find_submodule_and_param_name(model, k) + if submodule is not None: + # selectively switch to the meta device only those params/buffers that will + # be next replaced from state_dict. This a complex way to do p.to_("meta") + # since we have no in-place to_ for tensors. + new_val = getattr(submodule, param_name) + if isinstance(new_val, torch.nn.Parameter): + # isinstance returns False for Params on meta device, so switch after the check + new_val = torch.nn.Parameter(new_val.to("meta")) + else: + new_val = new_val.to("meta") + setattr(submodule, param_name, new_val) + + # only now can load state_dict + state_dict = torch.load(resolved_archive_file, map_location="cpu") + + # materialize state_dict entries one by one on CPU + for k in loaded_state_dict_keys: + submodule, param_name = find_submodule_and_param_name(model, k) + if submodule is not None: + new_val = state_dict[k] + if isinstance(getattr(submodule, param_name), torch.nn.Parameter): + new_val = torch.nn.Parameter(new_val) + setattr(submodule, param_name, new_val) + + del state_dict + +class Conv1D(Module): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + + Basically works like a linear layer but the weights are transposed. + + Args: + nf (:obj:`int`): The number of output features. + nx (:obj:`int`): The number of input features. + """ + + def __init__(self, nf, nx): + super().__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = nn.Parameter(w) + self.bias = nn.Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class SequenceSummary(Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config (:class:`~transformers.PretrainedConfig`): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (:obj:`str`) -- The method to use to make this summary. 
Accepted values are: + + - :obj:`"last"` -- Take the last token hidden state (like XLNet) + - :obj:`"first"` -- Take the first token hidden state (like Bert) + - :obj:`"mean"` -- Take the mean of all tokens hidden states + - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - :obj:`"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to + :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`). + - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the + output, another string or :obj:`None` will add no activation. + - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and + activation. + - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and + activation. + """ + + def __init__(self, config: PretrainedConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else Identity() + + self.first_dropout = Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: "torch.FloatTensor", cls_index: Optional["torch.LongTensor"] = None + ) -> "torch.FloatTensor": + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`): + Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification + token. + + Returns: + :obj:`torch.FloatTensor`: The summary of the sequence hidden states. 
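+ 
+ Example (a minimal sketch, assuming ``config`` is a GPT-2 style config and
+ ``hidden_states`` / ``cls_index`` are tensors with the shapes described above)::
+ 
+ >>> summary = SequenceSummary(config)
+ >>> summary_vector = summary(hidden_states, cls_index=cls_index)
+ >>> # shape: (batch_size, num_labels) if a projection to labels is configured,
+ >>> # otherwise (batch_size, hidden_size)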
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +def unwrap_model(model: "nn.Module") -> "nn.Module": + """ + Recursively unwraps a model from potential containers (as used in distributed training). + + Args: + model (:obj:`torch.nn.Module`): The model to unwrap. + """ + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + +def prune_linear_layer(layer: "nn.Linear", index: "torch.LongTensor", dim: int = 0) -> "nn.Linear": + """ + Prune a linear layer to keep only entries in index. + + Used to remove heads. + + Args: + layer (:obj:`torch.nn.Linear`): The layer to prune. + index (:obj:`torch.LongTensor`): The indices to keep in the layer. + dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices. + + Returns: + :obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + +def prune_conv1d_layer(layer: Conv1D, index: "torch.LongTensor", dim: int = 1) -> Conv1D: + """ + Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights + are transposed. + + Used to remove heads. + + Args: + layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune. + index (:obj:`torch.LongTensor`): The indices to keep in the layer. + dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices. + + Returns: + :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`. 
+ """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if dim == 0: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + +def apply_chunking_to_forward( + forward_fn: Callable[..., "torch.Tensor"], chunk_size: int, chunk_dim: int, *input_tensors +) -> "torch.Tensor": + """ + This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the + dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory. + + If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as + directly applying :obj:`forward_fn` to :obj:`input_tensors`. + + Args: + forward_fn (:obj:`Callable[..., torch.Tensor]`): + The forward function of the model. + chunk_size (:obj:`int`): + The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`. + chunk_dim (:obj:`int`): + The dimension over which the :obj:`input_tensors` should be chunked. + input_tensors (:obj:`Tuple[torch.Tensor]`): + The input tensors of ``forward_fn`` which will be chunked + + Returns: + :obj:`torch.Tensor`: A tensor with the same shape as the :obj:`forward_fn` would have given if applied`. + + + Examples:: + + # rename the usual forward() fn to forward_chunk() + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + # implement a chunked forward function + def forward(self, hidden_states): + return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + """ + + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return torch.cat(output_chunks, dim=chunk_dim) + + 
return forward_fn(*input_tensors) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/__init__.py b/fastNLP/transformers/torch/models/__init__.py new file mode 100644 index 00000000..ddf3005f --- /dev/null +++ b/fastNLP/transformers/torch/models/__init__.py @@ -0,0 +1,5 @@ +from .bart import * +from .bert import * +from .cpt import * +from .gpt2 import * +from .roberta import * \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/auto/configuration_auto.py b/fastNLP/transformers/torch/models/auto/configuration_auto.py new file mode 100644 index 00000000..bcd7576c --- /dev/null +++ b/fastNLP/transformers/torch/models/auto/configuration_auto.py @@ -0,0 +1,541 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Config class. """ +import importlib +import re +from collections import OrderedDict +from typing import List, Union + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.transformers.torch.file_utils import CONFIG_NAME +from fastNLP.core.log import logger + + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + # Add configs here + ("fnet", "FNetConfig"), + ("gptj", "GPTJConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("beit", "BeitConfig"), + ("rembert", "RemBertConfig"), + ("visual_bert", "VisualBertConfig"), + ("canine", "CanineConfig"), + ("roformer", "RoFormerConfig"), + ("clip", "CLIPConfig"), + ("bigbird_pegasus", "BigBirdPegasusConfig"), + ("deit", "DeiTConfig"), + ("luke", "LukeConfig"), + ("detr", "DetrConfig"), + ("gpt_neo", "GPTNeoConfig"), + ("big_bird", "BigBirdConfig"), + ("speech_to_text_2", "Speech2Text2Config"), + ("speech_to_text", "Speech2TextConfig"), + ("vit", "ViTConfig"), + ("wav2vec2", "Wav2Vec2Config"), + ("m2m_100", "M2M100Config"), + ("convbert", "ConvBertConfig"), + ("led", "LEDConfig"), + ("blenderbot-small", "BlenderbotSmallConfig"), + ("retribert", "RetriBertConfig"), + ("ibert", "IBertConfig"), + ("mt5", "MT5Config"), + ("t5", "T5Config"), + ("mobilebert", "MobileBertConfig"), + ("distilbert", "DistilBertConfig"), + ("albert", "AlbertConfig"), + ("bert-generation", "BertGenerationConfig"), + ("camembert", "CamembertConfig"), + ("xlm-roberta", "XLMRobertaConfig"), + ("pegasus", "PegasusConfig"), + ("marian", "MarianConfig"), + ("mbart", "MBartConfig"), + ("megatron-bert", "MegatronBertConfig"), + ("mpnet", "MPNetConfig"), + ("bart", "BartConfig"), + ("blenderbot", "BlenderbotConfig"), + ("reformer", "ReformerConfig"), + ("longformer", "LongformerConfig"), + ("roberta", "RobertaConfig"), + ("deberta-v2", "DebertaV2Config"), + ("deberta", "DebertaConfig"), + ("flaubert", "FlaubertConfig"), + ("fsmt", "FSMTConfig"), + ("squeezebert", "SqueezeBertConfig"), + ("hubert", "HubertConfig"), + ("bert", "BertConfig"), + ("openai-gpt", "OpenAIGPTConfig"), + ("gpt2", "GPT2Config"), + ("transfo-xl", "TransfoXLConfig"), + ("xlnet", "XLNetConfig"), + ("xlm-prophetnet", "XLMProphetNetConfig"), + ("prophetnet", "ProphetNetConfig"), 
+ ("xlm", "XLMConfig"), + ("ctrl", "CTRLConfig"), + ("electra", "ElectraConfig"), + ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"), + ("encoder-decoder", "EncoderDecoderConfig"), + ("funnel", "FunnelConfig"), + ("lxmert", "LxmertConfig"), + ("dpr", "DPRConfig"), + ("layoutlm", "LayoutLMConfig"), + ("rag", "RagConfig"), + ("tapas", "TapasConfig"), + ("splinter", "SplinterConfig"), + ] +) + +CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( + [ + # Add archive maps here + ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gpt_neo", "GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("megatron-bert", "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("speech_to_text", "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("speech_to_text_2", "SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("m2m_100", "M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("led", "LED_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mbart", "MBART_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm-roberta", "XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("retribert", "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("funnel", "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("lxmert", "LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("squeezebert", "SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm-prophetnet", 
"XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("prophetnet", "PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ] +) + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + ("fnet", "FNet"), + ("gptj", "GPT-J"), + ("beit", "BeiT"), + ("rembert", "RemBERT"), + ("layoutlmv2", "LayoutLMv2"), + ("visual_bert", "VisualBert"), + ("canine", "Canine"), + ("roformer", "RoFormer"), + ("clip", "CLIP"), + ("bigbird_pegasus", "BigBirdPegasus"), + ("deit", "DeiT"), + ("luke", "LUKE"), + ("detr", "DETR"), + ("gpt_neo", "GPT Neo"), + ("big_bird", "BigBird"), + ("speech_to_text_2", "Speech2Text2"), + ("speech_to_text", "Speech2Text"), + ("vit", "ViT"), + ("wav2vec2", "Wav2Vec2"), + ("m2m_100", "M2M100"), + ("convbert", "ConvBERT"), + ("led", "LED"), + ("blenderbot-small", "BlenderbotSmall"), + ("retribert", "RetriBERT"), + ("ibert", "I-BERT"), + ("t5", "T5"), + ("mobilebert", "MobileBERT"), + ("distilbert", "DistilBERT"), + ("albert", "ALBERT"), + ("bert-generation", "Bert Generation"), + ("camembert", "CamemBERT"), + ("xlm-roberta", "XLM-RoBERTa"), + ("pegasus", "Pegasus"), + ("blenderbot", "Blenderbot"), + ("marian", "Marian"), + ("mbart", "mBART"), + ("megatron-bert", "MegatronBert"), + ("bart", "BART"), + ("reformer", "Reformer"), + ("longformer", "Longformer"), + ("roberta", "RoBERTa"), + ("flaubert", "FlauBERT"), + ("fsmt", "FairSeq Machine-Translation"), + ("squeezebert", "SqueezeBERT"), + ("bert", "BERT"), + ("openai-gpt", "OpenAI GPT"), + ("gpt2", "OpenAI GPT-2"), + ("transfo-xl", "Transformer-XL"), + ("xlnet", "XLNet"), + ("xlm", "XLM"), + ("ctrl", "CTRL"), + ("electra", "ELECTRA"), + ("encoder-decoder", "Encoder decoder"), + ("speech-encoder-decoder", "Speech Encoder decoder"), + ("funnel", "Funnel Transformer"), + ("lxmert", "LXMERT"), + ("deberta-v2", "DeBERTa-v2"), + ("deberta", "DeBERTa"), + ("layoutlm", "LayoutLM"), + ("dpr", "DPR"), + ("rag", "RAG"), + ("xlm-prophetnet", "XLMProphetNet"), + ("prophetnet", "ProphetNet"), + ("mt5", "mT5"), + ("mpnet", "MPNet"), + ("tapas", "TAPAS"), + ("hubert", "Hubert"), + ("barthez", "BARThez"), + ("phobert", "PhoBERT"), + ("cpm", "CPM"), + ("bertweet", "Bertweet"), + ("bert-japanese", "BertJapanese"), + ("byt5", "ByT5"), + ("mbart50", "mBART-50"), + ("splinter", "Splinter"), + ] +) + +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) + + +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + return key.replace("-", "_") + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. 
+ """ + + def __init__(self, mapping): + self._mapping = mapping + self._modules = {} + + def __getitem__(self, key): + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + return getattr(self._modules[module_name], value) + + def keys(self): + return self._mapping.keys() + + def values(self): + return [self[k] for k in self._mapping.keys()] + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + + def __iter__(self): + return iter(self._mapping.keys()) + + def __contains__(self, item): + return item in self._mapping + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + +class _LazyLoadAllMappings(OrderedDict): + """ + A mapping that will load all pairs of key values at the first access (either by indexing, requestions keys, values, + etc.) + + Args: + mapping: The mapping to load. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._initialized = False + self._data = {} + + def _initialize(self): + if self._initialized: + return + logger.warn( + "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP is deprecated and will be removed in v5 of Transformers. " + "It does not contain all available model checkpoints, far from it. Checkout hf.co/models for that.", + FutureWarning, + ) + + for model_type, map_name in self._mapping.items(): + module_name = model_type_to_module_name(model_type) + module = importlib.import_module(f".{module_name}", "transformers.models") + mapping = getattr(module, map_name) + self._data.update(mapping) + + self._initialized = True + + def __getitem__(self, key): + self._initialize() + return self._data[key] + + def keys(self): + self._initialize() + return self._data.keys() + + def values(self): + self._initialize() + return self._data.values() + + def items(self): + self._initialize() + return self._data.keys() + + def __iter__(self): + self._initialize() + return iter(self._data) + + def __contains__(self, item): + self._initialize() + return item in self._data + + +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = _LazyLoadAllMappings(CONFIG_ARCHIVE_MAP_MAPPING_NAMES) + + +def _get_class_name(model_class: Union[str, List[str]]): + if isinstance(model_class, (list, tuple)): + return " or ".join([f":class:`~transformers.{c}`" for c in model_class if c is not None]) + return f":class:`~transformers.{model_class}`" + + +def _list_model_options(indent, config_to_class=None, use_model_types=True): + if config_to_class is None and not use_model_types: + raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") + if use_model_types: + if config_to_class is None: + model_type_to_name = { + model_type: f":class:`~transformers.{config}`" for model_type, config in CONFIG_MAPPING_NAMES.items() + } + else: + model_type_to_name = { + model_type: _get_class_name(model_class) + for model_type, model_class in config_to_class.items() + if model_type in MODEL_NAMES_MAPPING + } + lines = [ + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" + for model_type in sorted(model_type_to_name.keys()) + ] + else: + config_to_name = { + CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) + for config, clas in config_to_class.items() + if config in CONFIG_MAPPING_NAMES + } + config_to_model_name = { + config: MODEL_NAMES_MAPPING[model_type] for model_type, config in 
CONFIG_MAPPING_NAMES.items() + } + lines = [ + f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" + for config_name in sorted(config_to_name.keys()) + ] + return "\n".join(lines) + + +def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] + if use_model_types: + indent = f"{indent} " + lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + + +class AutoConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the :meth:`~transformers.AutoConfig.from_pretrained` class method. + + This class cannot be instantiated directly using ``__init__()`` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + def for_model(cls, model_type: str, *args, **kwargs): + if model_type in CONFIG_MAPPING: + config_class = CONFIG_MAPPING[model_type] + return config_class(*args, **kwargs) + raise ValueError( + f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" + ) + + @classmethod + @replace_list_option_in_docstrings() + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a pretrained model configuration. + + The configuration class to instantiate is selected based on the :obj:`model_type` property of the config object + that is loaded, or when it's missing, by falling back to using pattern matching on + :obj:`pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + Can be either: + + - A string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing a configuration file saved using the + :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the + :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON `file`, e.g., + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. 
+ resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final configuration object. + + If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` + is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., + the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. + kwargs(additional keyword arguments, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the ``return_unused_kwargs`` keyword parameter. + + Examples:: + + >>> from transformers import AutoConfig + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained('bert-base-uncased') + + >>> # Download configuration from huggingface.co (user-uploaded) and cache. + >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased') + + >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`). + >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/') + + >>> # Load a specific configuration file. + >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + + >>> # Change some config attributes when loading a pretrained config. + >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) + >>> config.output_attentions + True + >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True) + >>> config.output_attentions + True + >>> config.unused_kwargs + {'foo': False} + """ + kwargs["_from_auto"] = True + config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict: + config_class = CONFIG_MAPPING[config_dict["model_type"]] + return config_class.from_dict(config_dict, **kwargs) + else: + # Fallback: use pattern matching on the string. + for pattern, config_class in CONFIG_MAPPING.items(): + if pattern in str(pretrained_model_name_or_path): + return config_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized model in {pretrained_model_name_or_path}. 
" + f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings " + f"in its name: {', '.join(CONFIG_MAPPING.keys())}" + ) diff --git a/fastNLP/transformers/torch/models/auto/tokenization_auto.py b/fastNLP/transformers/torch/models/auto/tokenization_auto.py new file mode 100644 index 00000000..e275579f --- /dev/null +++ b/fastNLP/transformers/torch/models/auto/tokenization_auto.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Tokenizer class. """ + +from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +from ...file_utils import ( + is_sentencepiece_available, + is_tokenizers_available, +) + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. + TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), + ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), + ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), + ( + "t5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mt5", + ( + "MT5Tokenizer" if is_sentencepiece_available() else None, + "MT5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), + ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "albert", + ( + "AlbertTokenizer" if is_sentencepiece_available() else None, + "AlbertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "camembert", + ( + "CamembertTokenizer" if is_sentencepiece_available() else None, + "CamembertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "pegasus", + ( + "PegasusTokenizer" if is_sentencepiece_available() else None, + "PegasusTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mbart", + ( + "MBartTokenizer" if is_sentencepiece_available() else None, + "MBartTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "xlm-roberta", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), + ("blenderbot", ("BlenderbotTokenizer", None)), + ("bart", ("BartTokenizer", "BartTokenizerFast")), + ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() 
else None)), + ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ( + "reformer", + ( + "ReformerTokenizer" if is_sentencepiece_available() else None, + "ReformerTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), + ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), + ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), + ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), + ( + "dpr", + ( + "DPRQuestionEncoderTokenizer", + "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "squeezebert", + ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), + ), + ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)), + ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("transfo-xl", ("TransfoXLTokenizer", None)), + ( + "xlnet", + ( + "XLNetTokenizer" if is_sentencepiece_available() else None, + "XLNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("flaubert", ("FlaubertTokenizer", None)), + ("xlm", ("XLMTokenizer", None)), + ("ctrl", ("CTRLTokenizer", None)), + ("fsmt", ("FSMTTokenizer", None)), + ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), + ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), + ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)), + ("rag", ("RagTokenizer", None)), + ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), + ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), + ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), + ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), + ("prophetnet", ("ProphetNetTokenizer", None)), + ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), + ("tapas", ("TapasTokenizer", None)), + ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "big_bird", + ( + "BigBirdTokenizer" if is_sentencepiece_available() else None, + "BigBirdTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), + ("hubert", ("Wav2Vec2CTCTokenizer", None)), + ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("luke", ("LukeTokenizer", None)), + ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), + ("canine", ("CanineTokenizer", None)), + ("bertweet", ("BertweetTokenizer", None)), + ("bert-japanese", ("BertJapaneseTokenizer", None)), + ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), + ("byt5", ("ByT5Tokenizer", None)), + ( 
+ "cpm", + ( + "CpmTokenizer" if is_sentencepiece_available() else None, + "CpmTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), + ("phobert", ("PhobertTokenizer", None)), + ( + "barthez", + ( + "BarthezTokenizer" if is_sentencepiece_available() else None, + "BarthezTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mbart50", + ( + "MBart50Tokenizer" if is_sentencepiece_available() else None, + "MBart50TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "rembert", + ( + "RemBertTokenizer" if is_sentencepiece_available() else None, + "RemBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "clip", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ] + ) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bart/__init__.py b/fastNLP/transformers/torch/models/bart/__init__.py new file mode 100644 index 00000000..127f95b6 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/__init__.py @@ -0,0 +1,20 @@ +__all__ = [ + "BartConfig", + "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", + + "BART_PRETRAINED_MODEL_ARCHIVE_LIST", + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPretrainedModel", + "PretrainedBartModel", + + "BartTokenizer", +] + +from .configuration_bart import BartConfig, BART_PRETRAINED_CONFIG_ARCHIVE_MAP +from .tokenization_bart import BartTokenizer +from .modeling_bart import BartForCausalLM, BartForConditionalGeneration, BartModel, BartForQuestionAnswering, \ + BartForSequenceClassification, BartPretrainedModel, PretrainedBartModel, BART_PRETRAINED_MODEL_ARCHIVE_LIST \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bart/configuration_bart.py b/fastNLP/transformers/torch/models/bart/configuration_bart.py new file mode 100644 index 00000000..3b52bc81 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/configuration_bart.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BART model configuration """ +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.core.log import logger + +__all__ = [ + "BartConfig", + "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", +] + +BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", + # See all BART models at https://huggingface.co/models?filter=bart +} + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to + instantiate a BART model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large + `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or + :class:`~transformers.TFBartModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). + num_labels: (:obj:`int`, `optional`, defaults to 3): + The number of labels to use in :class:`~transformers.BartForSequenceClassification`. 
+ forced_eos_token_id (:obj:`int`, `optional`, defaults to 2): + The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to + :obj:`eos_token_id`. + + Example:: + + >>> from transformers import BartModel, BartConfig + + >>> # Initializing a BART facebook/bart-large style configuration + >>> configuration = BartConfig() + + >>> # Initializing a model from the facebook/bart-large style configuration + >>> model = BartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + num_labels=3, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + is_encoder_decoder=True, + decoder_start_token_id=2, + forced_eos_token_id=2, + **kwargs + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + # ensure backward compatibility for BART CNN models + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + logger.warn( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions." + "The config can simply be saved and uploaded again to be fixed." + ) diff --git a/fastNLP/transformers/torch/models/bart/modeling_bart.py b/fastNLP/transformers/torch/models/bart/modeling_bart.py new file mode 100644 index 00000000..7219f49a --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/modeling_bart.py @@ -0,0 +1,1834 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BART model. """ +import copy +import math +import random +from typing import Optional, Tuple + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import PreTrainedModel +from .configuration_bart import BartConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, MSELoss, Module, Embedding +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module, DummyClass as Embedding + +__all__ = [ + "BART_PRETRAINED_MODEL_ARCHIVE_LIST", + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPretrainedModel", + "PretrainedBartModel", +] + +_CHECKPOINT_FOR_DOC = "facebook/bart-large" +_CONFIG_FOR_DOC = "BartConfig" +_TOKENIZER_FOR_DOC = "BartTokenizer" + + +BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/bart-large", + # See all BART models at https://huggingface.co/models?filter=bart +] + + +def shift_tokens_right(input_ids: "torch.Tensor", pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: "torch.Size", dtype: "torch.dtype", past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: "torch.Tensor", dtype: "torch.dtype", tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
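    A minimal worked sketch of the intended behaviour (illustrative only; the values below are
    made up and simply follow the expansion/inversion described above)::

        >>> mask = torch.tensor([[1, 1, 0]])                  # one padded key position
        >>> additive = _expand_mask(mask, torch.float32)      # shape (1, 1, 3, 3)
        >>> bool((additive == 0).sum() == 6)                  # attended positions contribute 0.0
        True
        >>> bool((additive[..., -1] == torch.finfo(torch.float32).min).all())   # padded key is masked out
        True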
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +class BartLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: "torch.Size", past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class BartAttention(Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: "torch.Tensor", seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: "torch.Tensor", + key_value_states: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + output_attentions: bool = False, + ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = 
torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BartEncoderLayer(Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: "torch.Tensor", + layer_head_mask: "torch.Tensor", + output_attentions: bool = False, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. 
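        Shape-level sketch (illustrative only; the tiny hyper-parameters are made up, and the layer
        is simply expected to preserve the shape of its input)::

            >>> config = BartConfig(d_model=16, encoder_attention_heads=4, encoder_ffn_dim=32)
            >>> layer = BartEncoderLayer(config)
            >>> hidden = torch.randn(2, 5, 16)
            >>> (out,) = layer(hidden, attention_mask=None, layer_head_mask=None)
            >>> out.shape
            torch.Size([2, 5, 16])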
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BartDecoderLayer(Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + encoder_hidden_states: Optional["torch.Tensor"] = None, + encoder_attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + cross_attn_layer_head_mask: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. 
+ past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BartClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: "torch.Tensor"): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BartPretrainedModel(PreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = 
[r"encoder\.version", r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (BartDecoder, BartEncoder)): + module.gradient_checkpointing = value + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class PretrainedBartModel(BartPretrainedModel): + def __init_subclass__(self): + logger.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", + FutureWarning, + ) + + +BART_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BartConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. 
+ + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__ + + Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0, + 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class BartEncoder(BartPretrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`BartEncoderLayer`. 
+ + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.init_weights() + self.gradient_checkpointing = False + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
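        Shape-level sketch (illustrative only; the small configuration below is made up)::

            >>> config = BartConfig(vocab_size=100, d_model=16, encoder_layers=2, decoder_layers=2,
            ...                     encoder_attention_heads=4, decoder_attention_heads=4,
            ...                     encoder_ffn_dim=32, decoder_ffn_dim=32)
            >>> encoder = BartEncoder(config)
            >>> out = encoder(input_ids=torch.tensor([[0, 5, 7, 2]]))
            >>> out.last_hidden_state.shape
            torch.Size([1, 4, 16])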
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BartDecoder(BartPretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BartDecoderLayer` + + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.init_weights() + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. 
Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 + tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + assert attn_mask.size()[0] == ( + len(self.layers) + ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class BartModel(BartPretrainedModel): + def __init__(self, config: BartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, 
self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BART Model with a language modeling head. 
Can be used for summarization.", BART_START_DOCSTRING +) +class BartForConditionalGeneration(BartPretrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"] + + def __init__(self, config: BartConfig): + super().__init__(config) + self.model = BartModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> "nn.Embedding": + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. 
+ + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def prepare_decoder_input_ids_from_labels(self, labels: "torch.Tensor"): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """, + BART_START_DOCSTRING, +) +class BartForSequenceClassification(BartPretrainedModel): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self.classification_head = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + # regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model 
with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BART_START_DOCSTRING, +) +class BartForQuestionAnswering(BartPretrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartDecoderWrapper(BartPretrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the :class:`~transformers.EncoderDecoderModel` framework. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BartDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class BartForCausalLM(BartPretrainedModel): + def __init__(self, config): + super().__init__(config) + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + self.model = BartDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.init_weights() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 + tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two + additional tensors are only required when the model is used as a decoder in a Sequence to Sequence + model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids`` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., + config.vocab_size]``. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + + Returns: + + Example:: + + >>> from transformers import BartTokenizer, BartForCausalLM + + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> model = BartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past: + input_ids = input_ids[:, -1:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/fastNLP/transformers/torch/models/bart/tokenization_bart.py b/fastNLP/transformers/torch/models/bart/tokenization_bart.py new file mode 100644 index 00000000..fe6c1d04 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/tokenization_bart.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
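+
+# NOTE (editorial sketch, not part of the upstream transformers file): a minimal
+# usage example, assuming the standard ``from_pretrained`` API of transformers
+# 4.11.3 is preserved unchanged in this copy:
+#
+#     from fastNLP.transformers.torch.models.bart.tokenization_bart import BartTokenizer
+#
+#     tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+#     batch = tokenizer(["Hello world!"], return_tensors="pt", padding=True)
+#     # ``batch["input_ids"]`` and ``batch["attention_mask"]`` can be fed to BartModel.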
+ +from ..roberta.tokenization_roberta import RobertaTokenizer +from fastNLP.core.log import logger + +__all__ = [ + "BartTokenizer", +] + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +# See all BART models at https://huggingface.co/models?filter=bart +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-base": 1024, + "facebook/bart-large": 1024, + "facebook/bart-large-mnli": 1024, + "facebook/bart-large-cnn": 1024, + "facebook/bart-large-xsum": 1024, + "yjernite/bart_eli5": 1024, +} + + +class BartTokenizer(RobertaTokenizer): + r""" + Construct a BART tokenizer. + + :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass + :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization + parameters and other methods. 
+ """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/fastNLP/transformers/torch/models/bert/__init__.py b/fastNLP/transformers/torch/models/bert/__init__.py new file mode 100644 index 00000000..0edc1d6c --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/__init__.py @@ -0,0 +1,27 @@ +__all__ = [ + "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BertConfig", + + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + + "BasicTokenizer", + "BertTokenizer", + "WordpieceTokenizer", +] + +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer +from .modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForPreTraining, \ + BertForNextSentencePrediction, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, \ + BertLayer, BertLMHeadModel, BertModel, BertPreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bert/configuration_bert.py b/fastNLP/transformers/torch/models/bert/configuration_bert.py new file mode 100644 index 00000000..f8be6082 --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/configuration_bert.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" BERT model configuration """ + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.core.log import logger + +__all__ = [ + "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BertConfig", +] + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json", + "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json", + "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json", + "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json", + "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json", + "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json", + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json", + # See all BERT models at https://huggingface.co/models?filter=bert +} + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a + :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT `bert-base-uncased `__ architecture. 
+ + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`): + Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`, + :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on + :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.) + `__. For more information on :obj:`"relative_key_query"`, please refer to + `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.) + `__. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``config.is_decoder=True``. + classifier_dropout (:obj:`float`, `optional`): + The dropout ratio for the classification head. 
+ + Examples:: + + >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout diff --git a/fastNLP/transformers/torch/models/bert/modeling_bert.py b/fastNLP/transformers/torch/models/bert/modeling_bert.py new file mode 100644 index 00000000..b95da0df --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/modeling_bert.py @@ -0,0 +1,1806 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + + +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +from packaging import version + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .configuration_bert import BertConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +__all__ = [ + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", +] + +_CHECKPOINT_FOR_DOC = "bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bert-base-uncased", + "bert-large-uncased", + "bert-base-cased", + "bert-large-cased", + "bert-base-multilingual-uncased", + "bert-base-multilingual-cased", + "bert-base-chinese", + "bert-base-german-cased", + "bert-large-uncased-whole-word-masking", + "bert-large-cased-whole-word-masking", + "bert-large-uncased-whole-word-masking-finetuned-squad", + "bert-large-cased-whole-word-masking-finetuned-squad", + "bert-base-cased-finetuned-mrpc", + "bert-base-german-dbmdz-cased", + "bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + "wietsedv/bert-base-dutch-cased", + # See all BERT models at https://huggingface.co/models?filter=bert +] + +class BertEmbeddings(Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a 
cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
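+        # (the additive mask precomputed in BertModel.forward via get_extended_attention_mask
+        # holds large negative values at masked positions, so the softmax below drives
+        # their attention weights toward zero)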
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(Module): + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + base_model_prefix = "bert" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + prediction_logits: "torch.FloatTensor" = None + seq_relationship_logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+ Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. 
To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
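A minimal usage sketch for this forward pass, assuming the `bert-base-uncased` weights are available and that `fastNLP.transformers.torch.models.bert` re-exports these classes (otherwise import them from `modeling_bert` / `tokenization_bert` directly):

    >>> from fastNLP.transformers.torch.models.bert import BertTokenizer, BertModel
    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    >>> model = BertModel.from_pretrained('bert-base-uncased')
    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> outputs.last_hidden_state.shape    # (batch_size, sequence_length, hidden_size)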
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. 
""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> config.is_decoder = True + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. 
""", + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + """ + + if "next_sentence_label" in kwargs: + logger.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fastNLP/transformers/torch/models/bert/tokenization_bert.py b/fastNLP/transformers/torch/models/bert/tokenization_bert.py new file mode 
100644 index 00000000..26edd70d --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/tokenization_bert.py @@ -0,0 +1,558 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Bert.""" + + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from fastNLP.transformers.torch.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from fastNLP.core.log import logger + +__all__ = [ + "BasicTokenizer", + "BertTokenizer", + "WordpieceTokenizer", +] + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt", + "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt", + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + 
"bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. 
It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
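+
+            For example, for a pair of sequences of lengths 2 and 3 (and ``already_has_special_tokens=False``),
+            the returned mask is ``[1, 0, 0, 1, 0, 0, 0, 1]``: ``[CLS]``, two sequence tokens, ``[SEP]``,
+            three sequence tokens, ``[SEP]``.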
+ """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. 
+ + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
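+        # For example, ord("中") == 0x4E2D falls inside the 0x4E00-0x9FFF block checked below, so this
+        # method returns True for it, while Hangul (e.g. U+AC00) and Katakana (e.g. U+30A2) code points
+        # fall outside every listed range and return False.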
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/fastNLP/transformers/torch/models/cpt/__init__.py b/fastNLP/transformers/torch/models/cpt/__init__.py new file mode 100644 index 00000000..58d9f918 --- /dev/null +++ b/fastNLP/transformers/torch/models/cpt/__init__.py @@ -0,0 +1,12 @@ +__all__ = [ + "CPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CPTForConditionalGeneration", + "CPTForSequenceClassification", + "CPTForMaskedLM", + "CPTForQuestionAnswering", + "CPTModel", + "CPTPretrainedModel", +] + +from .modeling_cpt import CPT_PRETRAINED_MODEL_ARCHIVE_LIST, CPTForConditionalGeneration, CPTForSequenceClassification, \ + CPTForMaskedLM, CPTForQuestionAnswering, CPTModel, CPTPretrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/cpt/modeling_cpt.py b/fastNLP/transformers/torch/models/cpt/modeling_cpt.py new file mode 100644 index 00000000..2910cc26 --- /dev/null +++ b/fastNLP/transformers/torch/models/cpt/modeling_cpt.py @@ -0,0 +1,1489 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CPT model. modified from transformers==4.4.1""" +import math +import random +from typing import Optional, Tuple + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import PreTrainedModel +from ..bart import BartConfig as CPTConfig +from ..bert import BertModel, BertConfig +from fastNLP.core.log import logger +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + import torch.nn.functional as F + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, LayerNorm, Module, Embedding +else: + from fastNLP.core.utils.dummy_class import( + DummyClass as Module, + DummyClass as Embedding + ) + +__all__ = [ + "CPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CPTForConditionalGeneration", + "CPTForSequenceClassification", + "CPTForMaskedLM", + "CPTForQuestionAnswering", + "CPTModel", + "CPTPretrainedModel", +] + +_CHECKPOINT_FOR_DOC = "fnlp/cpt-large" +_CONFIG_FOR_DOC = "CPTConfig" +_TOKENIZER_FOR_DOC = "CPTTokenizer" + + +CPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "fnlp/cpt-large", +] + + +def shift_tokens_right(input_ids: "torch.Tensor", pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: "torch.Size", dtype: "torch.dtype", past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: "torch.Tensor", dtype: "torch.dtype", tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
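+
+    For example, a padding mask ``[[1, 1, 0]]`` with ``dtype=torch.float32`` is expanded to shape
+    ``[1, 1, tgt_len, 3]``, where the two attended positions become ``0.0`` and the masked position is
+    filled with ``torch.finfo(torch.float32).min``.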
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + +def attention_mask_func(attention_scores, attention_mask): + return attention_scores + attention_mask + +def init_method(std): + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + +class CPTLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # CPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: "torch.Size", past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class CPTAttention(Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + + def _shape(self, tensor: "torch.Tensor", seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: "torch.Tensor", + key_value_states: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + output_attentions: bool = False, + ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + assert layer_head_mask.size() == ( + self.num_heads, + ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + # with mpu.get_cuda_rng_tracker().fork(): + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + +class CPTDecoderLayer(Module): + def __init__(self, config: CPTConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = CPTAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.encoder_attn = CPTAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = 
LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + encoder_hidden_states: Optional["torch.Tensor"] = None, + encoder_attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + encoder_layer_head_mask: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(config.encoder_attention_heads,)`. + encoder_layer_head_mask (:obj:`torch.FloatTensor`): mask for encoder attention heads in a given layer of + size `(config.encoder_attention_heads,)`. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=encoder_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = 
F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class CPTClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: "torch.Tensor"): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class CPTPretrainedModel(PreTrainedModel): + config_class = CPTConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + +CPT_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + Parameters: + config (:class:`~transformers.CPTConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +CPT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? 
<../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + CPT uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + If you want to change padding behavior, you should read :func:`modeling_cpt._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. 
+ decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + +class CPTDecoder(CPTPretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`CPTDecoderLayer` + Args: + config: CPTConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: CPTConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = CPTLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([CPTDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = LayerNorm(config.d_model) + + self.init_weights() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + encoder_head_mask=None, + 
past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + encoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention + on hidden heads. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. 
See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + # layer_outputs = mpu.checkpoint( + layer_outputs = torch.utils.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + encoder_head_mask[idx] if encoder_head_mask is not None else None, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + encoder_layer_head_mask=(encoder_head_mask[idx] if encoder_head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare CPT Model outputting raw hidden-states without any specific head on top.", + CPT_START_DOCSTRING, +) +class CPTModel(CPTPretrainedModel): + def __init__(self, config: CPTConfig): + super().__init__(config) + encoder_config = BertConfig( + vocab_size=config.vocab_size, + hidden_size=config.d_model, + num_hidden_layers=config.encoder_layers, + num_attention_heads=config.encoder_attention_heads, + intermediate_size=config.encoder_ffn_dim, + hidden_dropout_prob=config.activation_dropout, + attention_probs_dropout_prob=config.attention_dropout, + ) + config.vocab_size = encoder_config.vocab_size + self.encoder = BertModel(encoder_config, add_pooling_layer=False) + self.shared = self.encoder.get_input_embeddings() + self.decoder = CPTDecoder(config, self.shared) + self.num_decoder_layers = config.decoder_layers + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.set_input_embeddings(self.shared) + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + class _Encoder(torch.nn.Module): + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, *args, **kwargs): + kwargs['output_hidden_states'] = True + return self.encoder(*args, **kwargs) + return _Encoder(self.encoder) + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + 
attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + # different to other models, CPT automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + # mpu.reset_checkpointed_activations_memory_buffer() + use_cache = False + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=torch.ones_like(input_ids), + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and isinstance(encoder_outputs, (tuple, list)): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + if isinstance(encoder_outputs, (torch.Tensor)): + encoder_hidden_states = encoder_outputs + else: + encoder_hidden_states = encoder_outputs[1][-self.num_decoder_layers - 1] + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + encoder_head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state if isinstance(encoder_outputs, dict) else None, + encoder_hidden_states=encoder_outputs.hidden_states if isinstance(encoder_outputs, dict) else None, + encoder_attentions=encoder_outputs.attentions if isinstance(encoder_outputs, dict) else None, + ) + + +@add_start_docstrings( + "The CPT Model with a language modeling head. 
Can be used for summarization.", CPT_START_DOCSTRING +) +class CPTForConditionalGeneration(CPTPretrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + + def __init__(self, config): + super().__init__(config) + self.model = CPTModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> "nn.Embedding": + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. 
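A minimal generation sketch for the conditional-generation head described above. It is illustrative only: "path/to/cpt-checkpoint" is a placeholder, and it assumes the cpt and bert subpackages created by this patch re-export these classes (CPT ships with a BERT-style tokenizer).

from fastNLP.transformers.torch.models.bert import BertTokenizer
from fastNLP.transformers.torch.models.cpt import CPTForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained("path/to/cpt-checkpoint")   # placeholder path
model = CPTForConditionalGeneration.from_pretrained("path/to/cpt-checkpoint")

inputs = tokenizer("北京是中国的首都，也是一座历史悠久的城市。", return_tensors="pt")
# decoder_input_ids are built internally by shifting the target right when not provided
summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"],
                             num_beams=4, max_length=32)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))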
+ Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _expand_inputs_for_generation( + input_ids: "torch.LongTensor", + expand_size: int = 1, + is_encoder_decoder: bool = False, + attention_mask: "torch.LongTensor" = None, + encoder_outputs = None, + **model_kwargs, + ): + expanded_return_idx = ( + torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device) + ) + input_ids = input_ids.index_select(0, expanded_return_idx) + + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx) + + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) + + if is_encoder_decoder: + assert encoder_outputs is not None + device = encoder_outputs.last_hidden_state.device + encoder_outputs["hidden_states"] = tuple(h.index_select(0, expanded_return_idx.to(device)) \ + for h in encoder_outputs["hidden_states"]) + model_kwargs["encoder_outputs"] = encoder_outputs + return input_ids, model_kwargs + + def prepare_decoder_input_ids_from_labels(self, labels: "torch.Tensor"): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + CPT model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
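The cls_mode argument of the class below selects which representation feeds the classification head (1: encoder, 2: decoder, 3: both concatenated). A construction sketch with a placeholder checkpoint path:

from fastNLP.transformers.torch.models.cpt import CPTForSequenceClassification

# cls_mode=3 concatenates the encoder [CLS] state with the decoder end-of-sequence state
model = CPTForSequenceClassification.from_pretrained("path/to/cpt-checkpoint",
                                                     num_labels=2, cls_mode=3)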
+ """, + CPT_START_DOCSTRING, +) +class CPTForSequenceClassification(CPTPretrainedModel): + def __init__(self, config: CPTConfig, cls_mode=1, **kwargs): + super().__init__(config, **kwargs) + self.model = CPTModel(config) + cls_mode = getattr(config, 'cls_mode', cls_mode) + if cls_mode == 1: + logger.info('Encoder for classification.') + cls_dim = config.d_model + elif cls_mode == 2: + logger.info('Decoder for classification.') + cls_dim = config.d_model + elif cls_mode == 3: + logger.info('Both encoder & decoder for classification.') + cls_dim = config.d_model * 2 + else: + raise NotImplementedError + + self.cls_head = CPTClassificationHead( + cls_dim, + cls_dim, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.cls_head.dense) + self.model._init_weights(self.cls_head.out_proj) + self.cls_mode = cls_mode + config.cls_mode = cls_mode + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + enc_rep = enc_hidden_states[:, 0] + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + dec_rep = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + + if self.cls_mode == 1: + logits = self.cls_head(enc_rep) + elif self.cls_mode == 2: + logits = self.cls_head(dec_rep) + elif self.cls_mode == 3: + rep = torch.cat([enc_rep, dec_rep], dim=-1) + logits = self.cls_head(rep) + else: + raise NotImplementedError + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CPT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + CPT_START_DOCSTRING, +) +class CPTForQuestionAnswering(CPTPretrainedModel): + def __init__(self, config: CPTConfig, cls_mode=1, **kwargs): + super().__init__(config, **kwargs) + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = CPTModel(config) + + cls_mode = getattr(config, 'cls_mode', cls_mode) + if cls_mode == 1: + logger.info('Encoder for classification.') + cls_dim = config.d_model + elif cls_mode == 2: + logger.info('Decoder for classification.') + cls_dim = config.d_model + elif cls_mode == 3: + logger.info('Both encoder & decoder for classification.') + cls_dim = config.d_model * 2 + else: + raise NotImplementedError + + self.qa_outputs = nn.Linear(cls_dim, config.num_labels) + self.model._init_weights(self.qa_outputs) + + self.cls_mode = cls_mode + config.cls_mode = cls_mode + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + + if self.cls_mode == 1: + logits = self.qa_outputs(enc_hidden_states) + elif self.cls_mode == 2: + logits = self.qa_outputs(hidden_states) + elif self.cls_mode == 3: + rep = torch.cat([enc_hidden_states, hidden_states], dim=-1) + logits = self.qa_outputs(rep) + else: + raise NotImplementedError + + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class CPTForMaskedLM(CPTPretrainedModel): + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + def __init__(self, config, **kwargs): + super().__init__(config) + self.model = CPTModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def get_output_embeddings(self): + return self.lm_head + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + 
use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + + dec_logits = self.lm_head(hidden_states) + self.final_logits_bias + enc_logits = self.lm_head(enc_hidden_states) + self.final_logits_bias + + if not return_dict: + logits = (enc_logits, dec_logits) + output = (logits,) + outputs[1:] + return output + + return Seq2SeqLMOutput( + loss=None, + logits=(enc_logits, dec_logits), + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/gpt2/__init__.py b/fastNLP/transformers/torch/models/gpt2/__init__.py new file mode 100644 index 00000000..70f24bfa --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/__init__.py @@ -0,0 +1,19 @@ +__all__ = [ + "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPT2Config", + + "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPT2DoubleHeadsModel", + "GPT2ForSequenceClassification", + "GPT2ForTokenClassification", + "GPT2LMHeadModel", + "GPT2Model", + "GPT2PreTrainedModel", + + "GPT2Tokenizer", +] + +from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config +from .tokenization_gpt2 import GPT2Tokenizer +from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, \ + GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, GPT2PreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py b/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py new file mode 100644 index 00000000..c0794e5a --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
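As the gpt2 __init__.py above shows, the migrated GPT-2 classes are imported from the fastNLP namespace rather than from transformers, for example:

from fastNLP.transformers.torch.models.gpt2 import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel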
+""" OpenAI GPT-2 configuration """ + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig + +__all__ = [ + "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPT2Config", +] + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", +} + + +class GPT2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a + :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the GPT-2 `small `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or + :class:`~transformers.TFGPT2Model`. + n_positions (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (:obj:`int`, `optional`, defaults to None): + Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd + activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`): + Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`int`, `optional`, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Has to be one of the following options: + + - :obj:`"last"`: Take the last token hidden state (like XLNet). 
+ - :obj:`"first"`: Take the first token hidden state (like BERT). + - :obj:`"mean"`: Take the mean of all tokens hidden states. + - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - :obj:`"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Whether or not to add a projection after the vector extraction. + summary_activation (:obj:`str`, `optional`): + Argument used when doing sequence summary. Used in for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + + Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes. + summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + The dropout ratio to be used after the projection and activation. + scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ + Example:: + + >>> from transformers import GPT2Model, GPT2Config + + >>> # Initializing a GPT2 configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model from the configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "gpt2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + **kwargs + ): + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py b/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py new file mode 100644 index 00000000..27626f0d --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py @@ -0,0 +1,1393 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
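A small sketch of how the attribute_map in the configuration class above resolves the library-generic attribute names to the GPT-2 specific ones:

from fastNLP.transformers.torch.models.gpt2 import GPT2Config

config = GPT2Config(n_embd=256, n_layer=6, n_head=8)
# generic names map onto the GPT-2 specific attributes via `attribute_map`
assert config.hidden_size == config.n_embd == 256
assert config.num_hidden_layers == config.n_layer == 6
assert config.num_attention_heads == config.n_head == 8
assert config.max_position_embeddings == config.n_positions == 1024   # default value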
+"""PyTorch OpenAI GPT-2 model.""" + +from dataclasses import dataclass +from typing import Optional, Tuple + +from .configuration_gpt2 import GPT2Config +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + Conv1D, + PreTrainedModel, + SequenceSummary, + find_pruneable_heads_and_indices, + prune_conv1d_layer, +) +from fastNLP.transformers.torch.utils.model_parallel_utils import assert_device_map, get_device_map + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +__all__ = [ + "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPT2DoubleHeadsModel", + "GPT2ForSequenceClassification", + "GPT2ForTokenClassification", + "GPT2LMHeadModel", + "GPT2Model", + "GPT2PreTrainedModel", +] + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +_CHECKPOINT_FOR_DOC = "gpt2" +_CONFIG_FOR_DOC = "GPT2Config" +_TOKENIZER_FOR_DOC = "GPT2Tokenizer" + +GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "gpt2", + "gpt2-medium", + "gpt2-large", + "gpt2-xl", + "distilgpt2", + # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 +] + +class GPT2Attention(Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e4)) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / (float(value.size(-1)) ** 0.5) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(*new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
+ ) + + query = self.q_attn(hidden_states) + key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + attention_mask = encoder_attention_mask + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if layer_past is not None: + past_key, past_value = layer_past + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +class GPT2MLP(Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GPT2Block(Module): + def __init__(self, config): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPT2Attention(config) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = GPT2Attention(config, is_cross_attention=True) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = GPT2MLP(inner_dim, config) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attn_output = 
cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + +class GPT2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPT2Config + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GPT2Model): + module.gradient_checkpointing = value + + +@dataclass +class GPT2DoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided): + Language modeling loss. + mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided): + Multiple choice classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of length :obj:`config.n_layers`, containing tuples of tensors of shape :obj:`(batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + GPT2Attentions weights after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + mc_loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + mc_logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +GPT2_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): + :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else + ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be + passed as ``input_ids``. + + Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which + have their past given to this model should not be passed as ``input_ids`` as they have already been + computed. + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. 
+ + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + + If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see + :obj:`past_key_values`). + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (:obj:`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the + following number of attention modules: + + - gpt2: 12 + - gpt2-medium: 24 + - gpt2-large: 36 + - gpt2-xl: 48 + + Example:: + + # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: + model = GPT2LMHeadModel.from_pretrained('gpt2-xl') + device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8], + + 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], + 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]} + model.parallelize(device_map) +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. 
+ + Example:: + + # On a 4 GPU machine with gpt2-large: + model = GPT2LMHeadModel.from_pretrained('gpt2-large') + device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7], + + 1: [8, 9, 10, 11, 12, 13, 14, 15], + 2: [16, 17, 18, 19, 20, 21, 22, 23], + 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]} + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() +""" + + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, +) +class GPT2Model(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = ["attn.masked_bias"] + + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([GPT2Block(config) for _ in range(config.num_hidden_layers)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.h)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + self.wte = self.wte.to(self.first_device) + self.wpe = self.wpe.to(self.first_device) + # Load onto devices + for k, v in self.device_map.items(): + for block in v: + cuda_device = "cuda:" + str(k) + self.h[block] = self.h[block].to(cuda_device) + # ln_f to last + self.ln_f = self.ln_f.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + self.wte = self.wte.to("cpu") + self.wpe = self.wpe.to("cpu") + for index in range(len(self.h)): + self.h[index] = self.h[index].to("cpu") + self.ln_f = self.ln_f.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + if position_ids is not None: + position_ids = position_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) + + # GPT2Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
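+ # For example, a padding mask of [1, 1, 0] becomes [0.0, 0.0, -10000.0] below, so the
+ # padded position contributes (almost) nothing after the softmax over attention scores.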
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * -10000.0 + + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure layer_past is on same device as hidden_states (might not be correct) + if layer_past is not None: + layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, use_cache, output_attentions) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + None, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(*output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). 
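A minimal greedy-generation sketch for this head, using the "gpt2" checkpoint listed in the archive map above:

from fastNLP.transformers.torch.models.gpt2 import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

input_ids = tokenizer("Hello, my dog is", return_tensors="pt")["input_ids"]
generated = model.generate(input_ids, max_length=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))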
+ """, + GPT2_START_DOCSTRING, +) +class GPT2LMHeadModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to + ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple["torch.Tensor"]], beam_idx: "torch.Tensor") -> Tuple[Tuple["torch.Tensor"]]: + """ + This function is used to re-order the :obj:`past_key_values` cache if + :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ +The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for +RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the +input embeddings, the classification head takes as input the input of a specified classification token index in the +input sequence). 
+""", + GPT2_START_DOCSTRING, +) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + config.num_labels = 1 + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = SequenceSummary(config) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.multiple_choice_head = self.multiple_choice_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + labels=None, + mc_labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input): + Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - + 1[``. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size - 1]`` All labels set to + ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size - 1]`` + mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see + `input_ids` above) + + Return: + + Example:: + + >>> import torch + >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + + >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2') + + >>> # Add a [CLS] to the vocabulary (we should train it also!) + >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + + >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + + >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + >>> encoded_choices = [tokenizer.encode(s) for s in choices] + >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) + >>> lm_logits = outputs.logits + >>> mc_logits = outputs.mc_logits + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + mc_loss = None + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) + lm_loss = None + if labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_loss is not None: + output = (mc_loss,) + output + return ((lm_loss,) + output) if lm_loss is not None else output + + return GPT2DoubleHeadsModelOutput( + loss=lm_loss, + mc_loss=mc_loss, + logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple["torch.Tensor"]], beam_idx: "torch.Tensor") -> Tuple[Tuple["torch.Tensor"]]: + """ + This function is used to re-order the :obj:`past_key_values` cache if + 
:meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a sequence classification head on top (linear layer). + + :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as + other causal models (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each + row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot + guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take + the last value in each row of the batch). + """, + GPT2_START_DOCSTRING, +) +class GPT2ForSequenceClassification(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="microsoft/DialogRPT-updown", + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + assert ( + self.config.pad_token_id is not None or batch_size == 1 + ), "Cannot handle batch sizes > 1 if no padding token is defined." 
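The last-token pooling described in the GPT2ForSequenceClassification docstring above can be illustrated in isolation. The snippet below is a minimal, self-contained sketch with toy tensors and an assumed `pad_token_id` of 0 (none of it is part of the vendored file); it mirrors the selection of the last non-padding position that the forward pass performs.

    import torch

    # Toy setup: batch of 3 sequences, seq_len 5, 4 classes; pad_token_id assumed to be 0 here.
    pad_token_id = 0
    input_ids = torch.tensor([
        [11, 12, 13, 0, 0],    # 3 real tokens
        [21, 22, 23, 24, 25],  # no padding
        [31, 0, 0, 0, 0],      # 1 real token
    ])
    logits = torch.randn(3, 5, 4)  # per-token classification scores

    # Index of the last non-padding token in each row -> tensor([2, 4, 0])
    sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1

    # Pool one logit vector per sequence at that position, shape (3, 4).
    pooled_logits = logits[range(input_ids.size(0)), sequence_lengths]
    print(sequence_lengths.tolist(), pooled_logits.shape)

When no `pad_token_id` is defined, the same indexing degenerates to taking the final position of every row, which is why the code below falls back to `sequence_lengths = -1`.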
+ if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[range(batch_size), sequence_lengths] + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + GPT2_START_DOCSTRING, +) +class GPT2ForTokenClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = GPT2Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="microsoft/DialogRPT-updown", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py b/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py new file mode 100644 index 00000000..94932da0 --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for OpenAI GPT.""" + +import json +import os +from functools import lru_cache +from typing import TYPE_CHECKING, List, Optional, Tuple + +import regex as re + +from fastNLP.transformers.torch.tokenization_utils import AddedToken, PreTrainedTokenizer +# if TYPE_CHECKING: +# from transformers.pipelines.conversational import Conversation +from fastNLP.core.log import logger + +__all__ = [ + "GPT2Tokenizer", +] + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json", + }, + "merges_file": { + "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "gpt2": 1024, + "gpt2-medium": 1024, + "gpt2-large": 1024, + "gpt2-xl": 1024, + "distilgpt2": 1024, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(PreTrainedTokenizer): + """ + Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import GPT2Tokenizer + >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + >>> tokenizer("Hello world")['input_ids'] + [15496, 995] + >>> tokenizer(" Hello world")['input_ids'] + [18435, 995] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. 
note:: + + When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first + one). + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The beginning of sequence token. + eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The end of sequence token. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (GPT2 tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + 
try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" 
+ ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + # def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: + # input_ids = [] + # for is_user, text in conversation.iter_texts(): + # input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) + # if len(input_ids) > self.model_max_length: + # input_ids = input_ids[-self.model_max_length :] + # return input_ids diff --git a/fastNLP/transformers/torch/models/roberta/__init__.py b/fastNLP/transformers/torch/models/roberta/__init__.py new file mode 100644 index 00000000..582ea614 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/__init__.py @@ -0,0 +1,21 @@ +__all__ = [ + "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "RobertaConfig", + + "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "RobertaForCausalLM", + "RobertaForMaskedLM", + "RobertaForMultipleChoice", + "RobertaForQuestionAnswering", + "RobertaForSequenceClassification", + "RobertaForTokenClassification", + "RobertaModel", + "RobertaPreTrainedModel", + + "RobertaTokenizer", +] + +from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig +from .tokenization_roberta import RobertaTokenizer +from .modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, RobertaForCausalLM, RobertaForMaskedLM, RobertaForMultipleChoice, \ + RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, RobertaPreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/roberta/configuration_roberta.py b/fastNLP/transformers/torch/models/roberta/configuration_roberta.py new file mode 100644 index 00000000..9a514be1 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/configuration_roberta.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
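A rough usage sketch for the byte-level BPE tokenizer defined above, assuming the module path created by this patch and the usual `PreTrainedTokenizer` interface from the vendored base class; the vocab/merges paths are placeholders, and the sample outputs are only indicative:

    from fastNLP.transformers.torch.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

    # Placeholder paths; point these at a downloaded gpt2 vocab.json / merges.txt.
    tokenizer = GPT2Tokenizer(vocab_file="path/to/vocab.json", merges_file="path/to/merges.txt")

    tokens = tokenizer.tokenize("Hello world")           # byte-level BPE tokens, e.g. ['Hello', 'Ġworld']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    text = tokenizer.convert_tokens_to_string(tokens)    # round-trips back to "Hello world"
    print(tokens, ids, text)

    # With add_prefix_space=True the first word is treated like any other word,
    # i.e. it also receives the leading-space marker during tokenization.
    with_space = GPT2Tokenizer(
        vocab_file="path/to/vocab.json", merges_file="path/to/merges.txt", add_prefix_space=True
    )
    print(with_space.tokenize("Hello world"))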
+""" RoBERTa configuration """ +from ..bert.configuration_bert import BertConfig +from fastNLP.core.log import logger + +__all__ = [ + "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "RobertaConfig", +] + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", +} + + +class RobertaConfig(BertConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a + :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified + arguments, defining the model architecture. + + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the + same defaults. Please check the parent class for more information. + + Examples:: + + >>> from transformers import RobertaConfig, RobertaModel + + >>> # Initializing a RoBERTa configuration + >>> configuration = RobertaConfig() + + >>> # Initializing a model from the configuration + >>> model = RobertaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "roberta" + + def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): + """Constructs RobertaConfig.""" + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/fastNLP/transformers/torch/models/roberta/modeling_roberta.py b/fastNLP/transformers/torch/models/roberta/modeling_roberta.py new file mode 100644 index 00000000..4e914214 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/modeling_roberta.py @@ -0,0 +1,1584 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RoBERTa model. 
""" + +import math + +from packaging import version + +from fastNLP.transformers.torch.activations import ACT2FN, gelu +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .configuration_roberta import RobertaConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +__all__ = [ + "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "RobertaForCausalLM", + "RobertaForMaskedLM", + "RobertaForMultipleChoice", + "RobertaForQuestionAnswering", + "RobertaForSequenceClassification", + "RobertaForTokenClassification", + "RobertaModel", + "RobertaPreTrainedModel", +] + +_CHECKPOINT_FOR_DOC = "roberta-base" +_CONFIG_FOR_DOC = "RobertaConfig" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + +ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "roberta-base", + "roberta-large", + "roberta-large-mnli", + "distilroberta-base", + "roberta-base-openai-detector", + "roberta-large-openai-detector", + # See all RoBERTa models at https://huggingface.co/models?filter=roberta +] + + +class RobertaEmbeddings(Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta +class RobertaSelfAttention(Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
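The additive-mask convention used in this attention implementation (and in the GPT-2 code earlier, where disallowed positions are set to a large negative value before the softmax) is easy to misread. Below is a small illustrative sketch of the core score computation with toy shapes; it is not part of the vendored file.

    import math
    import torch

    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    query = torch.randn(batch, heads, seq_len, head_dim)
    key = torch.randn(batch, heads, seq_len, head_dim)
    value = torch.randn(batch, heads, seq_len, head_dim)

    # 1 = attend, 0 = padding; the second sequence has two padded positions.
    padding_mask = torch.tensor([[1, 1, 1, 1, 1],
                                 [1, 1, 1, 0, 0]], dtype=torch.float)
    # Broadcastable additive mask: 0 where attention is allowed, -10000 where it is not.
    additive_mask = (1.0 - padding_mask)[:, None, None, :] * -10000.0

    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_dim)
    scores = scores + additive_mask                      # masked positions become ~ -10000
    probs = torch.softmax(scores, dim=-1)                # ~0 probability on padded keys
    context = torch.matmul(probs, value)                 # (batch, heads, seq_len, head_dim)
    print(probs[1, 0, 0])                                # last two entries are ~0

Adding a large negative number rather than multiplying by zero keeps every row of the softmax well defined while driving the masked probabilities to (numerically) zero.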
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class RobertaSelfOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +class RobertaAttention(Module): + def __init__(self, config): + super().__init__() + self.self = RobertaSelfAttention(config) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class RobertaIntermediate(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class RobertaOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + 
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta +class RobertaLayer(Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = RobertaAttention(config) + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta +class 
RobertaEncoder(Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class RobertaPooler(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class RobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = RobertaConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RobertaEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! + self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] + + +ROBERTA_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. 
Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, +) +class RobertaModel(RobertaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + + .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RobertaEmbeddings(config) + self.encoder = RobertaEncoder(config) + + self.pooler = RobertaPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
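+
+        A minimal encoder-only usage sketch (illustrative, not part of the upstream docstring; it assumes the
+        ``roberta-base`` checkpoint is reachable and that this package re-exports the classes as upstream does)::
+
+            >>> from fastNLP.transformers.torch.models.roberta import RobertaModel, RobertaTokenizer
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> model = RobertaModel.from_pretrained("roberta-base")
+            >>> inputs = tokenizer("Hello world", return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> hidden = outputs.last_hidden_state    # (batch_size, sequence_length, hidden_size)
+            >>> pooled = outputs.pooler_output        # (batch_size, hidden_size); None if add_pooling_layer=False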
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
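+        # ``get_extended_attention_mask`` (from the ported ``modeling_utils``' ``ModuleUtilsMixin``) broadcasts the
+        # 2D/3D mask to ``[batch_size, 1, 1, to_seq_length]`` (or ``[batch_size, 1, from_seq_length, to_seq_length]``
+        # when a causal decoder mask is built) and maps ``1 -> 0.0`` and ``0 -> -10000.0`` so that it can simply be
+        # added to the raw attention scores before the softmax.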
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning. 
""", ROBERTA_START_DOCSTRING +) +class RobertaForCausalLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + self.init_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + Example:: + + >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig + >>> import torch + + >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + >>> config = RobertaConfig.from_pretrained("roberta-base") + >>> config.is_decoder = True + >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) +class RobertaForMaskedLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + self.init_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaLMHead(Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +@add_start_docstrings( + """ + RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on 
top of the + pooled output) e.g. for GLUE tasks. + """, + ROBERTA_START_DOCSTRING, +) +class RobertaForSequenceClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.classifier = RobertaClassificationHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
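+
+    The forward pass expects every tensor input in shape ``(batch_size, num_choices, sequence_length)``; each tensor
+    is flattened to ``(batch_size * num_choices, sequence_length)`` before the base model runs, and the pooled outputs
+    are projected to one score per choice. A usage sketch (illustrative, not part of the upstream docstring; it
+    assumes a reachable ``roberta-base`` checkpoint and that this module re-exports the classes as upstream does)::
+
+        >>> from fastNLP.transformers.torch.models.roberta import RobertaForMultipleChoice, RobertaTokenizer
+        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+        >>> model = RobertaForMultipleChoice.from_pretrained("roberta-base")
+        >>> prompt = "The capital of France is"
+        >>> choices = ["Paris.", "Rome."]
+        >>> enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
+        >>> inputs = {k: v.unsqueeze(0) for k, v in enc.items()}   # add the num_choices dimension
+        >>> logits = model(**inputs).logits                        # shape (batch_size=1, num_choices=2)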
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForMultipleChoice(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
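+
+    A classification logit is produced for every token position; when both ``labels`` and an ``attention_mask`` are
+    given, padding positions are mapped to the loss function's ``ignore_index`` so they do not contribute to the
+    cross-entropy loss. A minimal sketch of that masking step (illustrative, not part of the upstream docstring)::
+
+        >>> import torch
+        >>> logits = torch.randn(2, 6, 9)                      # (batch_size, seq_len, num_labels)
+        >>> labels = torch.randint(0, 9, (2, 6))
+        >>> mask = torch.tensor([[1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1]])
+        >>> loss_fct = torch.nn.CrossEntropyLoss()
+        >>> active_labels = torch.where(mask.view(-1) == 1, labels.view(-1), torch.tensor(loss_fct.ignore_index))
+        >>> loss = loss_fct(logits.view(-1, 9), active_labels)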
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForTokenClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ROBERTA_START_DOCSTRING, +) +class RobertaForQuestionAnswering(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
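+
+        Example (an illustrative sketch, not part of the upstream docstring; it assumes a reachable ``roberta-base``
+        checkpoint — a checkpoint that was not fine-tuned on QA will of course give meaningless spans)::
+
+            >>> from fastNLP.transformers.torch.models.roberta import RobertaForQuestionAnswering, RobertaTokenizer
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
+            >>> question, context = "Who developed RoBERTa?", "RoBERTa was developed by Facebook AI."
+            >>> inputs = tokenizer(question, context, return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> start = outputs.start_logits.argmax().item()   # most likely start index
+            >>> end = outputs.end_logits.argmax().item()       # most likely end index
+            >>> answer = tokenizer.decode(inputs["input_ids"][0, start:end + 1])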
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx diff --git a/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py b/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py new file mode 100644 index 00000000..c0c11e29 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for RoBERTa.""" + +from typing import List, Optional + +from fastNLP.transformers.torch.tokenization_utils import AddedToken +from ..gpt2.tokenization_gpt2 import GPT2Tokenizer +from fastNLP.core.log import logger + +__all__ = [ + "RobertaTokenizer", +] + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json", + }, + "merges_file": { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roberta-base": 512, + "roberta-large": 512, + "roberta-large-mnli": 512, + "distilroberta-base": 512, + "roberta-base-openai-detector": 512, + "roberta-large-openai-detector": 512, +} + + +class RobertaTokenizer(GPT2Tokenizer): + """ + Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import RobertaTokenizer + >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 328, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first + one). + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. 
note::
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to add an initial space to the input. This allows treating the leading word just like any
+            other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        **kwargs
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # Mask token behaves like a normal word, i.e.
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A RoBERTa sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. 
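+
+        Example (illustrative only, not part of the upstream docstring; the ids shown assume the standard
+        ``roberta-base`` vocabulary, where ``<s>`` is id 0 and ``</s>`` is id 2)::
+
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> ids_a, ids_b = [31414], [232]                              # two hypothetical single-token sequences
+            >>> tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)   # <s> A </s></s> B </s>
+            [0, 31414, 2, 2, 232, 2]
+            >>> tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
+            [0, 0, 0, 0, 0, 0]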
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) diff --git a/fastNLP/transformers/torch/tokenization_utils.py b/fastNLP/transformers/torch/tokenization_utils.py new file mode 100644 index 00000000..f0f57e39 --- /dev/null +++ b/fastNLP/transformers/torch/tokenization_utils.py @@ -0,0 +1,915 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see + tokenization_utils_fast.py +""" +import bisect +import itertools +import re +import unicodedata +from collections import OrderedDict +from typing import Any, Dict, List, Optional, Tuple, Union, overload + +from .file_utils import PaddingStrategy, TensorType, add_end_docstrings +from .tokenization_utils_base import ( + ENCODE_KWARGS_DOCSTRING, + ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, + INIT_TOKENIZER_DOCSTRING, + AddedToken, + BatchEncoding, + EncodedInput, + EncodedInputPair, + PreTokenizedInput, + PreTokenizedInputPair, + PreTrainedTokenizerBase, + TextInput, + TextInputPair, + TruncationStrategy, +) + +from fastNLP.core.log import logger + +# Slow tokenizers are saved in a vocabulary plus three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +class Trie: + """ + Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass + Loose reference https://en.wikipedia.org/wiki/Trie + """ + + def __init__(self): + self.data = {} + + def add(self, word: str): + """ + Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. + The special key `""` is used to represent termination. + + This function is idempotent, adding twice the same word will leave the trie unchanged + + Example:: + + >>> trie = Trie() + >>> trie.add("Hello 友達") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} + >>> trie.add("Hello") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} + """ + if not word: + # Prevent empty string + return + ref = self.data + for char in word: + ref[char] = char in ref and ref[char] or {} + ref = ref[char] + ref[""] = 1 + + def split(self, text: str) -> List[str]: + """ + Will look for the words added to the trie within `text`. Output is the original string splitted along the + boundaries of the words found. 
+ + This trie will match the longest possible word first ! + + Example:: + + >>> trie = Trie() + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS] This is a extra_id_100"] + >>> trie.add("[CLS]") + >>> trie.add("extra_id_1") + >>> trie.add("extra_id_100") + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS]", " This is a ", "extra_id_100"] + """ + # indexes are counted left of the chars index. + # "hello", index 0, is left of h, index 1 is between h and e. + # index 5 is right of the "o". + + # States are going to capture every possible start (indexes as above) + # as keys, and have as values, a pointer to the position in the trie + # where we're at. This is a partial match for now. + # This enables to keep track of multiple matches while we're iterating + # the string + # If the trie contains, "blowing", and "lower" and we encounter the + # string "blower", we need to split into ["b", "lower"]. + # This is where we need to keep track of multiple possible starts. + states = OrderedDict() + + # This will contain every indices where we need + # to cut. + # We force to cut at offset 0 and len(text) (added later) + offsets = [0] + + # This is used by the lookahead which needs to skip over + # some text where the full match exceeded the place in the initial + # for loop + skip = None + # Main loop, Giving this algorithm O(n) complexity + for current, current_char in enumerate(text): + if skip and current < skip: + # Prevents the lookahead for matching twice + # like extra_id_100 and id_100 + continue + + # This will track every state + # that stop matching, we need to stop tracking them. + # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then + # fail on "b", we need to remove 0 from the valid states. + to_remove = set() + # Whenever we found a match, we need to drop everything + # this is a greedy algorithm, it will match on the first found token + reset = False + + # In this case, we already have partial matches (But unfinished) + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + + # Lookahead to match longest first + # Important in case of extra_id_1 vs extra_id_100 + lookahead_index = current + end = current + next_char = text[lookahead_index] if lookahead_index < len(text) else None + while next_char in trie_pointer: + trie_pointer = trie_pointer[next_char] + lookahead_index += 1 + if "" in trie_pointer: + end = lookahead_index + skip = lookahead_index + + if lookahead_index == len(text): + # End of string + break + next_char = text[lookahead_index] + # End lookahead + + # Storing and resetting + offsets.append(start) + offsets.append(end) + reset = True + elif current_char in trie_pointer: + # The current character being looked at has a match within the trie + # update the pointer (it will be stored back into states later). + trie_pointer = trie_pointer[current_char] + + # Storing back the new pointer into the states. + # Partial matches got longer by one. + states[start] = trie_pointer + else: + # The new character has not match in the trie, we need + # to stop keeping track of this partial match. + # We can't do it directly within the loop because of how + # python iteration works + to_remove.add(start) + + # Either clearing the full start (we found a real match) + # Or clearing only the partial matches that didn't work. 
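+            # (`reset` is only True when the greedy match above already recorded a cut, in which case every
+            # in-flight partial match is discarded; otherwise only the partial matches that died on the
+            # current character are dropped.)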
+ if reset: + states = {} + else: + for start in to_remove: + del states[start] + + # If this character is a starting character within the trie + # start keeping track of this partial match. + if current_char in self.data: + states[current] = self.data[current_char] + + # We have a cut at the end with states. + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + end = len(text) + offsets.append(start) + offsets.append(end) + # Longest cut is always the one with lower start so the first + # item so we need to break. + break + + # We have all the offsets now, we just need to do the actual splitting. + # We need to eventually add the first part of the string and the eventual + # last part. + offsets.append(len(text)) + tokens = [] + start = 0 + for end in offsets: + if start == end: + # This might happen if there's a match at index 0 + # we're also preventing zero-width cuts in case of two + # consecutive matches + continue + tokens.append(text[start:end]) + start = end + + return tokens + + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str): + """ + Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted. + """ + insertion_idx = bisect.bisect_left(token_list, new_token) + # Checks if new_token is already in the ordered token_list + if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token: + # new_token is in token_list, don't add + return + else: + token_list.insert(insertion_idx, new_token) + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizer(PreTrainedTokenizerBase): + """ + Base class for all slow tokenizers. 
+ + Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`. + + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading + pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Added tokens - We store this for both slow and fast tokenizers + # until the serialization of Fast tokenizers is updated + self.added_tokens_encoder: Dict[str, int] = {} + self.added_tokens_decoder: Dict[int, str] = {} + self.unique_no_split_tokens: List[str] = [] + self.tokens_trie = Trie() + + self._decode_use_source_tokenizer = False + + @property + def is_fast(self) -> bool: + return False + + @property + def vocab_size(self) -> int: + """ + :obj:`int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError + + def get_added_vocab(self) -> Dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. + + Returns: + :obj:`Dict[str, int]`: The added tokens. + """ + return self.added_tokens_encoder + + def __len__(self): + """ + Size of the full vocabulary with the added tokens. + """ + return self.vocab_size + len(self.added_tokens_encoder) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + Args: + new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`): + Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by + checking if the tokenizer assign the index of the ``unk_token`` to them). + special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the tokens should be added as special tokens. + + Returns: + :obj:`int`: The number of tokens actually added to the vocabulary. + + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
+ model.resize_token_embeddings(len(tokenizer)) + """ + new_tokens = [str(tok) for tok in new_tokens] + + tokens_to_add = [] + for token in new_tokens: + if not isinstance(token, str): + raise TypeError(f"Token {token} is not a string but a {type(token)}.") + if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in tokens_to_add + ): + tokens_to_add.append(token) + if self.verbose: + logger.info(f"Adding {token} to the vocabulary") + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.added_tokens_decoder.update(added_tok_decoder) + + # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) + if special_tokens: + if len(new_tokens) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) + else: + # Or on the newly added tokens + if len(tokens_to_add) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) + self._create_trie(self.unique_no_split_tokens) + + return len(tokens_to_add) + + def _create_trie(self, unique_no_split_tokens): + trie = Trie() + for token in unique_no_split_tokens: + if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens: + trie.add(token.lower()) + else: + trie.add(token) + self.tokens_trie = trie + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + .. note:: + This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not + put this inside your training loop. + + Args: + pair (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. + + Returns: + :obj:`int`: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def tokenize(self, text: TextInput, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, using the tokenizer. + + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). Takes care of added tokens. + + Args: + text (:obj:`str`): + The sequence to be encoded. + **kwargs (additional keyword arguments): + Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method. + + Returns: + :obj:`List[str]`: The list of tokens. + """ + # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors + all_special_tokens_extended = dict( + (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) + ) + + text, kwargs = self.prepare_for_tokenization(text, **kwargs) + + if kwargs: + logger.warning(f"Keyword arguments {kwargs} not recognized.") + + # TODO: should this be in the base class? 
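+ # Lowercase the raw text while leaving the no-split / special tokens untouched,
+ # e.g. for an uncased BERT tokenizer "[CLS] Hello" becomes "[CLS] hello".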
+ if hasattr(self, "do_lower_case") and self.do_lower_case: + # convert non-special tokens to lowercase + escaped_special_toks = [ + re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) + ] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) + + no_split_token = set(self.unique_no_split_tokens) + tokens = self.tokens_trie.split(text) + # ["This is something", "", " else"] + for i, token in enumerate(tokens): + if token in no_split_token: + tok_extended = all_special_tokens_extended.get(token, None) + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + if isinstance(tok_extended, AddedToken): + if tok_extended.rstrip and right: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + tokens[i + 1] = right.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and left: + tokens[i - 1] = left.rstrip() # Opposite here + else: + # We strip left and right by default + if right: + tokens[i + 1] = right.lstrip() + if left: + tokens[i - 1] = left.rstrip() + # ["This is something", "", "else"] + tokenized_text = [] + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + """ + Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + vocabulary. + + Args: + tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s). + + Returns: + :obj:`int` or :obj:`List[int]`: The token id or list of token ids. 
+ """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + if is_split_into_words: + raise ValueError( + f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`." + ) + else: + raise ValueError( + f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if not isinstance(ids_or_pair_ids, (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + else: + ids, pair_ids = ids_or_pair_ids + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + batch_outputs = self._batch_prepare_for_model( + input_ids, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + + batch_outputs = {} + for first_ids, second_ids in batch_ids_pairs: + outputs = self.prepare_for_model( + first_ids, + second_ids, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + def prepare_for_tokenization( + self, text: str, is_split_into_words: bool = False, **kwargs + ) -> Tuple[str, Dict[str, Any]]: + """ + Performs any necessary transformations before tokenization. + + This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the + :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used. + + Args: + text (:obj:`str`): + The text to prepare. + is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + kwargs: + Keyword arguments to use for the tokenization. + + Returns: + :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. + """ + return (text, kwargs) + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids of the first sequence. + token_ids_1 (:obj:`List[int]`, `optional`): + List of ids of the second sequence. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + @overload + def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: + ... + + @overload + def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: + ... + + def convert_ids_to_tokens( + self, ids: Union[int, List[int]], skip_special_tokens: bool = False + ) -> Union[str, List[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + + Args: + ids (:obj:`int` or :obj:`List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + + Returns: + :obj:`str` or :obj:`List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index: int) -> str: + raise NotImplementedError + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return " ".join(tokens) + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + spaces_between_special_tokens: bool = True, + **kwargs + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + if spaces_between_special_tokens: + text = " ".join(sub_texts) + else: + text = "".join(sub_texts) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text diff --git a/fastNLP/transformers/torch/tokenization_utils_base.py b/fastNLP/transformers/torch/tokenization_utils_base.py new file mode 100644 index 00000000..aebf4bb6 --- /dev/null +++ b/fastNLP/transformers/torch/tokenization_utils_base.py @@ -0,0 +1,3351 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user +fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary +of output with special method for the Fast tokenizers) +""" + +import copy +import json +import os +import re +from collections import OrderedDict, UserDict +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union + +import numpy as np +from packaging import version + +import requests + +from . import __version__ +from .file_utils import ( + ExplicitEnum, + PaddingStrategy, + TensorType, + _is_numpy, + _is_torch, + _is_torch_device, + add_end_docstrings, + cached_path, + is_offline_mode, + is_remote_url, + is_tokenizers_available, + to_py_obj, +) + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +if is_tokenizers_available(): + from tokenizers import AddedToken + from tokenizers import Encoding as EncodingFast +else: + + @dataclass(frozen=True, eq=True) + class AddedToken: + """ + AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the + way it should behave. + """ + + content: str = field(default_factory=str) + single_word: bool = False + lstrip: bool = False + rstrip: bool = False + normalized: bool = True + + def __getstate__(self): + return self.__dict__ + + @dataclass + class EncodingFast: + """This is dummy class because without the `tokenizers` library we don't have these objects anyway""" + + pass + +VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input +LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + + +# Slow tokenizers used to be saved in three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +FULL_TOKENIZER_FILE = "tokenizer.json" + + +class TruncationStrategy(ExplicitEnum): + """ + Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for + tab-completion in an IDE. + """ + + ONLY_FIRST = "only_first" + ONLY_SECOND = "only_second" + LONGEST_FIRST = "longest_first" + DO_NOT_TRUNCATE = "do_not_truncate" + + +class CharSpan(NamedTuple): + """ + Character span in the original string. + + Args: + start (:obj:`int`): Index of the first character in the original string. + end (:obj:`int`): Index of the character following the last character in the original string. 
+ """ + + start: int + end: int + + +class TokenSpan(NamedTuple): + """ + Token span in an encoded string (list of tokens). + + Args: + start (:obj:`int`): Index of the first token in the span. + end (:obj:`int`): Index of the token following the last token in the span. + """ + + start: int + end: int + + +class BatchEncoding(UserDict): + """ + Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens, + attention_masks, etc). + + This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes + utility methods to map from word/character space to token space. + + Args: + data (:obj:`dict`): + Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids', + 'attention_mask', etc.). + encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`): + If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character + space to token space the :obj:`tokenizers.Encoding` instance or list of instance (for batches) hold this + information. + tensor_type (:obj:`Union[None, str, TensorType]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above). + n_sequences (:obj:`Optional[int]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__( + self, + data: Optional[Dict[str, Any]] = None, + encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, + tensor_type: Union[None, str, TensorType] = None, + prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, + ): + super().__init__(data) + + if isinstance(encoding, EncodingFast): + encoding = [encoding] + + self._encodings = encoding + + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + + @property + def n_sequences(self) -> Optional[int]: + """ + :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single + sentence) or :obj:`2` (a pair of sentences) + """ + return self._n_sequences + + @property + def is_fast(self) -> bool: + """ + :obj:`bool`: Indicate whether this :class:`~transformers.BatchEncoding` was generated from the result of a + :class:`~transformers.PreTrainedTokenizerFast` or not. + """ + return self._encodings is not None + + def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: + """ + If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask', + etc.). + + If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`. 
+ """ + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError( + "Indexing with integers (to access backend Encoding for a given batch index) " + "is not available when using Python based tokenizers" + ) + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data, "encodings": self._encodings} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + if "encodings" in state: + self._encodings = state["encodings"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + # After this point: + # Extended properties and methods only available for fast (Rust-based) tokenizers + # provided by HuggingFace tokenizers library. + + @property + def encodings(self) -> Optional[List[EncodingFast]]: + """ + :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns + :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer. + """ + return self._encodings + + def tokens(self, batch_index: int = 0) -> List[str]: + """ + Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to + integer indices) at a given batch index (only works for the output of a fast tokenizer). + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[str]`: The list of tokens at that index. + """ + if not self._encodings: + raise ValueError("tokens() is not available when using Python-based tokenizers") + return self._encodings[batch_index].tokens + + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - :obj:`None` for special tokens added around or between sequences, + - :obj:`0` for tokens corresponding to words in the first sequence, + - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens + added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their + corresponding sequence. + """ + if not self._encodings: + raise ValueError("sequence_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].sequence_ids + + def words(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). 
+ """ + if not self._encodings: + raise ValueError("words() is not available when using Python-based tokenizers") + logger.warn( + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("word_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].word_ids + + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. In the general use case, this method returns + :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Can be called as: + + - ``self.token_to_sequence(token_index)`` if batch size is 1 + - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. + + Can be called as: + + - ``self.token_to_word(token_index)`` if batch size is 1 + - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. 
+ token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_word() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_word(token_index) + + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: + """ + Get the encoded token span corresponding to a word in a sequence of the batch. + + Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with: + + - **start** -- Index of the first token. + - **end** -- Index of the token following the last token. + + Can be called as: + + - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1 + - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal + to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the word in the sequence. + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence. + Returns :obj:`None` if no tokens correspond to the word. + """ + + if not self._encodings: + raise ValueError("word_to_tokens() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if word_index < 0: + word_index = self._seq_len + word_index + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) + return TokenSpan(*span) if span is not None else None + + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: + """ + Get the character span corresponding to an encoded token in a sequence of the batch. + + Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with: + + - **start** -- Index of the first character in the original string associated to the token. + - **end** -- Index of the character following the last character in the original string associated to the + token. 
+ + Can be called as: + + - ``self.token_to_chars(token_index)`` if batch size is 1 + - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in + the sequence. + + Returns: + :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string. + """ + + if not self._encodings: + raise ValueError("token_to_chars() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) + + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: + """ + Get the index of the token in the encoded output comprising a character in the original string for a sequence + of the batch. + + Can be called as: + + - ``self.char_to_token(char_index)`` if batch size is 1 + - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + :obj:`int`: Index of the token. + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_token(char_index, sequence_index) + + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: + """ + Get the character span in the original string corresponding to given word in a sequence of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + + Can be called as: + + - ``self.word_to_chars(word_index)`` if batch size is 1 + - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. 
+ sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string. + CharSpan are NamedTuple with: + + - start: index of the first character associated to the token in the original string + - end: index of the character following the last character associated to the token in the original + string + """ + + if not self._encodings: + raise ValueError("word_to_chars() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) + + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: + """ + Get the word in the original string corresponding to a character in the original string of a sequence of the + batch. + + Can be called as: + + - ``self.char_to_word(char_index)`` if batch size is 1 + - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the character in the original string. + char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the + original string. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + :obj:`int` or :obj:`List[int]`: Index or indices of the associated encoded token(s). + """ + + if not self._encodings: + raise ValueError("char_to_word() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_word(char_index, sequence_index) + + def convert_to_tensors( + self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False + ): + """ + Convert the inner content to tensors. + + Args: + tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + The type of tensors to use. If :obj:`str`, should be one of the values of the enum + :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. + prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`): + Whether or not to add the batch dimension during the conversion. 
+ """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.PYTORCH: + as_tensor = torch.tensor + is_tensor = torch.is_tensor + else: + as_tensor = np.asarray + is_tensor = _is_numpy + # (mfuntowicz: This code is unreachable) + # else: + # raise ImportError( + # f"Unable to convert output to tensors format {tensor_type}" + # ) + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + if not is_tensor(value): + tensor = as_tensor(value) + + # Removing this for now in favor of controlling the shape with `prepend_batch_axis` + # # at-least2d + # if tensor.ndim > 2: + # tensor = tensor.squeeze(0) + # elif tensor.ndim < 2: + # tensor = tensor[None, :] + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return self + + # @torch_required + def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": + """ + Send all values to device by calling :obj:`v.to(device)` (PyTorch only). + + Args: + device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. + + Returns: + :class:`~transformers.BatchEncoding`: The same instance after modification. + """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") + return self + + +class SpecialTokensMixin: + """ + A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to + handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be + used to directly access these special tokens in a model-independent manner and allow to set and update the special + tokens. + + Args: + bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the beginning of a sentence. + eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the end of a sentence. + unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing an out-of-vocabulary token. + sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token separating two different sentences in the same input (used by BERT for instance). + pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. 
+ cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the class of the input (used by BERT for instance). + mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). + additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A tuple or a list of additional special tokens. + """ + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, verbose=True, **kwargs): + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + self.verbose = verbose + + # We directly set the hidden value to allow initialization with special tokens + # which are not yet in the vocabulary. Necessary for serialization/de-serialization + # TODO clean this up at some point (probably by switching to fast tokenizers) + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all( + isinstance(t, (str, AddedToken)) for t in value + ), "One of the tokens is not a string or an AddedToken" + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + + def sanitize_special_tokens(self) -> int: + """ + Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`, + :obj:`tokenizer.cls_token`, etc.) are in the vocabulary. + + Add the missing ones to the vocabulary if needed. + + Return: + :obj:`int`: The number of tokens added in the vocabulary during the operation. + """ + return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) + + def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: + """ + Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If + special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the + current vocabulary). + + .. Note:: + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of + the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method. + + Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways: + + - Special tokens are carefully handled by the tokenizer (they are never split). + - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This + makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (for instance + :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj`'[CLS]'` and XLM's one + is also registered to be :obj:`''`). 
+ + Args: + special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`): + Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, + ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer + assign the index of the ``unk_token`` to them). + + Returns: + :obj:`int`: Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" + + if self.verbose: + logger.info(f"Assigning {value} to the {key} key of the tokenizer") + setattr(self, key, value) + + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, (str, AddedToken)) for t in value + ), f"Tokens {value} for key {key} should all be str or AddedToken instances" + added_tokens += self.add_tokens(value, special_tokens=True) + else: + assert isinstance( + value, (str, AddedToken) + ), f"Token {value} for key {key} should be a str or an AddedToken instance" + added_tokens += self.add_tokens([value], special_tokens=True) + + return added_tokens + + def add_tokens( + self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False + ) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + .. Note:: + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of + the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method. + + Args: + new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`): + Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a + string token to let you personalize its behavior: whether this token should only match against a single + word, whether this token should strip all potential whitespaces on the left side, whether this token + should strip all potential whitespaces on the right side, etc. + special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Can be used to specify if the token is a special token. This mostly change the normalization behavior + (special tokens like CLS or [MASK] are usually not lower-cased for instance). + + See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library. + + Returns: + :obj:`int`: Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, (list, tuple)): + new_tokens = [new_tokens] + + return self._add_tokens(new_tokens, special_tokens=special_tokens) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + raise NotImplementedError + + @property + def bos_token(self) -> str: + """ + :obj:`str`: Beginning of sentence token. Log an error if used while not having been set. + """ + if self._bos_token is None and self.verbose: + logger.error("Using bos_token, but it is not set yet.") + return None + return str(self._bos_token) + + @property + def eos_token(self) -> str: + """ + :obj:`str`: End of sentence token. Log an error if used while not having been set. + """ + if self._eos_token is None and self.verbose: + logger.error("Using eos_token, but it is not set yet.") + return None + return str(self._eos_token) + + @property + def unk_token(self) -> str: + """ + :obj:`str`: Unknown token. Log an error if used while not having been set. + """ + if self._unk_token is None and self.verbose: + logger.error("Using unk_token, but it is not set yet.") + return None + return str(self._unk_token) + + @property + def sep_token(self) -> str: + """ + :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while + not having been set. + """ + if self._sep_token is None and self.verbose: + logger.error("Using sep_token, but it is not set yet.") + return None + return str(self._sep_token) + + @property + def pad_token(self) -> str: + """ + :obj:`str`: Padding token. Log an error if used while not having been set. + """ + if self._pad_token is None and self.verbose: + logger.error("Using pad_token, but it is not set yet.") + return None + return str(self._pad_token) + + @property + def cls_token(self) -> str: + """ + :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the + full depth of the model. Log an error if used while not having been set. + """ + if self._cls_token is None and self.verbose: + logger.error("Using cls_token, but it is not set yet.") + return None + return str(self._cls_token) + + @property + def mask_token(self) -> str: + """ + :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while + not having been set. + """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @property + def additional_special_tokens(self) -> List[str]: + """ + :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having + been set. 
+ """ + if self._additional_special_tokens is None and self.verbose: + logger.error("Using additional_special_tokens, but it is not set yet.") + return None + return [str(tok) for tok in self._additional_special_tokens] + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token + has not been set. + """ + if self._bos_token is None: + return None + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has + not been set. + """ + if self._eos_token is None: + return None + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been + set. + """ + if self._unk_token is None: + return None + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input + sequence. Returns :obj:`None` if the token has not been set. + """ + if self._sep_token is None: + return None + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been + set. + """ + if self._pad_token is None: + return None + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self) -> int: + """ + :obj:`int`: Id of the padding token type in the vocabulary. + """ + return self._pad_token_type_id + + @property + def cls_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input + sequence leveraging self-attention along the full depth of the model. + + Returns :obj:`None` if the token has not been set. + """ + if self._cls_token is None: + return None + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language + modeling. Returns :obj:`None` if the token has not been set. + """ + if self._mask_token is None: + return None + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self) -> List[int]: + """ + :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not + having been set. 
+ """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @bos_token_id.setter + def bos_token_id(self, value): + self._bos_token = self.convert_tokens_to_ids(value) + + @eos_token_id.setter + def eos_token_id(self, value): + self._eos_token = self.convert_tokens_to_ids(value) + + @unk_token_id.setter + def unk_token_id(self, value): + self._unk_token = self.convert_tokens_to_ids(value) + + @sep_token_id.setter + def sep_token_id(self, value): + self._sep_token = self.convert_tokens_to_ids(value) + + @pad_token_id.setter + def pad_token_id(self, value): + self._pad_token = self.convert_tokens_to_ids(value) + + @cls_token_id.setter + def cls_token_id(self, value): + self._cls_token = self.convert_tokens_to_ids(value) + + @mask_token_id.setter + def mask_token_id(self, value): + self._mask_token = self.convert_tokens_to_ids(value) + + @additional_special_tokens_ids.setter + def additional_special_tokens_ids(self, values): + self._additional_special_tokens = [self.convert_tokens_to_ids(value) for value in values] + + @property + def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: + """ + :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`, + :obj:`unk_token`, etc.) to their values (:obj:`''`, :obj:`''`, etc.). + + Convert potential tokens of :obj:`tokenizers.AddedToken` type to string. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = ( + type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value) + if isinstance(attr_value, (list, tuple)) + else str(attr_value) + ) + return set_attr + + @property + def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]: + """ + :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary + mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values + (:obj:`''`, :obj:`''`, etc.). + + Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely + how special tokens are tokenized. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self) -> List[str]: + """ + :obj:`List[str]`: All the special tokens (:obj:`''`, :obj:`''`, etc.) mapped to class attributes. + + Convert tokens of :obj:`tokenizers.AddedToken` type to string. + """ + all_toks = [str(s) for s in self.all_special_tokens_extended] + return all_toks + + @property + def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: + """ + :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`''`, :obj:`''`, etc.) + mapped to class attributes. + + Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely + how special tokens are tokenized. + """ + all_toks = [] + set_attr = self.special_tokens_map_extended + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(OrderedDict.fromkeys(all_toks)) + return all_toks + + @property + def all_special_ids(self) -> List[int]: + """ + :obj:`List[int]`: List the ids of the special tokens(:obj:`''`, :obj:`''`, etc.) mapped to class + attributes. 
+ """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + +ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + max_length (:obj:`int`, `optional`): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum + length is required by one of the truncation/padding parameters. If the model has no specific maximum + input length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (:obj:`int`, `optional`, defaults to 0): + If set to a number along with :obj:`max_length`, the overflowing tokens returned when + :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. 
+ pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. +""" + +ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + return_token_type_ids (:obj:`bool`, `optional`): + Whether to return token type IDs. If left to the default, will return the token type IDs according to + the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is + raised instead of returning overflowing tokens. + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return :obj:`(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from + :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise + :obj:`NotImplementedError`. + return_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return the lengths of the encoded inputs. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. + **kwargs: passed to the :obj:`self.tokenize()` method + + Return: + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + `What are input IDs? <../glossary.html#input-ids>`__ + + - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True` + or if `"token_type_ids"` is in :obj:`self.model_input_names`). + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`). + + `What are attention masks? <../glossary.html#attention-mask>`__ + + - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and + :obj:`return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and + :obj:`return_overflowing_tokens=True`). 
+ - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when :obj:`return_length=True`) +""" + +INIT_TOKENIZER_DOCSTRING = r""" + Class attributes (overridden by derived classes) + + - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of + each vocabulary file required by the model, and as associated values, the filename for saving the associated + file (string). + - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the + high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the + low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the + :obj:`url` to the associated pretrained vocabulary file. + - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the + :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence + inputs of this model, or :obj:`None` if the model has no maximum input size. + - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the + :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments + to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the + tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` + method. + - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model. + - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding + applied. Should be :obj:`'right'` or :obj:`'left'`. + + Args: + model_max_length (:obj:`int`, `optional`): + The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is + loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this + will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no + value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`). + padding_side: (:obj:`str`, `optional`): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + model_input_names (:obj:`List[string]`, `optional`): + The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or + :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name. + bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and + ``self.bos_token_id``. + eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and + ``self.eos_token_id``. + unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and + ``self.unk_token_id``. 
+ sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token separating two different sentences in the same input (used by BERT for instance). Will be + associated to ``self.sep_token`` and ``self.sep_token_id``. + pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and + ``self.pad_token_id``. + cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the class of the input (used by BERT for instance). Will be associated to + ``self.cls_token`` and ``self.cls_token_id``. + mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``. + additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the + tokenization process. Will be associated to ``self.additional_special_tokens`` and + ``self.additional_special_tokens_ids``. +""" + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizerBase(SpecialTokensMixin): + """ + Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`. + + Handles shared (mostly boiler plate) methods for those two classes. + """ + + vocab_files_names: Dict[str, str] = {} + pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} + pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} + max_model_input_sizes: Dict[str, Optional[int]] = {} + + # first name has to correspond to main model input name + # to make sure `tokenizer.pad(...)` works correctly + model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"] + padding_side: str = "right" + slow_tokenizer_class = None + + def __init__(self, **kwargs): + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = copy.deepcopy(kwargs) + self.name_or_path = kwargs.pop("name_or_path", "") + + # For backward compatibility we fallback to set model_max_length from max_len if provided + model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) + self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER + + # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + assert self.padding_side in [ + "right", + "left", + ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + self.deprecation_warnings = ( + {} + ) # Use to store when we have already noticed a deprecation warning (avoid overlogging). + + super().__init__(**kwargs) + + @property + def max_len_single_sentence(self) -> int: + """ + :obj:`int`: The maximum length of a sentence that can be fed to the model. 
+ """ + return self.model_max_length - self.num_special_tokens_to_add(pair=False) + + @property + def max_len_sentences_pair(self) -> int: + """ + :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model. + """ + return self.model_max_length - self.num_special_tokens_to_add(pair=True) + + @max_len_single_sentence.setter + def max_len_single_sentence(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_single_sentence'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: + if not self.deprecation_warnings.get("max_len_single_sentence", False): + logger.warning( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + self.deprecation_warnings["max_len_single_sentence"] = True + else: + raise ValueError( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + + @max_len_sentences_pair.setter + def max_len_sentences_pair(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_sentences_pair'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: + if not self.deprecation_warnings.get("max_len_sentences_pair", False): + logger.warning( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + self.deprecation_warnings["max_len_sentences_pair"] = True + else: + raise ValueError( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + + def __repr__(self) -> str: + return ( + f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', " + f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, " + f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})" + ) + + def get_vocab(self) -> Dict[str, int]: + """ + Returns the vocabulary as a dictionary of token to index. + + :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when + :obj:`token` is in the vocab. + + Returns: + :obj:`Dict[str, int]`: The vocabulary. + """ + raise NotImplementedError() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from + a predefined tokenizer. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + Can be either: + + - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a + user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved + using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained` + method, e.g., ``./my_model_directory/``. + - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary + file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g., + ``./my_model_directory/vocab.txt``. 
+ cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they + exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Attempt to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to only rely on local files and not to attempt to download any files. + revision (:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + subfolder (:obj:`str`, `optional`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + inputs (additional positional arguments, `optional`): + Will be passed along to the Tokenizer ``__init__`` method. + kwargs (additional keyword arguments, `optional`): + Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like + ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, + ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer + # Download vocabulary from huggingface.co and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from huggingface.co (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>') + # You should be sure '<unk>' is in the vocabulary when doing that.
+ # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead) + assert tokenizer.unk_token == '<unk>' + + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + vocab_files = {} + init_configuration = {} + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " + "supported for this tokenizer. Use a model identifier or the path to a directory instead." + ) + logger.warning( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " + "won't be possible anymore in v5. Use a model identifier or the path to a directory instead." + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + raise RuntimeError("At this point pretrained_model_name_or_path is either a directory or a model identifier name, " + "which is not supported in fastNLP now.") + + # Get files from url, cache, or disk depending on the case + resolved_vocab_files = {} + unresolved_files = [] + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + try: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + + except FileNotFoundError as error: + if local_files_only: + unresolved_files.append(file_id) + else: + raise error + + except requests.exceptions.HTTPError as err: + if "404 Client Error" in str(err): + logger.debug(err) + resolved_vocab_files[file_id] = None + else: + raise err + + if len(unresolved_files) > 0: + logger.info( + f"Can't load following files from cache: {unresolved_files} and cannot check if these " + "files are necessary for the tokenizer to operate." + ) + + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): + msg = ( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'. 
Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + for file_id, file_path in vocab_files.items(): + if file_id not in resolved_vocab_files: + continue + + if file_path == resolved_vocab_files[file_id]: + logger.info(f"loading file {file_path}") + else: + logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") + + return cls._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + use_auth_token=use_auth_token, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + use_auth_token=None, + **kwargs + ): + # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json + # file or if `from_slow` is set to True. + from_slow = kwargs.get("from_slow", False) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: + slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( + copy.deepcopy(resolved_vocab_files), + pretrained_model_name_or_path, + copy.deepcopy(init_configuration), + *init_inputs, + **(copy.deepcopy(kwargs)), + ) + else: + slow_tokenizer = None + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. + config_tokenizer_class = init_kwargs.get("tokenizer_class") + init_kwargs.pop("tokenizer_class", None) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + config_tokenizer_class = None + init_kwargs = init_configuration + + if config_tokenizer_class is None: + from .models.auto.configuration_auto import AutoConfig # tests_ignore + + # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. + try: + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=use_auth_token) + config_tokenizer_class = config.tokenizer_class + except (OSError, ValueError, KeyError): + # skip if an error occurred. + config = None + if config_tokenizer_class is None: + # Third attempt. If we have not yet found the original type of the tokenizer, + # we are loading we see if we can infer it from the type of the configuration file + from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES # tests_ignore + + if hasattr(config, "model_type"): + model_type = config.model_type + else: + # Fallback: use pattern matching on the string. 
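+ # The first key of TOKENIZER_MAPPING_NAMES found as a substring of the given name or path wins, + # e.g. a local directory called './my-bert-checkpoint' (an illustrative path) would select the BERT tokenizer class.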
+ model_type = None + for pattern in TOKENIZER_MAPPING_NAMES.keys(): + if pattern in str(pretrained_model_name_or_path): + model_type = pattern + break + + if model_type is not None: + config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( + model_type, (None, None) + ) + if config_tokenizer_class is None: + config_tokenizer_class = config_tokenizer_class_fast + + if config_tokenizer_class is not None: + if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""): + logger.warning( + "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. " + "It may result in unexpected tokenization. \n" + f"The tokenizer class you load from this checkpoint is '{config_tokenizer_class}'. \n" + f"The class this function is called from is '{cls.__name__}'." + ) + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Convert AddedTokens serialized as dict to class instances + def convert_added_tokens(obj: Union[AddedToken, Any]): + if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": + obj.pop("__type") + return AddedToken(**obj) + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v) for k, v in obj.items()} + return obj + + init_kwargs = convert_added_tokens(init_kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] + if model_max_length is not None and isinstance(model_max_length, (int, float)): + init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + + if slow_tokenizer is not None: + init_kwargs["__slow_tokenizer"] = slow_tokenizer + + init_kwargs["name_or_path"] = pretrained_model_name_or_path + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." 
+ ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + # Removed: Now done at the base class level + # tokenizer.init_inputs = init_inputs + # tokenizer.init_kwargs = init_kwargs + + # If there is a complementary special token map, load it + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + + continue + + if isinstance(value, dict): + value = AddedToken(**value) + elif isinstance(value, list): + value = [AddedToken(**token) if isinstance(token, dict) else token for token in value] + setattr(tokenizer, key, value) + + # Add supplementary tokens. + special_tokens = tokenizer.all_special_tokens + if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + + # Sort added tokens by index + added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) + + for token, index in added_tok_encoder_sorted: + if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index: + # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the + # index is the current length of the tokenizer (not in vocabulary) + raise ValueError( + f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " + f"{index}." + ) + elif not has_tokenizer_file and index != len(tokenizer): + # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the + # current length of the tokenizer. + raise ValueError( + f"Non-consecutive added token '{token}' found. " + f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." + ) + + # Safe to call on a tokenizer fast even if token already there. + tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) + + # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab + added_tokens = tokenizer.sanitize_special_tokens() + if added_tokens: + logger.warning( + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained." + ) + + return tokenizer + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + **kwargs, + ) -> Tuple[str]: + """ + Save the full tokenizer state. + + + This method make sure the full tokenizer can then be re-loaded using the + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.. + + .. Warning:: + This won't save modifications you may have applied to the tokenizer after the instantiation (for instance, + modifying :obj:`tokenizer.do_lower_case` after creation). + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved. + legacy_format (:obj:`bool`, `optional`): + Only applicable for a fast tokenizer. 
If unset (default), will save the tokenizer in the unified JSON + format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate + added_tokens files. + + If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible + with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to + be loaded in the corresponding "slow" tokenizer. + + If :obj:`True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exits, a + value error is raised. + filename_prefix: (:obj:`str`, `optional`): + A prefix to add to the names of the files saved by the tokenizer. + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. + + Returns: + A tuple of :obj:`str`: The files saved. + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + special_tokens_map_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE + ) + tokenizer_config_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE + ) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + # Sanitize AddedTokens + def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): + if isinstance(obj, AddedToken): + out = obj.__getstate__() + if add_type_field: + out["__type"] = "AddedToken" + return out + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} + return obj + + # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization + tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) + + # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained + tokenizer_class = self.__class__.__name__ + # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast` + if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast": + tokenizer_class = tokenizer_class[:-4] + tokenizer_config["tokenizer_class"] = tokenizer_class + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + logger.info(f"tokenizer config file saved in {tokenizer_config_file}") + + # Sanitize AddedTokens in special_tokens_map + write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(write_dict, ensure_ascii=False)) + logger.info(f"Special tokens file saved in {special_tokens_map_file}") + + file_names = 
(tokenizer_config_file, special_tokens_map_file) + + save_files = self._save_pretrained( + save_directory=save_directory, + file_names=file_names, + legacy_format=legacy_format, + filename_prefix=filename_prefix, + ) + + return save_files + + def _save_pretrained( + self, + save_directory: Union[str, os.PathLike], + file_names: Tuple[str], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """ + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. + + Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the + specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` + """ + if legacy_format is False: + raise ValueError( + "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format." + ) + + save_directory = str(save_directory) + + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + added_vocab = self.get_added_vocab() + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, ensure_ascii=False) + f.write(out_str) + logger.info(f"added tokens file saved in {added_tokens_file}") + + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + + return file_names + vocab_files + (added_tokens_file,) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary + added tokens). + + This method won't save the configuration and special token mappings of the tokenizer. Use + :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + filename_prefix (:obj:`str`, `optional`): + An optional prefix to add to the named of the saved files. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + raise NotImplementedError + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`. + + Args: + text (:obj:`str`): + The sequence to be encoded. + pair (:obj:`str`, `optional`): + A second sequence to be encoded with the first. + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add the special tokens associated with the corresponding model. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific encode method. See details in + :meth:`~transformers.PreTrainedTokenizerBase.__call__` + + Returns: + :obj:`List[str]`: The list of tokens. + """ + raise NotImplementedError + + @add_end_docstrings( + ENCODE_KWARGS_DOCSTRING, + """ + **kwargs: Passed along to the `.tokenize()` method. + """, + """ + Returns: + :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the + text. 
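+ + A minimal usage sketch (an illustrative example, assuming a BERT vocabulary saved locally under ``./test/saved_model/`` as in the ``from_pretrained`` examples above):: + + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + ids = tokenizer.encode('Hello world', add_special_tokens=True) + # ids is a plain List[int]; pass return_tensors='pt' to get a torch.Tensor instead + tokens = tokenizer.convert_ids_to_tokens(ids)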
+ """, + ) + def encode( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> List[int]: + """ + Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` + method). + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the ``tokenize`` method) or a list of integers (tokenized string ids using the + ``convert_tokens_to_ids`` method). + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + raise NotImplementedError + + def _get_padding_truncation_strategies( + self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs + ): + """ + Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy + and pad_to_max_length) and behaviors. + """ + old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") + old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) + + # Backward compatibility for previous behavior, maybe we should deprecate it: + # If you only set max_length, it activates truncation for max_length + if max_length is not None and padding is False and truncation is False: + if verbose: + if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False): + logger.warning( + "Truncation was not explicitly activated but `max_length` is provided a specific value, " + "please use `truncation=True` to explicitly truncate examples to max length. " + "Defaulting to 'longest_first' truncation strategy. " + "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " + "more precisely by providing a specific strategy to `truncation`." + ) + self.deprecation_warnings["Truncation-not-explicitly-activated"] = True + truncation = "longest_first" + + # Get padding strategy + if padding is False and old_pad_to_max_length: + if verbose: + logger.warn( + "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " + "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " + "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " + "maximal input size of the model (e.g. 
512 for Bert).", + FutureWarning, + ) + if max_length is None: + padding_strategy = PaddingStrategy.LONGEST + else: + padding_strategy = PaddingStrategy.MAX_LENGTH + elif padding is not False: + if padding is True: + if verbose: + if max_length is not None and (truncation is False or truncation == "do_not_truncate"): + logger.warn( + "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " + "To pad to max length, use `padding='max_length'`." + ) + if old_pad_to_max_length is not False: + logger.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.") + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is False and old_truncation_strategy != "do_not_truncate": + if verbose: + logger.warn( + "The `truncation_strategy` argument is deprecated and will be removed in a future version, " + "use `truncation=True` to truncate examples to a max length. You can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " + "maximal input size of the model (e.g. 512 for Bert). " + " If you have pairs of inputs, you can give a specific truncation strategy selected among " + "`truncation='only_first'` (will only truncate the first sentence in the pairs) " + "`truncation='only_second'` (will only truncate the second sentence in the pairs) " + "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", + FutureWarning, + ) + truncation_strategy = TruncationStrategy(old_truncation_strategy) + elif truncation is not False: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + elif isinstance(truncation, TruncationStrategy): + truncation_strategy = truncation + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False): + logger.warning( + "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no padding." + ) + self.deprecation_warnings["Asking-to-pad-to-max_length"] = True + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False): + logger.warning( + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no truncation." 
+ ) + self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." + ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + f"Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." + ) + + return padding_strategy, truncation_strategy, max_length, kwargs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + """ + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... 
list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if not _is_valid_text_input(text): + raise ValueError( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None and not _is_valid_text_input(text_pair): + raise ValueError( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if is_split_into_words: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) + + if is_batched: + if isinstance(text_pair, str): + raise TypeError( + "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`." + ) + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Tokenize and prepare for the model a sequence or a pair of sequences. + + .. 
warning:: + This method is deprecated, ``__call__`` should be used instead. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` + method). + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the ``tokenize`` method) or a list of integers (tokenized string ids using the + ``convert_tokens_to_ids`` method). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + 
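A minimal usage sketch of the dispatch described above, assuming the ported `BertTokenizer` (hypothetical import path) keeps the upstream interface:

    from fastNLP.transformers.torch.models.bert import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # __call__ resolves the padding/truncation strategies, then dispatches to
    # encode_plus (single example) or batch_encode_plus (list of examples).
    pair = tokenizer("A short premise.", "Its hypothesis.",
                     padding="max_length", truncation=True, max_length=16)
    batch = tokenizer(["first text", "a longer second text"], padding=True)
    # len(pair["input_ids"]) == 16; batch["input_ids"] holds two id lists padded to equal length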
""" + Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. + + .. warning:: + This method is deprecated, ``__call__`` should be used instead. + + Args: + batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): + Batch of sequences or pair of sequences to be encoded. This can be a list of + string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see + details in ``encode_plus``). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``, + ``self.pad_token_id`` and ``self.pad_token_type_id``) + + .. 
note:: + + If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with ``return_tensors``. In the + case of PyTorch tensors, you will lose the specific device of your tensors however. + + Args: + encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str, + List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str, + List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as + well as in a PyTorch Dataloader collate function. + + Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), + see the note above for the return type. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. 
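A minimal sketch of the behaviour documented above, assuming `tokenizer` is an instantiated subclass with a pad token and PyTorch available:

    features = [{"input_ids": [101, 7592, 102]},
                {"input_ids": [101, 7592, 2088, 999, 102]}]
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    # batch["input_ids"] is a 2 x 5 tensor; batch["attention_mask"] is 0 on the padded positions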
+ """ + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + # The model's main input name, usually `input_ids`, has be passed for padding + if self.model_input_names[0] not in encoded_inputs: + raise ValueError( + "You should supply an encoding or a list of encodings to this method " + f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + ) + + required_input = encoded_inputs[self.model_input_names[0]] + + if not required_input: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + index = 0 + while len(required_input[index]) == 0: + index += 1 + if index < len(required_input): + first_element = required_input[index][0] + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. + if not isinstance(first_element, (int, list, tuple)): + if _is_torch(first_element): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + f"Should be one of a python, numpy, pytorch or tensorflow object." + ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + # Convert padding_strategy in PaddingStrategy + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + required_input = encoded_inputs[self.model_input_names[0]] + if required_input and not isinstance(required_input[0], (list, tuple)): + encoded_inputs = self._pad( + encoded_inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others." 
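The list-of-dicts conversion above is what enables dynamic, per-batch padding; a hedged sketch, assuming PyTorch and an instantiated `tokenizer`:

    from torch.utils.data import DataLoader

    texts = ["a short sentence", "a noticeably longer second sentence for this batch"]
    encodings = [tokenizer(t, truncation=True) for t in texts]   # no padding yet
    loader = DataLoader(
        encodings,
        batch_size=2,
        collate_fn=lambda samples: tokenizer.pad(samples, padding=True, return_tensors="pt"),
    )
    padded = next(iter(loader))   # tensors padded only to the longest sequence of this batch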
+ + if padding_strategy == PaddingStrategy.LONGEST: + max_length = max(len(inputs) for inputs in required_input) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + outputs = self._pad( + inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create the token type IDs corresponding to the sequences passed. `What are token type IDs? + <../glossary.html#token-type-ids>`__ + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + token_ids_0 (:obj:`List[int]`): The first tokenized sequence. + token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence. + + Returns: + :obj:`List[int]`: The token type ids. + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + This implementation does not add special tokens and this method should be overridden in a subclass. + + Args: + token_ids_0 (:obj:`List[int]`): The first tokenized sequence. + token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence. + + Returns: + :obj:`List[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for `pair_ids` + different than `None` and `truncation_strategy = longest_first` or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. + + Args: + ids (:obj:`List[int]`): + Tokenized input ids of the first sequence. 
Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + pair_ids (:obj:`List[int]`, `optional`): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + 
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (:obj:`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + pair_ids (:obj:`List[int]`, `optional`): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + The strategy to follow for truncation. Can be: + + * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + stride (:obj:`int`, `optional`, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the + list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing_tokens if + a pair of sequences (or a batch of pairs) is provided. + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( + truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None + ): + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + else: + error_msg = ( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. 
" + ) + if truncation_strategy == TruncationStrategy.ONLY_FIRST: + error_msg = ( + error_msg + "Please select another truncation strategy than " + f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." + ) + logger.error(error_msg) + elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: + logger.warning( + f"Be aware, overflowing tokens are not returned for the setting you have chosen," + f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " + f"truncation strategy. So the returned list will always be empty even if some " + f"tokens have been removed." + ) + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input" + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_first'." + ) + + return (ids, pair_ids, overflowing_tokens) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input) + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + elif return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + return encoded_inputs + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a sequence of tokens in a single string. The most simple way to do it is ``" ".join(tokens)`` but we + often want to remove sub-word tokenization artifacts at the same time. + + Args: + tokens (:obj:`List[str]`): The token to join in a string. + + Returns: + :obj:`str`: The joined tokens. + """ + raise NotImplementedError + + def batch_decode( + self, + sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> List[str]: + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the ``__call__`` method. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific decode method. + + Returns: + :obj:`List[str]`: The list of decoded sentences. 
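A round trip through encode/decode, for illustration (the exact ids and special tokens depend on the concrete tokenizer):

    ids = tokenizer("hello world")["input_ids"]
    tokenizer.decode(ids)                                        # e.g. "[CLS] hello world [SEP]"
    tokenizer.decode(ids, skip_special_tokens=True)              # "hello world"
    tokenizer.batch_decode([ids, ids], skip_special_tokens=True) # ["hello world", "hello world"]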
+ """ + return [ + self.decode( + seq, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + for seq in sequences + ] + + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. + + Args: + token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the ``__call__`` method. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific decode method. + + Returns: + :obj:`str`: The decoded sentence. + """ + # Convert inputs to python lists + token_ids = to_py_obj(token_ids) + + return self._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + raise NotImplementedError + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids of the first sequence. + token_ids_1 (:obj:`List[int]`, `optional`): + List of ids of the second sequence. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + assert already_has_special_tokens and token_ids_1 is None, ( + "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " + "Please use a slow (full python) tokenizer to activate this argument." + "Or set `return_special_tokens_mask=True` when calling the encoding method " + "to get the special tokens mask in any tokenizer. " + ) + + all_special_ids = self.all_special_ids # cache the property + + special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] + + return special_tokens_mask + + @staticmethod + def clean_up_tokenization(out_string: str) -> str: + """ + Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. + + Args: + out_string (:obj:`str`): The text to clean up. + + Returns: + :obj:`str`: The cleaned-up string. 
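Tracing the replacements below on a sample string, for illustration:

    tokenizer.clean_up_tokenization("Do n't worry , it 's fine .")
    # -> "Don't worry, it's fine."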
+ """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): + """ + Depending on the input and internal state we might trigger a warning about a sequence that is too long for its + corresponding model + + Args: + ids (:obj:`List[str]`): The ids produced by the tokenization + max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set) + verbose (:obj:`bool`): Whether or not to print more information and warnings. + + """ + if max_length is None and len(ids) > self.model_max_length and verbose: + if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False): + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " + "will result in indexing errors" + ) + self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + yield + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = None, + truncation: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare model inputs for translation. For best performance, translate one sentence at a time. + + Arguments: + src_texts (:obj:`List[str]`): + List of documents to summarize or source language texts. + tgt_texts (:obj:`list`, `optional`): + List of summaries or target language texts. + max_length (:obj:`int`, `optional`): + Controls the maximum length for encoder inputs (documents to summarize or source language texts) If + left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + max_target_length (:obj:`int`, `optional`): + Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set + to :obj:`None`, this will use the max_length value. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). 
+ return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + **kwargs: + Additional keyword arguments passed along to :obj:`self.__call__`. + + Return: + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + + - **input_ids** -- List of token ids to be fed to the encoder. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. + - **labels** -- List of token ids for tgt_texts. + + The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed. + Otherwise, input_ids, attention_mask will be the only keys. + """ + # docstyle-ignore + formatted_warning = """ +`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular +`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare +your targets. + +Here is a short example: + +model_inputs = tokenizer(src_texts, ...) +with tokenizer.as_target_tokenizer(): + labels = tokenizer(tgt_texts, ...) +model_inputs["labels"] = labels["input_ids"] + +See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice. +For a more complete example, see the implementation of `prepare_seq2seq_batch`. +""" + logger.warn(formatted_warning, FutureWarning) + # mBART-specific kwargs that should be ignored by other models. 
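A hypothetical call matching the Returns section above, assuming a seq2seq tokenizer such as the ported BART one:

    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["A very long document to compress."],
        tgt_texts=["A short summary."],
        return_tensors="pt",
    )
    sorted(batch.keys())   # ['attention_mask', 'input_ids', 'labels']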
+ kwargs.pop("src_lang", None) + kwargs.pop("tgt_lang", None) + if max_length is None: + max_length = self.model_max_length + model_inputs = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + with self.as_target_tokenizer(): + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) + model_inputs["labels"] = labels["input_ids"] + return model_inputs diff --git a/fastNLP/transformers/torch/utils/__init__.py b/fastNLP/transformers/torch/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fastNLP/transformers/torch/utils/model_parallel_utils.py b/fastNLP/transformers/torch/utils/model_parallel_utils.py new file mode 100644 index 00000000..3a145df9 --- /dev/null +++ b/fastNLP/transformers/torch/utils/model_parallel_utils.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import ceil + + +def assert_device_map(device_map, num_blocks): + blocks = list(range(0, num_blocks)) + + device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] + + # Duplicate check + duplicate_blocks = [] + for i in device_map_blocks: + if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: + duplicate_blocks.append(i) + # Missing blocks + missing_blocks = [i for i in blocks if i not in device_map_blocks] + extra_blocks = [i for i in device_map_blocks if i not in blocks] + + assert len(duplicate_blocks) == 0, ( + "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device. These " + "attention blocks were specified more than once: " + str(duplicate_blocks) + ) + assert len(missing_blocks) == 0, ( + "There are attention blocks for this model that are not specified in the device_map. Add these attention " + "blocks to a device on the device_map: " + str(missing_blocks) + ) + assert ( + len(extra_blocks) == 0 + ), "The device_map contains more attention blocks than this model has. Remove these from the device_map:" + str( + extra_blocks + ) + + +def get_device_map(n_layers, devices): + """Returns a dictionary of layers distributed evenly across all devices.""" + layers = list(range(n_layers)) + n_blocks = int(ceil(n_layers / len(devices))) + layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)) + + return dict(zip(devices, layers_list)) diff --git a/fastNLP/transformers/torch/utils/versions.py b/fastNLP/transformers/torch/utils/versions.py new file mode 100644 index 00000000..cb2fbdb9 --- /dev/null +++ b/fastNLP/transformers/torch/utils/versions.py @@ -0,0 +1,120 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
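For the model-parallel helpers above, a couple of traced values, for illustration:

    get_device_map(n_layers=8, devices=[0, 1])   # {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}
    get_device_map(n_layers=7, devices=[0, 1])   # {0: [0, 1, 2, 3], 1: [4, 5, 6]}
    # assert_device_map then verifies that every block appears exactly once
    assert_device_map({0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}, num_blocks=8)   # passes silently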
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for working with package versions +""" + +import operator +import re +import sys +from typing import Optional + +from packaging import version + + +# The package importlib_metadata is in a different place, depending on the python version. +if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + + +ops = { + "<": operator.lt, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, + ">=": operator.ge, + ">": operator.gt, +} + + +def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): + if got_ver is None: + raise ValueError("got_ver is None") + if want_ver is None: + raise ValueError("want_ver is None") + if not ops[op](version.parse(got_ver), version.parse(want_ver)): + raise ImportError( + f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" + ) + + +def require_version(requirement: str, hint: Optional[str] = None) -> None: + """ + Perform a runtime check of the dependency versions, using the exact same syntax used by pip. + + The installed module version comes from the `site-packages` dir via `importlib_metadata`. + + Args: + requirement (:obj:`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" + hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met + + Example:: + + require_version("pandas>1.1.2") + require_version("numpy>1.18.5", "this is important to have for whatever reason") + + """ + + hint = f"\n{hint}" if hint is not None else "" + + # non-versioned check + if re.match(r"^[\w_\-\d]+$", requirement): + pkg, op, want_ver = requirement, None, None + else: + match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) + if not match: + raise ValueError( + f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" + ) + pkg, want_full = match[0] + want_range = want_full.split(",") # there could be multiple requirements + wanted = {} + for w in want_range: + match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) + if not match: + raise ValueError( + f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" + ) + op, want_ver = match[0] + wanted[op] = want_ver + if op not in ops: + raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") + + # special case + if pkg == "python": + got_ver = ".".join([str(x) for x in sys.version_info[:3]]) + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + return + + # check if any version is installed + try: + got_ver = importlib_metadata.version(pkg) + except importlib_metadata.PackageNotFoundError: + raise importlib_metadata.PackageNotFoundError( + f"The '{requirement}' distribution was not found and is required by this application. 
{hint}" + ) + + # check that the right version is installed if version number or a range was provided + if want_ver is not None: + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + + +def require_version_core(requirement): + """require_version wrapper which emits a core-specific hint on failure""" + hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master" + return require_version(requirement, hint) From df0651baaecb899b02a7205c4db840a88eca3eac Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 09:00:29 +0000 Subject: [PATCH 2/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=20transformers=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=E4=BE=9D=E8=B5=96=E5=8C=85=E5=AD=97=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/transformers/torch/deepspeed.py | 4 +- .../torch/dependency_versions_check.py | 20 ----- .../torch/dependency_versions_table.py | 76 ------------------- 3 files changed, 2 insertions(+), 98 deletions(-) delete mode 100644 fastNLP/transformers/torch/dependency_versions_check.py delete mode 100644 fastNLP/transformers/torch/dependency_versions_table.py diff --git a/fastNLP/transformers/torch/deepspeed.py b/fastNLP/transformers/torch/deepspeed.py index fc3fcc7c..e60a7ce8 100644 --- a/fastNLP/transformers/torch/deepspeed.py +++ b/fastNLP/transformers/torch/deepspeed.py @@ -22,7 +22,7 @@ import weakref from copy import deepcopy from functools import partialmethod -from .dependency_versions_check import dep_version_check +from .utils.versions import require_version from fastNLP.envs.imports import _NEED_IMPORT_TORCH from fastNLP.core.log import logger @@ -55,7 +55,7 @@ class HfDeepSpeedConfig: # set global weakref object set_hf_deepspeed_config(self) - dep_version_check("deepspeed") + require_version("deepspeed>=0.5.3") if isinstance(config_file_or_dict, dict): # Don't modify user's data should they want to reuse it (e.g. in tests), because once we diff --git a/fastNLP/transformers/torch/dependency_versions_check.py b/fastNLP/transformers/torch/dependency_versions_check.py deleted file mode 100644 index 30e8f448..00000000 --- a/fastNLP/transformers/torch/dependency_versions_check.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys - -from .dependency_versions_table import deps -from .utils.versions import require_version - -def dep_version_check(pkg, hint=None): - require_version(deps[pkg], hint) diff --git a/fastNLP/transformers/torch/dependency_versions_table.py b/fastNLP/transformers/torch/dependency_versions_table.py deleted file mode 100644 index ef396637..00000000 --- a/fastNLP/transformers/torch/dependency_versions_table.py +++ /dev/null @@ -1,76 +0,0 @@ -# THIS FILE HAS BEEN AUTOGENERATED. To update: -# 1. modify the `_deps` dict in setup.py -# 2. 
run `make deps_table_update`` -deps = { - "Pillow": "Pillow", - "black": "black==21.4b0", - "codecarbon": "codecarbon==1.2.0", - "cookiecutter": "cookiecutter==1.7.2", - "dataclasses": "dataclasses", - "datasets": "datasets", - "deepspeed": "deepspeed>=0.5.3", - "docutils": "docutils==0.16.0", - "fairscale": "fairscale>0.3", - "faiss-cpu": "faiss-cpu", - "fastapi": "fastapi", - "filelock": "filelock", - "flake8": "flake8>=3.8.3", - "flax": "flax>=0.3.4", - "fugashi": "fugashi>=1.0", - "GitPython": "GitPython<3.1.19", - "huggingface-hub": "huggingface-hub>=0.0.17", - "importlib_metadata": "importlib_metadata", - "ipadic": "ipadic>=1.0.0,<2.0", - "isort": "isort>=5.5.4", - "jax": "jax>=0.2.8", - "jaxlib": "jaxlib>=0.1.65", - "jieba": "jieba", - "keras2onnx": "keras2onnx", - "nltk": "nltk", - "numpy": "numpy>=1.17", - "onnxconverter-common": "onnxconverter-common", - "onnxruntime-tools": "onnxruntime-tools>=1.4.2", - "onnxruntime": "onnxruntime>=1.4.0", - "optuna": "optuna", - "optax": "optax>=0.0.8", - "packaging": "packaging>=20.0", - "parameterized": "parameterized", - "protobuf": "protobuf", - "psutil": "psutil", - "pyyaml": "pyyaml>=5.1", - "pydantic": "pydantic", - "pytest": "pytest", - "pytest-timeout": "pytest-timeout", - "pytest-xdist": "pytest-xdist", - "python": "python>=3.6.0", - "ray[tune]": "ray[tune]", - "recommonmark": "recommonmark", - "regex": "regex!=2019.12.17", - "requests": "requests", - "rouge-score": "rouge-score", - "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", - "sacremoses": "sacremoses", - "sagemaker": "sagemaker>=2.31.0", - "scikit-learn": "scikit-learn", - "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", - "sigopt": "sigopt", - "soundfile": "soundfile", - "sphinx-copybutton": "sphinx-copybutton", - "sphinx-markdown-tables": "sphinx-markdown-tables", - "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", - "sphinx": "sphinx==3.2.1", - "sphinxext-opengraph": "sphinxext-opengraph==0.4.1", - "sphinx-intl": "sphinx-intl", - "starlette": "starlette", - "tensorflow-cpu": "tensorflow-cpu>=2.3", - "tensorflow": "tensorflow>=2.3", - "timeout-decorator": "timeout-decorator", - "timm": "timm", - "tokenizers": "tokenizers>=0.10.1,<0.11", - "torch": "torch>=1.0", - "torchaudio": "torchaudio", - "tqdm": "tqdm>=4.27", - "unidic": "unidic>=1.0.2", - "unidic_lite": "unidic_lite>=1.0.7", - "uvicorn": "uvicorn", -} From b3e0ebd7fc56b119ce4116c41fd7660071165940 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 30 Apr 2022 17:10:03 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BA=86Collator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callbacks/callback.py | 3 +- .../callbacks/load_best_model_callback.py | 33 ++++----- fastNLP/core/collators/new_collator.py | 34 +++++---- fastNLP/core/collators/padders/utils.py | 2 + fastNLP/core/collators/utils.py | 51 ++++++++------ fastNLP/core/dataloaders/fdataloader.py | 7 -- .../core/dataloaders/torch_dataloader/fdl.py | 2 +- .../collators/padders/test_numpy_padder.py | 2 +- tests/core/collators/test_new_collator.py | 70 ++++++++++++++++++- tests/core/collators/test_utils.py | 16 ++--- 10 files changed, 148 insertions(+), 72 deletions(-) delete mode 100644 fastNLP/core/dataloaders/fdataloader.py diff --git a/fastNLP/core/callbacks/callback.py b/fastNLP/core/callbacks/callback.py index 982df7da..7f0c290d 100644 --- a/fastNLP/core/callbacks/callback.py +++ b/fastNLP/core/callbacks/callback.py @@ -126,7 +126,8 @@ class Callback: :param trainer: `fastNLP.Trainer` :param batch: batch 
的数据,已经经过 input_mapping (如果有) 以及 移动到指定设备 。 - :param list[int] indices: 当前的 batch 是 dataset 中的哪些数据 + :param list[int] indices: 当前的 batch 是 dataset 中的哪些数据。仅在 DataLoader 支持得到当前 batch index 的时候有值, + 其它时候为 None 。 """ pass diff --git a/fastNLP/core/callbacks/load_best_model_callback.py b/fastNLP/core/callbacks/load_best_model_callback.py index 5addd2e2..32534d2a 100644 --- a/fastNLP/core/callbacks/load_best_model_callback.py +++ b/fastNLP/core/callbacks/load_best_model_callback.py @@ -94,20 +94,21 @@ class LoadBestModelCallback(HasMonitorCallback): else: self.buffer.seek(0) trainer.load_model(folder=self.buffer, only_state_dict=self.only_state_dict) - - self._delete_after_after(trainer) - - def _delete_after_after(self, trainer): - trainer.driver.barrier() if self.delete_after_after: - if self.real_save_folder: - logger.info(f"Deleting {self.real_save_folder}...") - shutil.rmtree(self.real_save_folder, ignore_errors=True) - try: - # 如果是 emtpy 的,就会被删除掉 - os.rmdir(self.save_folder) - except: - pass - elif hasattr(self, 'buffer'): - self.buffer.close() - del self.buffer \ No newline at end of file + trainer.driver.barrier() + self._delete_folder() + trainer.driver.barrier() + + def _delete_folder(self): + if self.real_save_folder: + logger.info(f"Deleting {self.real_save_folder}...") + shutil.rmtree(self.real_save_folder, ignore_errors=True) + try: + # 如果是 emtpy 的,就会被删除掉 + os.rmdir(self.save_folder) + logger.debug(f"Since {self.save_folder} is an empty folder, it has been removed.") + except: + pass + elif hasattr(self, 'buffer'): + self.buffer.close() + del self.buffer \ No newline at end of file diff --git a/fastNLP/core/collators/new_collator.py b/fastNLP/core/collators/new_collator.py index 869a60a7..9123a293 100644 --- a/fastNLP/core/collators/new_collator.py +++ b/fastNLP/core/collators/new_collator.py @@ -6,7 +6,7 @@ from .padders.get_padder import get_padder import re from .utils import unpack_batch_mapping, unpack_batch_nested_mapping, pack_batch_nested_mapping, unpack_batch_sequence, \ - pack_batch_sequence, NESTED_DICT_SEPARATOR + pack_batch_sequence sequence_idx_str = re.compile(r'^_\d+$') # 形如_0, _1 SUPPORTED_BACKENDS = ['torch', 'jittor', 'paddle', 'numpy', 'raw', None] @@ -16,10 +16,11 @@ class Collator: def __init__(self, backend='torch'): """ 用于 pad 数据的对象。会自动将所有能够 pad (由 fastNLP 根据数据判定能否 pad )的数据都进行 pad 操作,默认 pad 的值为 0。 - 可使用 set_pad() 函数调整。如果有些 field 不想输出,可以使用 set_ignore() 函数进行设置。 + 可使用 set_pad() 函数调整。如果有些 field 不想输出,可以使用 set_ignore() 函数进行设置。Collator 在第一次进行 pad 的 + 时候自动根据设置以及数据情况,为每个 field 获取一个 padder ,在之后的每次调用中,都将使用对应的 Padder 给对应的 field 。 - :param backend: 对于可以 pad 的 field,使用哪种 tensor,支持 ['torch','jittor','paddle','numpy','raw',None], - 若为 None ,则不进行 padding 。 + :param backend: 对于可以 pad 的 field,使用哪种 tensor,支持 ['torch','jittor','paddle','numpy','raw',None]。 + 若为 None ,则不进行 padding 。该参数对本身就不能进行 pad 的数据没用影响,不能 pad 的数据返回一定是 list 。 """ self.unpack_batch_func = None self.pack_batch_func = None @@ -54,22 +55,25 @@ class Collator: else: self.batch_data_type = 's' logger.debug(f"Since batch[0] has type:{type(batch[0])}, so the batch_data_type " - f"is {self.batch_data_type}") + f"is `{self.batch_data_type}`.") if self.batch_data_type == 's': - self.unpack_batch_func = lambda x:{'_single': x} # 不需要做任何调整 - self.pack_batch_func = lambda x:x['_single'] + self.unpack_batch_func = lambda batch, ignore_fields: {'_single': batch} # 不需要做任何调整 + self.pack_batch_func = lambda x: x['_single'] elif self.batch_data_type == 'l': self.unpack_batch_func = unpack_batch_sequence self.pack_batch_func = 
pack_batch_sequence elif self.batch_data_type == 'd': - if any([isinstance(v, Mapping) for v in batch[0].values()]): # 可能存在 nested 的dict。{'a': {'b': xx}}->{'a@@b': value} + if any([isinstance(v, Mapping) for v in batch[0].values()]): # 可能存在 nested 的dict。{'a': {'b': xx}}->{('a', 'b'): value} self.unpack_batch_func = unpack_batch_nested_mapping self.pack_batch_func = pack_batch_nested_mapping else: self.unpack_batch_func = unpack_batch_mapping self.pack_batch_func = lambda x:x - unpack_batch:Dict = self.unpack_batch_func(batch) # 将各自 field 组成 batch 形式。 + if self.unpack_batch_func is unpack_batch_nested_mapping: # 比较特殊,需要防止继续往下延伸 + unpack_batch: Dict = self.unpack_batch_func(batch, self.ignore_fields, set(self.input_fields.keys())) + else: + unpack_batch:Dict = self.unpack_batch_func(batch, self.ignore_fields) # 将各自 field 组成 batch 形式。 pad_batch = {} if len(self.padders)==0: # 第一次运行,准备 padder @@ -96,13 +100,13 @@ class Collator: return self.pack_batch_func(pad_batch) # 根据情况恢复成与输入一致的类型 - def set_pad(self, field_name:str, pad_val:Union[int, float, None]=0, dtype=None, backend=None, + def set_pad(self, field_name:Union[str, tuple], pad_val:Union[int, float, None]=0, dtype=None, backend=None, pad_fn:Callable=None) -> "Collator": """ 如果需要对某个 field 的内容进行特殊的调整,请使用这个函数。 :param field_name: 需要调整的 field 的名称。如果 Dataset 的 __getitem__ 方法返回的是 dict 类型的,则可以直接使用对应的 - field 的 key 来表示,如果是 nested 的 dict,可以使用 @@ 来连接不同层次的 key,例如 {'a': {'b': 1}} 中的使用 a@@b; + field 的 key 来表示,如果是 nested 的 dict,可以使用元组表示多层次的 key,例如 {'a': {'b': 1}} 中的使用 ('a', 'b'); 如果 __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。如果该 field 在数据中没 有找到,则报错;如果 __getitem__ 返回的是就是整体内容,请使用 "_single" 。 :param pad_val: 这个 field 的默认 pad 值。如果设置为 None,则表示该 field 不需要 pad , fastNLP 默认只会对可以 pad 的 @@ -126,11 +130,11 @@ class Collator: f"index, but other field is set as dict mode." elif self.batch_data_type == 'l': assert sequence_idx_str.match(field_name) is not None, f"Other field is set as list mode. But the new " \ - f"field name is {field_name}" + f"field name is {field_name}." 
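# A minimal usage sketch of the tuple-key convention documented in set_pad() /
# set_ignore() above; `dict_batch` is a hypothetical list of nested-dict samples,
# e.g. [{'str': 'x', 'nested_dict': {'a': 1, 'b': [1, 2]}}, ...].
collator = Collator(backend='raw')
collator.set_pad(('nested_dict', 'b'), pad_val=-1)   # address the nested key {'nested_dict': {'b': ...}}
collator.set_ignore('str', ('nested_dict', 'a'))     # drop these fields from the padded output
padded_batch = collator(dict_batch)
# For Sequence-style samples use '_0', '_1', ...; for whole-sample data use '_single'.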
if field_name == '_single': self.batch_data_type = 's' - elif sequence_idx_str.match(field_name): + elif isinstance(field_name, str) and sequence_idx_str.match(field_name): self.batch_data_type = 'l' else: self.batch_data_type = 'd' @@ -165,8 +169,8 @@ class Collator: collator.set_ignore('field1', 'field2') :param field_names: 需要忽略的 field 的名称。如果 Dataset 的 __getitem__ 方法返回的是 dict 类型的,则可以直接使用对应的 - field 的 key 来表示,如果是 nested 的 dict,可以使用 @@ 来连接不同层次的 key,例如 {'a': {'b': 1}} 中的使用 a@@b; - 如果 __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。 + field 的 key 来表示,如果是 nested 的 dict,可以使用元组来表示,例如 {'a': {'b': 1}} 中的使用 ('a', 'b'); 如果 + __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。 :return: 返回 Collator 自身 """ for field_name in field_names: diff --git a/fastNLP/core/collators/padders/utils.py b/fastNLP/core/collators/padders/utils.py index f6240219..d2d3a8e0 100644 --- a/fastNLP/core/collators/padders/utils.py +++ b/fastNLP/core/collators/padders/utils.py @@ -149,6 +149,7 @@ def is_number(dtype): if dtype in (float, int, complex, bool) and not is_numpy_generic_class(dtype) \ and not is_numpy_number_dtype(dtype): return True + return False except: return False @@ -161,6 +162,7 @@ if __name__ == '__main__': # print(type(b[0])) # print(b) # import torch + print(is_number(type('a'))) print(is_number_or_numpy_number(type(3))) # True print(is_number_or_numpy_number(type(3.1))) # True print(is_number_or_numpy_number(type('3'))) # False diff --git a/fastNLP/core/collators/utils.py b/fastNLP/core/collators/utils.py index 9a397c66..1a82aa23 100644 --- a/fastNLP/core/collators/utils.py +++ b/fastNLP/core/collators/utils.py @@ -2,54 +2,58 @@ from collections import defaultdict from functools import reduce from typing import Sequence, Mapping, Dict -NESTED_DICT_SEPARATOR = '@@' - -def unpack_batch_mapping(batch:Sequence[Mapping])->Dict: +def unpack_batch_mapping(batch:Sequence[Mapping], ignore_fields:set)->Dict: """ 将 Sequence[Mapping] 转为 Dict 。例如 [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] -> {'a': [[1, 2], [3]], 'b': [1, 2]} :param batch: + :param ignore_fields: :return: """ dict_batch = defaultdict(list) for sample in batch: for key, value in sample.items(): + if key in ignore_fields: + continue dict_batch[key].append(value) return dict_batch -def unpack_batch_nested_mapping(batch:Sequence[Mapping], _parent='')->Dict: +def unpack_batch_nested_mapping(batch:Sequence[Mapping], ignore_fields:set, stop_deep_fields:set)->Dict: """ 将 nested 的 dict 中的内容展开到一个 flat dict 中 :param batch: - :param _parent: 内部使用 + :param ignore_fields: 需要忽略的 field 。 + :param stop_deep_fields: 不需要继续往下衍射的 :return: """ dict_batch = defaultdict(list) - if _parent != '': - _parent += NESTED_DICT_SEPARATOR for sample in batch: for key, value in sample.items(): - if isinstance(value, Mapping): - _dict_batch = _unpack_batch_nested_mapping(value, _parent=_parent + key) + if key in ignore_fields: + continue + if isinstance(value, Mapping) and key not in stop_deep_fields: + _dict_batch = _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent=(key,)) for key, value in _dict_batch.items(): dict_batch[key].append(value) else: - dict_batch[_parent + key].append(value) + dict_batch[key].append(value) return dict_batch -def _unpack_batch_nested_mapping(value, _parent)->Dict: +def _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent)->Dict: _dict = {} - _parent += NESTED_DICT_SEPARATOR for k, v in value.items(): - if isinstance(v, Mapping): - __dict = _unpack_batch_nested_mapping(v, 
_parent=_parent + k) + _k = _parent + (k,) + if _k in ignore_fields: + continue + if isinstance(v, Mapping) and _k not in stop_deep_fields: + __dict = _unpack_batch_nested_mapping(v, ignore_fields, stop_deep_fields, _parent=_k) _dict.update(__dict) else: - _dict[_parent + k] = v + _dict[_k] = v return _dict @@ -63,10 +67,11 @@ def pack_batch_nested_mapping(batch:Mapping) -> Dict: dicts = [] for key, value in batch.items(): - keys = key.split(NESTED_DICT_SEPARATOR) - d = {keys[-1]: value} - for key in keys[:-1:][::-1]: - d = {key: d} + if not isinstance(key, tuple): + key = [key] + d = {key[-1]: value} + for k in key[:-1:][::-1]: + d = {k: d} dicts.append(d) return reduce(_merge_dict, dicts) @@ -85,17 +90,21 @@ def _merge_dict(a, b, path=None): return a -def unpack_batch_sequence(batch:Sequence[Sequence])->Dict: +def unpack_batch_sequence(batch:Sequence[Sequence], ignore_fields)->Dict: """ 将 Sequence[Sequence] 转为 Mapping 。例如 [[[1, 2], 2], [[3], 2]] -> {'_0': [[1, 2], [3]], '_1': [1, 2]} :param batch: + :param ignore_fields: 需要忽略的field :return: """ dict_batch = defaultdict(list) for sample in batch: for i, content in enumerate(sample): - dict_batch[f'_{i}'].append(content) + field_name = f'_{i}' + if field_name in ignore_fields: + continue + dict_batch[field_name].append(content) return dict_batch diff --git a/fastNLP/core/dataloaders/fdataloader.py b/fastNLP/core/dataloaders/fdataloader.py deleted file mode 100644 index 742f3909..00000000 --- a/fastNLP/core/dataloaders/fdataloader.py +++ /dev/null @@ -1,7 +0,0 @@ -__all__ = [ - 'FDataLoader' -] - - -class FDataLoader: - pass diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index cf8e2c31..02721aaf 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -17,7 +17,7 @@ if _NEED_IMPORT_TORCH: from torch.utils.data import DataLoader, Sampler from torch.utils.data._utils.collate import default_collate else: - from ..fdataloader import FDataLoader as DataLoader + from fastNLP.core.utils.dummy_class import DummyClass as DataLoader class _FDataSet: diff --git a/tests/core/collators/padders/test_numpy_padder.py b/tests/core/collators/padders/test_numpy_padder.py index 42665857..6cc9d668 100644 --- a/tests/core/collators/padders/test_numpy_padder.py +++ b/tests/core/collators/padders/test_numpy_padder.py @@ -10,7 +10,7 @@ class TestNumpyNumberPadder: def test_run(self): padder = NumpyNumberPadder(ele_dtype=int, dtype=int, pad_val=-1) a = [1, 2, 3] - assert isinstance(a, np.ndarray) + assert isinstance(padder(a), np.ndarray) assert (padder(a) == np.array(a)).sum() == 3 diff --git a/tests/core/collators/test_new_collator.py b/tests/core/collators/test_new_collator.py index 5fc82c91..7c27b3a9 100644 --- a/tests/core/collators/test_new_collator.py +++ b/tests/core/collators/test_new_collator.py @@ -158,7 +158,7 @@ class TestCollator: # 测试 ignore collator = Collator(backend='raw') - collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'a')) raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} findDictDiff(raw_pad_batch, collator(dict_batch)) @@ -171,7 +171,7 @@ class TestCollator: # 测试设置 pad 值 collator = 
Collator(backend='raw') collator.set_pad('nest_lst_int', pad_val=100) - collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict','a')) raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 100], [100, 100]], [[1, 100], [1, 2]]], 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} findDictDiff(raw_pad_batch, collator(dict_batch)) @@ -217,6 +217,72 @@ class TestCollator: collator.set_pad('_single') findListDiff(list_batch, collator(list_batch)) + def test_nest_ignore(self): + dict_batch = [{ + 'str': '1', + 'lst_str': ['1'], + 'int': 1, + 'lst_int': [1], + 'nest_lst_int': [[1]], + 'float': 1.1, + 'lst_float': [1.1], + 'bool': True, + 'numpy': np.ones(1), + 'dict': {'1': '1'}, + 'set': {'1'}, + 'nested_dict': {'int': 1, 'lst_int':[1, 2], 'c': {'int': 1}} + }, + { + 'str': '2', + 'lst_str': ['2', '2'], + 'int': 2, + 'lst_int': [1, 2], + 'nest_lst_int': [[1], [1, 2]], + 'float': 2.1, + 'lst_float': [2.1], + 'bool': False, + 'numpy': np.zeros(1), + 'dict': {'1': '2'}, + 'set': {'2'}, + 'nested_dict': {'int': 1, 'lst_int': [1, 2], 'c': {'int': 1}} + } + ] + # 测试 ignore + collator = Collator(backend='raw') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'int')) + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': {'int':[1, 1]}}} + findDictDiff(raw_pad_batch, collator(dict_batch)) + + collator = Collator(backend='raw') + collator.set_pad(('nested_dict', 'c'), pad_val=None) + collator.set_ignore('str', 'int', 'lst_int') + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': [{'int':1}, {'int':1}]}} + pad_batch = collator(dict_batch) + findDictDiff(raw_pad_batch, pad_batch) + + collator = Collator(backend='raw') + collator.set_pad(('nested_dict', 'c'), pad_val=1) + with pytest.raises(BaseException): + collator(dict_batch) + + collator = Collator(backend='raw') + collator.set_ignore('str', 'int', 'lst_int') + collator.set_pad(('nested_dict', 'c'), pad_fn=lambda x: [d['int'] for d in x]) + pad_batch = collator(dict_batch) + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': [1, 1]}} + findDictDiff(raw_pad_batch, pad_batch) diff --git a/tests/core/collators/test_utils.py b/tests/core/collators/test_utils.py index d56dacc6..74c54a36 100644 --- a/tests/core/collators/test_utils.py +++ b/tests/core/collators/test_utils.py @@ -4,25 +4,25 @@ from fastNLP.core.collators.utils import * def test_unpack_batch_mapping(): batch = [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] - assert unpack_batch_mapping(batch)=={'a': [[1, 2], [3]], 'b': [1, 2]} + assert 
unpack_batch_mapping(batch, {})=={'a': [[1, 2], [3]], 'b': [1, 2]} def test_unpack_batch_nested_mapping(): batch = [{'a': [1, 2], 'b': 1, 'c': {'c': 1}}, {'a': [3], 'b': 2, 'c': {'c': 2}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c': [1, 2]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c','c'): [1, 2]} batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1}}}, {'a': [3], 'b': 2, 'c': {'c': {'c': 2}}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2]} batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1, 'd':[1, 1]}, 'd': [1]}}, {'a': [3], 'b': 2, 'c': {'c': {'c': 2, 'd': [2, 2]}, 'd': [2, 2]}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], - 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], + ('c','c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} def test_pack_batch_nested_mapping(): - batch = {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], - 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} + batch = {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], + ('c', 'c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} new_batch = pack_batch_nested_mapping(batch) assert new_batch == {'a': [[1, 2], [3]], 'b': [1, 2], 'c': {'c':{'c': [1, 2], 'd': [[1, 1], [2, 2]]}, 'd':[[1], [2, 2]]}} @@ -30,7 +30,7 @@ def test_pack_batch_nested_mapping(): def test_unpack_batch_sequence(): batch = [[1, 2, 3], [2, 4, 6]] - new_batch = unpack_batch_sequence(batch) + new_batch = unpack_batch_sequence(batch, {}) assert new_batch == {'_0': [1, 2], '_1': [2, 4], '_2': [3, 6]} From a6103f634253458b909f3e1d8113f94e2f34921c Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 10:51:55 +0000 Subject: [PATCH 4/9] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=E4=B8=8D?= =?UTF-8?q?=E9=9C=80=E8=A6=81pytest=E7=9A=84=E6=B5=8B=E8=AF=95=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../controllers/{test_trainer_fleet.py => _test_trainer_fleet.py} | 0 ...st_trainer_fleet_outside.py => _test_trainer_fleet_outside.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/core/controllers/{test_trainer_fleet.py => _test_trainer_fleet.py} (100%) rename tests/core/controllers/{test_trainer_fleet_outside.py => _test_trainer_fleet_outside.py} (100%) diff --git a/tests/core/controllers/test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py similarity index 100% rename from tests/core/controllers/test_trainer_fleet.py rename to tests/core/controllers/_test_trainer_fleet.py diff --git a/tests/core/controllers/test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py similarity index 100% rename from tests/core/controllers/test_trainer_fleet_outside.py rename to tests/core/controllers/_test_trainer_fleet_outside.py From b3c9819fb84c93b674af71bee60f50aed3179fab Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 12:55:57 +0000 Subject: [PATCH 5/9] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20=5F=5Finit=5F=5F.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
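A quick round-trip illustration of the tuple-key convention exercised by the collator utils tests above; the values are made up and the import path assumes the patched `fastNLP/core/collators/utils.py`:

```python
from fastNLP.core.collators.utils import unpack_batch_nested_mapping, pack_batch_nested_mapping

batch = [{'a': 1, 'c': {'c': 1, 'd': 2}},
         {'a': 2, 'c': {'c': 3, 'd': 4}}]

# Default: descend into every Mapping; nested keys become tuples such as ('c', 'c').
flat = unpack_batch_nested_mapping(batch, set(), set())
assert flat == {'a': [1, 2], ('c', 'c'): [1, 3], ('c', 'd'): [2, 4]}

# pack_batch_nested_mapping restores the original nesting after padding.
assert pack_batch_nested_mapping(flat) == {'a': [1, 2], 'c': {'c': [1, 3], 'd': [2, 4]}}

# ignore_fields drops a field; stop_deep_fields keeps a sub-dict intact, which is how
# the Collator hands whole sub-dicts to fields configured with set_pad(..., pad_fn=...).
flat = unpack_batch_nested_mapping(batch, ignore_fields={'a'}, stop_deep_fields={'c'})
assert flat == {'c': [{'c': 1, 'd': 2}, {'c': 3, 'd': 4}]}
```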
tests/core/dataloaders/jittor_dataloader/__init__.py | 0 tests/core/dataloaders/paddle_dataloader/__init__.py | 0 tests/core/dataloaders/torch_dataloader/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/core/dataloaders/jittor_dataloader/__init__.py create mode 100644 tests/core/dataloaders/paddle_dataloader/__init__.py create mode 100644 tests/core/dataloaders/torch_dataloader/__init__.py diff --git a/tests/core/dataloaders/jittor_dataloader/__init__.py b/tests/core/dataloaders/jittor_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/paddle_dataloader/__init__.py b/tests/core/dataloaders/paddle_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/torch_dataloader/__init__.py b/tests/core/dataloaders/torch_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b From cf2ef2ecd79a43f9ecf4054f067231fc421e0dd9 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 13:04:55 +0000 Subject: [PATCH 6/9] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E4=BE=8B=E7=9A=84backend=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paddle_driver/initialize_paddle_driver.py | 2 +- .../torch_driver/initialize_torch_driver.py | 4 +-- fastNLP/core/metrics/utils.py | 5 ++- .../test_load_best_model_callback_torch.py | 4 +-- tests/core/controllers/_test_trainer_fleet.py | 1 - .../_test_trainer_fleet_outside.py | 1 - tests/core/controllers/test_trainer_paddle.py | 4 +-- .../drivers/paddle_driver/test_dist_utils.py | 1 - .../core/drivers/paddle_driver/test_fleet.py | 2 -- .../test_initialize_paddle_driver.py | 3 -- .../paddle_driver/test_single_device.py | 3 -- .../core/drivers/paddle_driver/test_utils.py | 2 -- tests/core/drivers/torch_driver/test.py | 31 +++++++++++++++++++ tests/core/drivers/torch_driver/test_ddp.py | 2 -- .../test_initialize_torch_driver.py | 3 -- .../torch_driver/test_single_device.py | 2 -- tests/core/drivers/torch_driver/test_utils.py | 2 -- .../core/samplers/test_unrepeated_sampler.py | 18 +++++------ 18 files changed, 48 insertions(+), 42 deletions(-) create mode 100644 tests/core/drivers/torch_driver/test.py diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index 9a9d4198..c0489e6e 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -14,7 +14,7 @@ if _NEED_IMPORT_PADDLE: import paddle def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]], - model: paddle.nn.Layer, **kwargs) -> PaddleDriver: + model: "paddle.nn.Layer", **kwargs) -> PaddleDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 1、如果检测到当前进程为用户通过 `python -m paddle.distributed.launch xxx.py` 方式拉起的,则将 diff --git a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py index 5ee946c4..7cef7316 100644 --- a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py +++ b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py @@ -11,8 +11,8 @@ from fastNLP.core.log import logger from fastNLP.envs import FASTNLP_BACKEND_LAUNCH -def initialize_torch_driver(driver: str, device: Optional[Union[str, torch.device, int, List[int]]], - model: torch.nn.Module, 
**kwargs) -> TorchDriver: +def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.device", int, List[int]]], + model: "torch.nn.Module", **kwargs) -> TorchDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 注意如果输入的 `device` 如果和 `driver` 对应不上就直接报错; diff --git a/fastNLP/core/metrics/utils.py b/fastNLP/core/metrics/utils.py index ce6f618b..6d3fd74a 100644 --- a/fastNLP/core/metrics/utils.py +++ b/fastNLP/core/metrics/utils.py @@ -11,9 +11,8 @@ _IS_ALLENNLP_AVAILABLE = _module_available('allennlp') if _IS_ALLENNLP_AVAILABLE: from allennlp.training.metrics import Metric as allennlp_Metric -if _NEED_IMPORT_TORCH and _IS_TORCHMETRICS_AVAILABLE: - if _IS_TORCHMETRICS_AVAILABLE: - from torchmetrics import Metric as torchmetrics_Metric +if _IS_TORCHMETRICS_AVAILABLE: + from torchmetrics import Metric as torchmetrics_Metric if _NEED_IMPORT_PADDLE: from paddle.metric import Metric as paddle_Metric diff --git a/tests/core/callbacks/test_load_best_model_callback_torch.py b/tests/core/callbacks/test_load_best_model_callback_torch.py index 0bc63bd5..b042ae0f 100644 --- a/tests/core/callbacks/test_load_best_model_callback_torch.py +++ b/tests/core/callbacks/test_load_best_model_callback_torch.py @@ -16,7 +16,7 @@ from fastNLP.core.controllers.trainer import Trainer from fastNLP.core.metrics.accuracy import Accuracy from fastNLP.core.callbacks.load_best_model_callback import LoadBestModelCallback from fastNLP.core import Evaluator -from fastNLP.core.utils.utils import safe_rm +from fastNLP.core import rank_zero_rm from fastNLP.core.drivers.torch_driver import TorchSingleDriver from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 from tests.helpers.datasets.torch_data import TorchArgMaxDataset @@ -112,7 +112,7 @@ def test_load_best_model_callback( results = evaluator.run() assert np.allclose(callbacks[0].monitor_value, results['acc#acc#dl1']) if save_folder: - safe_rm(save_folder) + rank_zero_rm(save_folder) if dist.is_initialized(): dist.destroy_process_group() diff --git a/tests/core/controllers/_test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py index 46201c67..f438b6de 100644 --- a/tests/core/controllers/_test_trainer_fleet.py +++ b/tests/core/controllers/_test_trainer_fleet.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/_test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py index a48434fa..e8c9a244 100644 --- a/tests/core/controllers/_test_trainer_fleet_outside.py +++ b/tests/core/controllers/_test_trainer_fleet_outside.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/test_trainer_paddle.py b/tests/core/controllers/test_trainer_paddle.py index 8a3ab2ce..aaf20105 100644 --- a/tests/core/controllers/test_trainer_paddle.py +++ b/tests/core/controllers/test_trainer_paddle.py @@ -1,6 +1,4 @@ import pytest -import os -os.environ["FASTNLP_BACKEND"] = "paddle" from dataclasses import dataclass from fastNLP.core.controllers.trainer import Trainer @@ -25,7 +23,7 @@ class TrainPaddleConfig: shuffle: bool = True evaluate_every = 2 -@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1)]) 
+@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])]) # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])]) @pytest.mark.parametrize("callbacks", [[RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), RichCallback(5)]]) diff --git a/tests/core/drivers/paddle_driver/test_dist_utils.py b/tests/core/drivers/paddle_driver/test_dist_utils.py index 9b81c38d..bd43378e 100644 --- a/tests/core/drivers/paddle_driver/test_dist_utils.py +++ b/tests/core/drivers/paddle_driver/test_dist_utils.py @@ -3,7 +3,6 @@ import sys import signal import pytest import traceback -os.environ["FASTNLP_BACKEND"] = "paddle" import numpy as np diff --git a/tests/core/drivers/paddle_driver/test_fleet.py b/tests/core/drivers/paddle_driver/test_fleet.py index 34c80888..6190dd8c 100644 --- a/tests/core/drivers/paddle_driver/test_fleet.py +++ b/tests/core/drivers/paddle_driver/test_fleet.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.fleet import PaddleFleetDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py index df96d746..c8b5bfff 100644 --- a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py +++ b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" - from fastNLP.core.drivers import PaddleSingleDriver, PaddleFleetDriver from fastNLP.core.drivers.paddle_driver.initialize_paddle_driver import initialize_paddle_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index 2aa4e0e6..ec40e9f3 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -1,6 +1,3 @@ -import os -from re import S -os.environ["FASTNLP_BACKEND"] = "paddle" import pytest from pathlib import Path diff --git a/tests/core/drivers/paddle_driver/test_utils.py b/tests/core/drivers/paddle_driver/test_utils.py index 690d0fb8..69be8055 100644 --- a/tests/core/drivers/paddle_driver/test_utils.py +++ b/tests/core/drivers/paddle_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.utils import ( get_device_from_visible, diff --git a/tests/core/drivers/torch_driver/test.py b/tests/core/drivers/torch_driver/test.py new file mode 100644 index 00000000..3a1a280d --- /dev/null +++ b/tests/core/drivers/torch_driver/test.py @@ -0,0 +1,31 @@ +import sys +sys.path.append("../../../../") +from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver +from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 + +import torch + +device = [0, 1] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() +print("-----------first--------------") + +device = [0, 2] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), 
lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() \ No newline at end of file diff --git a/tests/core/drivers/torch_driver/test_ddp.py b/tests/core/drivers/torch_driver/test_ddp.py index 0e91fe77..87787fbc 100644 --- a/tests/core/drivers/torch_driver/test_ddp.py +++ b/tests/core/drivers/torch_driver/test_ddp.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py index 6c47e30e..3e612964 100644 --- a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py +++ b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" - from fastNLP.core.drivers import TorchSingleDriver, TorchDDPDriver from fastNLP.core.drivers.torch_driver.initialize_torch_driver import initialize_torch_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/torch_driver/test_single_device.py b/tests/core/drivers/torch_driver/test_single_device.py index b8a8def9..f46f69c0 100644 --- a/tests/core/drivers/torch_driver/test_single_device.py +++ b/tests/core/drivers/torch_driver/test_single_device.py @@ -1,5 +1,3 @@ -import os -os.environ["FASTNLP_BACKEND"] = "torch" import pytest from pathlib import Path diff --git a/tests/core/drivers/torch_driver/test_utils.py b/tests/core/drivers/torch_driver/test_utils.py index 8f0172e0..4df767b5 100644 --- a/tests/core/drivers/torch_driver/test_utils.py +++ b/tests/core/drivers/torch_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.utils import ( replace_batch_sampler, diff --git a/tests/core/samplers/test_unrepeated_sampler.py b/tests/core/samplers/test_unrepeated_sampler.py index 4a271f41..39d4e34f 100644 --- a/tests/core/samplers/test_unrepeated_sampler.py +++ b/tests/core/samplers/test_unrepeated_sampler.py @@ -28,12 +28,12 @@ class TestUnrepeatedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) @pytest.mark.parametrize('shuffle', [False, True]) - def test_multi(self, num_replica, num_of_data, shuffle): + def test_multi(self, num_replicas, num_of_data, shuffle): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedRandomSampler(dataset=data, shuffle=shuffle) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) indexes = list(chain(*samplers)) @@ -52,12 +52,12 @@ class TestUnrepeatedSortedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSortedSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) 
samplers.append(sampler) # 保证顺序是没乱的 @@ -83,12 +83,12 @@ class TestUnrepeatedSequentialSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSequentialSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) # 保证顺序是没乱的 From 35f05932687ddf93229d5d26987e9030b744acd9 Mon Sep 17 00:00:00 2001 From: YWMditto Date: Sat, 30 Apr 2022 21:39:20 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E4=B8=80?= =?UTF-8?q?=E4=BA=9B=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6=E7=9A=84=E5=90=8D?= =?UTF-8?q?=E7=A7=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{test_logger.py => test_logger_torch.py} | 0 .../test_reproducible_batch_sampler.py | 294 +++++++++--------- 2 files changed, 147 insertions(+), 147 deletions(-) rename tests/core/log/{test_logger.py => test_logger_torch.py} (100%) diff --git a/tests/core/log/test_logger.py b/tests/core/log/test_logger_torch.py similarity index 100% rename from tests/core/log/test_logger.py rename to tests/core/log/test_logger_torch.py diff --git a/tests/core/samplers/test_reproducible_batch_sampler.py b/tests/core/samplers/test_reproducible_batch_sampler.py index 3514c331..6cf4b7d4 100644 --- a/tests/core/samplers/test_reproducible_batch_sampler.py +++ b/tests/core/samplers/test_reproducible_batch_sampler.py @@ -9,153 +9,153 @@ from fastNLP.core.samplers import RandomBatchSampler, BucketedBatchSampler from fastNLP.core.drivers.torch_driver.utils import replace_batch_sampler from tests.helpers.datasets.torch_data import TorchNormalDataset - -class TestReproducibleBatchSampler: - # TODO 拆分测试,在这里只测试一个东西 - def test_torch_dataloader_1(self): - import torch - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - next(iter_dataloader) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, - "sampler_type": "RandomBatchSampler"} - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 改变 batch_size; - after_batch_size = 3 - dataloader = DataLoader(dataset, batch_size=after_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 断点重训的第二轮是否是一个完整的 dataloader; - # 先把断点重训所在的那一个 epoch 跑完; - begin_idx = 27 - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - # 开始新的一轮; - begin_idx = 0 - iter_dataloader = iter(dataloader) - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - def test_torch_dataloader_2(self): - # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 将一轮的所有数据保存下来,看是否恢复的是正确的; - all_supposed_data = [] - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - all_supposed_data.extend(next(iter_dataloader).tolist()) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 先把这一轮的数据过完; - pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] - while True: - try: - all_supposed_data.extend(next(iter_dataloader).tolist()) - except StopIteration: - break - assert all_supposed_data == list(pre_index_list) - - # 重新开启新的一轮; - for _ in range(3): - iter_dataloader = iter(dataloader) - res = [] - while True: - try: - res.append(next(iter_dataloader)) - except StopIteration: - break - - def test_3(self): - import torch - from torch.utils.data import DataLoader - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - - for idx, data in enumerate(dataloader): - if idx > 3: - break - - iterator = iter(dataloader) - for each in iterator: - pass +# +# class TestReproducibleBatchSampler: +# # TODO 拆分测试,在这里只测试一个东西 +# def test_torch_dataloader_1(self): +# import torch +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# next(iter_dataloader) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, +# "sampler_type": "RandomBatchSampler"} +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 改变 batch_size; +# after_batch_size = 3 +# dataloader = DataLoader(dataset, batch_size=after_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 断点重训的第二轮是否是一个完整的 dataloader; +# # 先把断点重训所在的那一个 epoch 跑完; +# begin_idx = 27 +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# # 开始新的一轮; +# begin_idx = 0 +# iter_dataloader = iter(dataloader) +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# def test_torch_dataloader_2(self): +# # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 将一轮的所有数据保存下来,看是否恢复的是正确的; +# all_supposed_data = [] +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 先把这一轮的数据过完; +# pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] +# while True: +# try: +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# except StopIteration: +# break +# assert all_supposed_data == list(pre_index_list) +# +# # 重新开启新的一轮; +# for _ in range(3): +# iter_dataloader = iter(dataloader) +# res = [] +# while True: +# try: +# res.append(next(iter_dataloader)) +# except StopIteration: +# break +# +# def test_3(self): +# import torch +# from torch.utils.data import DataLoader +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# +# for idx, data in enumerate(dataloader): +# if idx > 3: +# break +# +# iterator = iter(dataloader) +# for each in iterator: +# pass class DatasetWithVaryLength: From 6da627d4ceb2103046b374ca9c4d76d8e627469e Mon Sep 17 00:00:00 2001 From: lxr-tech <1838593642@qq.com> Date: Sat, 30 Apr 2022 23:29:12 +0800 Subject: [PATCH 8/9] modify-fastnlp_tutorial_0-lxr-220430 --- fastNLP/core/metrics/accuracy.py | 4 +- .../metrics/classify_f1_pre_rec_metric.py | 4 +- tutorials/fastnlp_tutorial_0.ipynb | 701 +++++++----------- .../figures/T0-fig-trainer-and-evaluator.png | Bin 0 -> 104863 bytes 4 files changed, 255 insertions(+), 454 deletions(-) create mode 100644 tutorials/figures/T0-fig-trainer-and-evaluator.png diff --git a/fastNLP/core/metrics/accuracy.py b/fastNLP/core/metrics/accuracy.py index d9ccb332..0869d8c8 100644 --- a/fastNLP/core/metrics/accuracy.py +++ b/fastNLP/core/metrics/accuracy.py @@ -28,7 +28,7 @@ class Accuracy(Metric): def get_metric(self) -> dict: r""" - get_metric 函数将根据 evaluate 函数累计的评价指标统计量来计算最终的评价结果. + get_metric 函数将根据 update 函数累计的评价指标统计量来计算最终的评价结果. :return dict evaluate_result: {"acc": float} """ @@ -37,7 +37,7 @@ class Accuracy(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py index 2c71602d..8de007ce 100644 --- a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py +++ b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py @@ -56,7 +56,7 @@ class ClassifyFPreRecMetric(Metric): def get_metric(self) -> dict: r""" - get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. + get_metric函数将根据update函数累计的评价指标统计量来计算最终的评价结果. 
:return dict evaluate_result: {"acc": float} """ @@ -117,7 +117,7 @@ class ClassifyFPreRecMetric(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/tutorials/fastnlp_tutorial_0.ipynb b/tutorials/fastnlp_tutorial_0.ipynb index 01913ac0..28fcfddf 100644 --- a/tutorials/fastnlp_tutorial_0.ipynb +++ b/tutorials/fastnlp_tutorial_0.ipynb @@ -15,15 +15,15 @@ "\n", "    1.3   trainer 内部初始化 evaluater\n", "\n", - "  2   使用 trainer 训练模型\n", + "  2   使用 fastNLP 0.8 搭建 argmax 模型\n", "\n", - "    2.1   argmax 模型实例\n", + "    2.1   trainer_step 和 evaluator_step\n", "\n", - "    2.2   trainer 的参数匹配\n", + "    2.2   trainer 和 evaluator 的参数匹配\n", "\n", - "    2.3   trainer 的实际使用 \n", + "    2.3   一个实际案例:argmax 模型\n", "\n", - "  3   使用 evaluator 评测模型\n", + "  3   使用 fastNLP 0.8 训练 argmax 模型\n", " \n", "    3.1   trainer 外部初始化的 evaluator\n", "\n", @@ -50,21 +50,21 @@ "\n", "```python\n", "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " optimizers=optimizer,\n", + " model=model, # 模型基于 torch.nn.Module\n", + " train_dataloader=train_dataloader, # 加载模块基于 torch.utils.data.DataLoader \n", + " optimizers=optimizer, # 优化模块基于 torch.optim.*\n", "\t...\n", - "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdriver=\"torch\", # 使用 pytorch 模块进行训练 \n", + "\tdevice='cuda', # 使用 GPU:0 显卡执行训练\n", "\t...\n", ")\n", "...\n", "evaluator = Evaluator(\n", - " model=model,\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} \n", + " model=model, # 模型基于 torch.nn.Module\n", + " dataloaders=evaluate_dataloader, # 加载模块基于 torch.utils.data.DataLoader\n", + " metrics={'acc': Accuracy()}, # 测评方法使用 fastNLP.core.metrics.Accuracy \n", " ...\n", - " driver=trainer.driver,\n", + " driver=trainer.driver, # 保持同 trainer 的 driver 一致\n", "\tdevice=None,\n", " ...\n", ")\n", @@ -88,7 +88,7 @@ "\n", "注:在同一脚本中,`Trainer`和`Evaluator`使用的`driver`应当保持一致\n", "\n", - "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误。" + "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误" ] }, { @@ -109,10 +109,10 @@ " optimizers=optimizer,\n", "\t...\n", "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdevice='cuda',\n", "\t...\n", - " evaluate_dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()},\n", + " evaluate_dataloaders=evaluate_dataloader, # 传入参数 evaluator_dataloaders\n", + " metrics={'acc': Accuracy()}, # 传入参数 metrics\n", "\t...\n", ")\n", "```" @@ -123,7 +123,7 @@ "id": "0c9c7dda", "metadata": {}, "source": [ - "## 2. 使用 trainer 训练模型" + "## 2. 
argmax 模型的搭建实例" ] }, { @@ -131,71 +131,41 @@ "id": "524ac200", "metadata": {}, "source": [ - "### 2.1 argmax 模型实例\n", + "### 2.1 trainer_step 和 evaluator_step\n", "\n", - "本节将通过训练`argmax`模型,简单介绍如何`Trainer`模块的使用方式\n", + "在`fastNLP 0.8`中,使用`pytorch.nn.Module`搭建需要训练的模型,在搭建模型过程中,除了\n", "\n", - "  使用`pytorch`定义`argmax`模型,输入一组固定维度的向量,输出其中数值最大的数的索引\n", - "\n", - "  除了添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5314482b", - "metadata": { - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "class ArgMaxModel(nn.Module):\n", - " def __init__(self, num_labels, feature_dimension):\n", - " super(ArgMaxModel, self).__init__()\n", - " self.num_labels = num_labels\n", - "\n", - " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", - " self.ac1 = nn.ReLU()\n", - " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", - " self.ac2 = nn.ReLU()\n", - " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", - " self.loss_fn = nn.CrossEntropyLoss()\n", + "  添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法\n", + "***\n", + "```python\n", + "class Model(torch.nn.Module):\n", + " def __init__(self):\n", + " super(Model, self).__init__()\n", + " self.loss_fn = torch.nn.CrossEntropyLoss()\n", + " pass\n", "\n", " def forward(self, x):\n", - " x = self.ac1(self.linear1(x))\n", - " x = self.ac2(self.linear2(x))\n", - " x = self.output(x)\n", - " return x\n", + " pass\n", "\n", " def train_step(self, x, y):\n", - " x = self(x)\n", - " return {\"loss\": self.loss_fn(x, y)}\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", "\n", " def evaluate_step(self, x, y):\n", - " x = self(x)\n", - " x = torch.max(x, dim=-1)[1]\n", - " return {\"pred\": x, \"target\": y}" - ] - }, - { - "cell_type": "markdown", - "id": "ca897322", - "metadata": {}, - "source": [ + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}\n", + "```\n", + "***\n", "在`fastNLP 0.8`中,**函数`train_step`是`Trainer`中参数`train_fn`的默认值**\n", "\n", - "  由于,在`Trainer`训练时,**`Trainer`通过参数`_train_fn_`对应的模型方法获得当前数据批次的损失值**\n", + "  由于,在`Trainer`训练时,**`Trainer`通过参数`train_fn`对应的模型方法获得当前数据批次的损失值**\n", "\n", "  因此,在`Trainer`训练时,`Trainer`首先会寻找模型是否定义了`train_step`这一方法\n", "\n", "    如果没有找到,那么`Trainer`会默认使用模型的`forward`函数来进行训练的前向传播过程\n", "\n", - "注:在`fastNLP 0.8`中,`Trainer`要求模型通过`train_step`来返回一个字典,将损失值作为`loss`的键值\n", + "注:在`fastNLP 0.8`中,**`Trainer`要求模型通过`train_step`来返回一个字典**,**满足如`{\"loss\": loss}`的形式**\n", "\n", "  此外,这里也可以通过传入`Trainer`的参数`output_mapping`来实现高度化的定制,具体请见这一note(???)\n", "\n", @@ -205,7 +175,11 @@ "\n", "  从用户角度,模型通过`evaluate_step`方法来返回一个字典,内容与传入`Evaluator`的`metrics`一致\n", "\n", - "" + "  从模块角度,该字典的键值和`metric`中的`update`函数的签名一致,这样的机制在传参时被称为“**参数匹配**”\n", + "\n", + "***\n", + "\n", + "![fastNLP 0.8 中,Trainer 和 Evaluator 的关系图](./figures/T0-fig-trainer-and-evaluator.png)" ] }, { @@ -213,13 +187,52 @@ "id": "fb3272eb", "metadata": {}, "source": [ - "### 2.2 trainer 的参数匹配\n", + "### 2.2 trainer 和 evaluator 的参数匹配\n", + "\n", + "在`fastNLP 0.8`中,参数匹配涉及到两个方面,分别是在\n", + "\n", + "  一方面,**在模型的前向传播中**,**`dataloader`向`train_step`或`evaluate_step`函数传递`batch`**\n", + "\n", + "  另方面,**在模型的评测过程中**,**`evaluate_dataloader`向`metric`的`update`函数传递`batch`**\n", "\n", - "`fastNLP 
0.8`中的参数匹配涉及到两个方面,一是在模型训练或者评测的前向传播过程中,如果从`dataloader`中出来一个`batch`的数据是一个字典,那么我们会查看模型的`train_step`和`evaluate_step`方法的参数签名,然后对于每一个参数,我们会根据其名字从 batch 这一字典中选择出对应的数据传入进去。例如在接下来的定义`Dataset`的部分,注意`ArgMaxDatset`的`__getitem__`方法,您可以通过在`Trainer`和`Evaluator`中设置参数 `model_wo_auto_param_call`来关闭这一行为。当您关闭了这一行为后,我们会将`batch`直接传给您的`train_step`、`evaluate_step`或者 `forward`函数。\n", + "对于前者,在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`False`时\n", "\n", - "二是在传入`Trainer`或者`Evaluator metrics`后,我们会在需要评测的时间点主动调用`metrics`来对`evaluate_dataloaders`进行评测,这一功能主要就是通过对`metrics`的`update`方法和一个`batch`的数据进行参数评测实现的。首先需要明确的是一个 metric 的计算通常分为 `update` 和 `get_metric`两步,其中`update`表示更新一个`batch`的评测数据,`get_metric` 表示根据已经得到的评测数据计算出最终的评测值,例如对于 `Accuracy`来说,其在`update`的时候会更新一个`batch`计算正确的数量 right_num 和计算错误的数量 total_num,最终在 `get_metric` 时返回评测值`right_num / total_num`。\n", + "    **`fastNLP 0.8`要求`dataloader`生成的每个`batch`**,**满足如`{\"x\": x, \"y\": y}`的形式**\n", + "\n", + "  同时,`fastNLP 0.8`会查看模型的`train_step`和`evaluate_step`方法的参数签名,并为对应参数传入对应数值\n", + "\n", + "    **字典形式的定义**,**对应在`Dataset`定义的`__getitem__`方法中**,例如下方的`ArgMaxDatset`\n", + "\n", + "  而在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`True`时\n", + "\n", + "    `fastNLP 0.8`会将`batch`直接传给模型的`train_step`、`evaluate_step`或`forward`函数\n", + "***\n", + "```python\n", + "class Dataset(torch.utils.data.Dataset):\n", + " def __init__(self, x, y):\n", + " self.x = x\n", + " self.y = y\n", + "\n", + " def __len__(self):\n", + " return len(self.x)\n", + "\n", + " def __getitem__(self, item):\n", + " return {\"x\": self.x[item], \"y\": self.y[item]}\n", + "```\n", + "***\n", + "对于后者,首先要明确,在`Trainer`和`Evaluator`中,`metrics`的计算分为`update`和`get_metric`两步\n", "\n", - "因为`fastNLP 0.8`的`metrics`是自动计算的(只需要传给`Trainer`或者`Evaluator`),因此其一定依赖于参数匹配。对于从`evaluate_dataloader`中生成的一个`batch`的数据,我们会查看传给 `Trainer`(最终是传给`Evaluator`)和`Evaluator`的每一个`metric`,然后查看其`update`函数的函数签名,然后根据每一个参数的名字从`batch`字典中选择出对应的数据传入进去。" + "    **`update`函数**,**针对一个`batch`的预测结果**,计算其累计的评价指标\n", + "\n", + "    **`get_metric`函数**,**统计`update`函数累计的评价指标**,来计算最终的评价结果\n", + "\n", + "  例如对于`Accuracy`来说,`update`函数会更新一个`batch`的正例数量`right_num`和负例数量`total_num`\n", + "\n", + "    而`get_metric`函数则会返回所有`batch`的评测值`right_num / total_num`\n", + "\n", + "  在此基础上,**`fastNLP 0.8`要求`evaluate_dataloader`生成的每个`batch`传递给对应的`metric`**\n", + "\n", + "    **以`{\"pred\": y_pred, \"target\": y_true}`的形式**,对应其`update`函数的函数签名" ] }, { @@ -227,9 +240,65 @@ "id": "f62b7bb1", "metadata": {}, "source": [ - "### 2.3 trainer的实际使用\n", + "### 2.3 一个实际案例:argmax 模型\n", "\n", - "接下来我们创建用于训练的 dataset,其接受三个参数:数据维度、数据量和随机数种子,生成指定数量的维度为 `feature_dimension` 向量,而每一个向量的标签就是该向量中最大值的索引。" + "下文将通过训练`argmax`模型,简单介绍如何`Trainer`模块的使用方式\n", + "\n", + "  首先,使用`pytorch.nn.Module`定义`argmax`模型,目标是输入一组固定维度的向量,输出其中数值最大的数的索引" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5314482b", + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "class ArgMaxModel(nn.Module):\n", + " def __init__(self, num_labels, feature_dimension):\n", + " super(ArgMaxModel, self).__init__()\n", + " self.num_labels = num_labels\n", + "\n", + " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", + " self.ac1 = nn.ReLU()\n", + " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", + " self.ac2 = nn.ReLU()\n", + " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", + " self.loss_fn = nn.CrossEntropyLoss()\n", + "\n", + " def forward(self, x):\n", 
+ " pred = self.ac1(self.linear1(x))\n", + " pred = self.ac2(self.linear2(pred))\n", + " pred = self.output(pred)\n", + " return pred\n", + "\n", + " def train_step(self, x, y):\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", + "\n", + " def evaluate_step(self, x, y):\n", + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}" + ] + }, + { + "cell_type": "markdown", + "id": "71f3fa6b", + "metadata": {}, + "source": [ + "  接着,使用`torch.utils.data.Dataset`定义`ArgMaxDataset`数据集\n", + "\n", + "    数据集包含三个参数:维度`feature_dimension`、数据量`data_num`和随机种子`seed`\n", + "\n", + "    数据及初始化是,自动生成指定维度的向量,并为每个向量标注出其中最大值的索引作为预测标签" ] }, { @@ -245,7 +314,7 @@ "source": [ "from torch.utils.data import Dataset\n", "\n", - "class ArgMaxDatset(Dataset):\n", + "class ArgMaxDataset(Dataset):\n", " def __init__(self, feature_dimension, data_num=1000, seed=0):\n", " self.num_labels = feature_dimension\n", " self.feature_dimension = feature_dimension\n", @@ -269,7 +338,9 @@ "id": "2cb96332", "metadata": {}, "source": [ - "现在准备好数据和模型。" + "  然后,根据`ArgMaxModel`类初始化模型实例,保持输入维度`feature_dimension`和输出标签数量`num_labels`一致\n", + "\n", + "    再根据`ArgMaxDataset`类初始化两个数据集实例,分别用来模型测试和模型评测,数据量各1000笔" ] }, { @@ -283,16 +354,10 @@ }, "outputs": [], "source": [ - "from torch.utils.data import DataLoader\n", - "\n", - "train_dataset = ArgMaxDatset(feature_dimension=10, data_num=1000)\n", - "evaluate_dataset = ArgMaxDatset(feature_dimension=10, data_num=100)\n", - "\n", - "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", - "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)\n", + "model = ArgMaxModel(num_labels=10, feature_dimension=10)\n", "\n", - "# num_labels 设置为 10,与 feature_dimension 保持一致,因为我们是预测十个位置中哪一个的概率最大。\n", - "model = ArgMaxModel(num_labels=10, feature_dimension=10)" + "train_dataset = ArgMaxDataset(feature_dimension=10, data_num=1000)\n", + "evaluate_dataset = ArgMaxDataset(feature_dimension=10, data_num=100)" ] }, { @@ -300,12 +365,33 @@ "id": "4e7d25ee", "metadata": {}, "source": [ - "将优化器也定义好。" + "  此外,使用`torch.utils.data.DataLoader`初始化两个数据加载模块,批量大小同为8,分别用于训练和测评" ] }, { "cell_type": "code", "execution_count": 4, + "id": "363b5b09", + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", + "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)" + ] + }, + { + "cell_type": "markdown", + "id": "c8d4443f", + "metadata": {}, + "source": [ + "  最后,使用`torch.optim.SGD`初始化一个优化模块,基于随机梯度下降法" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "dc28a2d9", "metadata": { "pycharm": { @@ -321,15 +407,33 @@ }, { "cell_type": "markdown", - "id": "4f1fba81", + "id": "eb8ca6cf", + "metadata": {}, + "source": [ + "## 3. 使用 fastNLP 0.8 训练 argmax 模型\n", + "\n", + "### 3.1 trainer 外部初始化的 evaluator" + ] + }, + { + "cell_type": "markdown", + "id": "55145553", "metadata": {}, "source": [ - "现在万事俱备,开始使用 Trainer 进行训练!" 
+ "通过从`fastNLP`库中导入`Trainer`类,初始化`trainer`实例,对模型进行训练\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`train_dataloader`、优化模块`optimizer`\n", + "\n", + "  通过`progress_bar`设定进度条格式,默认为`\"auto\"`,此外还有`\"rich\"`、`\"raw\"`和`None`\n", + "\n", + "    但对于`\"auto\"`和`\"rich\"`格式,训练结束后进度条会不显示(???)\n", + "\n", + "  通过`n_epochs`设定优化迭代轮数,默认为20;全部`Trainer`的全部变量与函数可以通过`dir(trainer)`查询" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "b51b7a2d", "metadata": { "pycharm": { @@ -349,167 +453,20 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_check_callback_called_legality',\n", - " '_check_train_batch_loop_legality',\n", - " '_custom_callbacks',\n", - " '_driver',\n", - " '_evaluate_dataloaders',\n", - " '_fetch_matched_fn_callbacks',\n", - " '_set_num_eval_batch_per_dl',\n", - " '_train_batch_loop',\n", - " '_train_dataloader',\n", - " '_train_step',\n", - " '_train_step_signature_fn',\n", - " 'accumulation_steps',\n", - " 'add_callback_fn',\n", - " 'backward',\n", - " 'batch_idx_in_epoch',\n", - " 'batch_step_fn',\n", - " 'callback_manager',\n", - " 'check_batch_step_fn',\n", - " 'cur_epoch_idx',\n", - " 'data_device',\n", - " 'dataloader',\n", - " 'device',\n", - " 'driver',\n", - " 'driver_name',\n", - " 'epoch_validate',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_dataloaders',\n", - " 'evaluate_every',\n", - " 'evaluate_fn',\n", - " 'evaluator',\n", - " 'extract_loss_from_outputs',\n", - " 'fp16',\n", - " 'get_no_sync_context',\n", - " 'global_forward_batches',\n", - " 'has_checked_train_batch_loop',\n", - " 'input_mapping',\n", - " 'kwargs',\n", - " 'larger_better',\n", - " 'load',\n", - " 'load_model',\n", - " 'marker',\n", - " 'metrics',\n", - " 'model',\n", - " 'model_device',\n", - " 'monitor',\n", - " 'move_data_to_device',\n", - " 'n_epochs',\n", - " 'num_batches_per_epoch',\n", - " 'on',\n", - " 'on_after_backward',\n", - " 'on_after_optimizers_step',\n", - " 'on_after_trainer_initialized',\n", - " 'on_after_zero_grad',\n", - " 'on_before_backward',\n", - " 'on_before_optimizers_step',\n", - " 'on_before_zero_grad',\n", - " 'on_exception',\n", - " 'on_fetch_data_begin',\n", - " 'on_fetch_data_end',\n", - " 'on_load_checkpoint',\n", - " 'on_load_model',\n", - " 'on_sanity_check_begin',\n", - " 'on_sanity_check_end',\n", - " 'on_save_checkpoint',\n", - " 'on_save_model',\n", - " 'on_train_batch_begin',\n", - " 'on_train_batch_end',\n", - " 'on_train_begin',\n", - " 'on_train_end',\n", - " 'on_train_epoch_begin',\n", - " 'on_train_epoch_end',\n", - " 'on_validate_begin',\n", - " 'on_validate_end',\n", - " 'optimizers',\n", - " 'output_mapping',\n", - " 'run',\n", - " 'save',\n", - " 'save_model',\n", - " 'set_grad_to_none',\n", - " 'state',\n", - " 'step',\n", - " 'step_validate',\n", - " 'total_batches',\n", - " 'train_batch_loop',\n", - " 'train_dataloader',\n", - " 'train_fn',\n", - " 'train_step',\n", - " 'trainer_state',\n", - " 
'zero_grad']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "from fastNLP import Trainer\n", "\n", - "# 定义一个 Trainer\n", "trainer = Trainer(\n", " model=model,\n", - " driver=\"torch\", # 使用 pytorch 进行训练\n", - " device=0, # 使用 GPU:0\n", + " driver=\"torch\",\n", + " device='cuda',\n", " train_dataloader=train_dataloader,\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch\n", - " progress_bar=\"rich\"\n", - ")\n", - "dir(trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f8fe9c32", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FullArgSpec(args=['self', 'num_train_batch_per_epoch', 'num_eval_batch_per_dl', 'num_eval_sanity_batch', 'resume_from', 'resume_training', 'catch_KeyboardInterrupt'], varargs=None, varkw=None, defaults=(-1, -1, 2, None, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'num_train_batch_per_epoch': , 'num_eval_batch_per_dl': , 'num_eval_sanity_batch': , 'resume_from': , 'resume_training': })\n" - ] - } - ], - "source": [ - "import inspect \n", - "\n", - "print(inspect.getfullargspec(trainer.run))" + " n_epochs=10, # 设定迭代轮数 \n", + " progress_bar=\"auto\" # 设定进度条格式\n", + ")" ] }, { @@ -517,16 +474,20 @@ "id": "6e202d6e", "metadata": {}, "source": [ - "没有问题,那么开始真正的训练!" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_train_batch_per_epoch`决定每个`epoch`运行多少个`batch`后停止,默认全部\n", + "\n", + "  此外,可以通过`inspect.getfullargspec(trainer.run)`查询`run`函数的全部参数列表" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "ba047ead", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -585,29 +546,27 @@ "trainer.run()" ] }, - { - "cell_type": "markdown", - "id": "eb8ca6cf", - "metadata": {}, - "source": [ - "## 3. 
使用 evaluator 评测模型" - ] - }, { "cell_type": "markdown", "id": "c16c5fa4", "metadata": {}, "source": [ - "模型训练好了我们开始使用 Evaluator 进行评测,查看效果怎么样吧。" + "通过从`fastNLP`库中导入`Evaluator`类,初始化`evaluator`实例,对模型进行评测\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`evaluate_dataloader`\n", + "\n", + "  需要注意的是评测方法`metrics`,设定为形如`{'acc': fastNLP.core.metrics.Accuracy()}`的字典\n", + "\n", + "  类似地,也可以通过`progress_bar`限定进度条格式,默认为`\"auto\"`" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "id": "1c6b6b36", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], @@ -617,100 +576,32 @@ "\n", "evaluator = Evaluator(\n", " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", + " driver=trainer.driver, # 需要使用 trainer 已经启动的 driver\n", " device=None,\n", " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", + " metrics={'acc': Accuracy()} # 需要严格使用此种形式的字典\n", ")" ] }, { - "cell_type": "code", - "execution_count": 11, - "id": "257061df", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_dist_sampler',\n", - " '_evaluate_batch_loop',\n", - " '_evaluate_step',\n", - " '_evaluate_step_signature_fn',\n", - " '_metric_wrapper',\n", - " '_metrics',\n", - " 'dataloaders',\n", - " 'device',\n", - " 'driver',\n", - " 'evaluate_batch_loop',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_fn',\n", - " 'evaluate_step',\n", - " 'finally_progress_bar',\n", - " 'get_dataloader_metric',\n", - " 'input_mapping',\n", - " 'metrics',\n", - " 'metrics_wrapper',\n", - " 'model',\n", - " 'model_use_eval_mode',\n", - " 'move_data_to_device',\n", - " 'output_mapping',\n", - " 'progress_bar',\n", - " 'remove_progress_bar',\n", - " 'reset',\n", - " 'run',\n", - " 'separator',\n", - " 'start_progress_bar',\n", - " 'update',\n", - " 'update_progress_bar',\n", - " 'verbose']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "id": "8157bb9b", + "metadata": {}, "source": [ - "dir(evaluator)" + "通过使用`Evaluator`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_eval_batch_per_dl`决定每个`evaluate_dataloader`运行多少个`batch`停止,默认全部\n", + "\n", + "  最终,输出形如`{'acc#acc': acc}`的字典,中间的进度条会在运行结束后丢弃掉(???)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "f7cb0165", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -750,11 +641,11 @@ { "data": { "text/html": [ - "
{'acc#acc': 0.3}\n",
+       "
{'acc#acc': 0.43}\n",
        "
\n" ], "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\n" + "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.43\u001b[0m\u001b[1m}\u001b[0m\n" ] }, "metadata": {}, @@ -763,10 +654,10 @@ { "data": { "text/plain": [ - "{'acc#acc': 0.3}" + "{'acc#acc': 0.43}" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -780,39 +671,37 @@ "id": "dd9f68fa", "metadata": {}, "source": [ - "## 4. 在 trainer 中加入 metric 来自动评测;" - ] - }, - { - "cell_type": "markdown", - "id": "ca97c9a4", - "metadata": {}, - "source": [ - "现在我们尝试在训练过程中进行评测。" + "### 3.2 trainer 内部初始化的 evaluator \n", + "\n", + "通过在初始化`trainer`实例时加入`evaluate_dataloaders`和`metrics`,可以实现在训练过程中进行评测\n", + "\n", + "  通过`progress_bar`同时设定训练和评估进度条格式,训练结束后进度条会不显示(???)\n", + "\n", + "  **通过`evaluate_every`设定评估频率**,可以为负数、正数或者函数:\n", + "\n", + "    **为负数时**,**表示每隔几个`epoch`评估一次**;**为正数时**,**则表示每隔几个`batch`评估一次**" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "183c7d19", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], "source": [ - "# 重新定义一个 Trainer\n", - "\n", "trainer = Trainer(\n", " model=model,\n", - " driver=trainer.driver, # 因为我们是在同一脚本中,因此这里的 driver 同样需要重用;\n", + " driver=trainer.driver, # 因为是在同个脚本中,这里的 driver 同样需要重用\n", " train_dataloader=train_dataloader,\n", " evaluate_dataloaders=evaluate_dataloader,\n", " metrics={'acc': Accuracy()},\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch;\n", - " evaluate_every=-1, # 表示每一个 epoch 的结束会进行 evaluate;\n", + " n_epochs=10, \n", + " evaluate_every=-1, # 表示每个 epoch 的结束进行评估\n", ")" ] }, @@ -821,16 +710,18 @@ "id": "714cc404", "metadata": {}, "source": [ - "再次训练。" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  还可以通过参数`num_eval_sanity_batch`决定每次训练前运行多少个`evaluate_batch`进行评测,默认为2" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "2e4daa2c", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -884,96 +775,6 @@ "source": [ "trainer.run()" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "eabda5eb", - "metadata": {}, - "outputs": [], - "source": [ - "evaluator = Evaluator(\n", - " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a310d157", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{'acc#acc': 0.5}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.5\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "{'acc#acc': 0.5}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluator.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1ef78f0", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/figures/T0-fig-trainer-and-evaluator.png b/tutorials/figures/T0-fig-trainer-and-evaluator.png new file mode 100644 index 0000000000000000000000000000000000000000..a98ab83b48b29a07ba450f077a95fc5fcf5a8659 GIT binary patch literal 104863 zcmd43Ra{nEvfZaD^K@VC z?>;GO%{Av7V~+Tbm|u{Ll<-T0*9Z_05HCeV1mqwfo?$^iKr_KX0Z$C0WicQi2q8oT zJ}S6C?l-@1P*8T;a_?yG(!oEh{Q}8DO4MMg=Ct-P+G)gOA&QfckJ)y+nw>-$>Me9- zRVG=^5ySQGS)-hn2 z;j~x+EpzAXZcm7_pUZH08&`;J?N<>jKj6jYdwVWFICcH}`K=2Q5{?%F3dIKkh7jV< ze|=s;>03->ApHB}pO?XHtRJ3+^V*?)&qfi5kMdvVupw+&qH>FIJIePxM1=uf_Ll&E`JG@=$pX4Ld{JCPNJFv0@{ z@z|r?exb?r^kO|329uvK7iL|P#A+u2saNOs@Lgbd!Gm8S0Wh2bMgXygsPB4rvPk)C z?dE(>J2`=3Tp#}*Y6wN)A~dN)@n0G70W^*<25n&SROfGpV9}=Wct4Cw^Lzn_{J;r$ zeQ0`xu<+$KnSNpr7fyj`H7+fa#+x_vh-p8F-#Mx{52w8lDu5j4aDeDzj z;zVGk5yAr$NB<uq{h@Nr2l&K#|mQNQJ;)*c?-J{Q6k{%TuoZoj!}0( zhUB%pm=6cL2WPLUO^}_72hKE1)sh*c9m*C-@-p--B|n%_Ae5I27A><*JX(~@SG34f z#atmyOlzL!PF2%uV>q90PVbVBNe?DE&f|$eU#9L%AY8<#AYSYc{WM(Wpw?u2Ie)$c zrBtr0ax9Sh-8ipcc+UA+IJ zqt6k923X#J(})TJ&NVR&vlc9}BCnxluO;(}Y7#MtJXaM?Z7nBLRLu~iF^ZG9G*b#5@|8FDcp#LrTTk2dU7b9}&Co^jvrzO=tP;l}Zz$gKJBJf zvgCI%mFj?wCX0Zg2Zc2KkWCG9jIjpFX{6FKE0V)Kf1I=<`d38_?H+4C7a4ykI+=Fw z1k};0wq9-Ql$Km)@WCyR=ls@6-)v%^V5!3HFBQLu<@J&mu2Hi8$*Wy3FUy(NWNMzB zrqT$}k*+a5vyelbU*@HTYZAM?hi`|~QLhQt{G5jM}Ctl*9S1V*ZqbaDgu6awiWk427f@j}VXGw!)& zre=fMl}H^vhHbK6B#Doc`g=%y2ML}oD8?Xrtn9Wp$FVtCdK2?9x*YZyYf=i5ft0!9 zxmrgJk)|(%ER{z;9?6*58;s6}q;AH>u~Z3;2P36vU44`k&&$$V%%J{4p2not@9wlO zckef_H393Hw1P%EKgXEwe=>PcVqRFxEb?*@Fud#Cq<$r`J!u&zx57N0XyojNn_2&)OnmY!wO zB^DbC4Phz&nz|iHCX)EiHeaLg3g&(1Hwp%e1e%{u6K6??o7~mhYq(SC%%!WYpS_}$|^SJ!HD_$o%}QVkE0(+3KIH`n#rW! 
z+C`}y0Jb0+3ZL+lt&G+%yqhdFo%2+oXN8_|flaaOy11GLlGGpPtLRu!>vrhidhCA1 zQ~&+J2zy5DTS6PCD)1J+$E)DHIPAUzXQOUAA&{0%omeoMUUPE?%Xa}7zZ<1VWbs3g zJz`-NiJ{j2V~TqkUrA~ghky~>OhAsJ%JJ)Wuh<=gv+)~|m?I#^j99QmGa5WM3)>RR zxN~_59qNYMOLWAXA^=?e@(I2CX1AmDz?x*1IOf|pyTHD+%U3Mv<-Jy%G&_#_g!M9V zH0CNfpxtbE!eVbvX`#HIQQi~Eej((4PyWe95sZ@CrgUGLvWvZf(n7HVyCfW+V^(z( zM{ftxIn_t!xi`6$`o3IUbEYj>5w%oav80J}h`mvv|GC4^Pb8hED>*g-J)i~GGsel^ z{^uNgf5ywYA7%vbu<@D#?)s)Uc!l7(xH<0K&IoUhlYKe-F}A_Q0!*wsGh=U0t&*vu ziR6J2?Rho`-BDcBr#E6j&;m;U8q5EdT@EAgHWQw)u}X0!C#dqjE9x$XR-HhmR6HSM z@A}@)p!bLMntoM03W+ilMaIC_$OvY3=<93=*KsPSY=J^dnsp~1EEPd zt5PH(xQ*#Zvt0Elrhk;&VNT-W6#JM$O?=QP$u1<2 z=K9Yr5B2r+wd2?^C?rMN)iPJeZEyaZ9Iz@UA0^AMA^uM$5Q-`6uP8z73uc;8y!P)Y=Vtk2v+L+eogJy|kvF6%DKYdcyZ^Z2HwvWv0h&*Bt(suk-FbP`r>) z$LxwJ?AZ4l&B(e@Fm5aG~nRGpNK?(Y`SoWw4JAvpC(4ykMTa)v;{7 z3Nr^M)G-*r*%sN!dJj0^W-}t0&X{2fz~W?T_d6v@Ki{4onGBCL2tR`7bb|R!(|?Nl zV5mhq3(9W@9Td$aWOKk>=9_x!rJw5zmVm%?6*pLhwf@(eJQdRMqx~fUYH+qw;|0RT zO<_a0fqEQT-{Snb@D3sP#6yVP#6)#Fb}~7{|3kfR=>F4XR-Q>pTJ)-8^dd0M=91|< zn?P1zfE8Iiisp`XA?Q&*c7I-`SEVpa?}Qu}{QN=j`(z-;`A2zihARF#~Moya2i;f+3h#mv2sPLuk0@mhsN6PE>eA;?^S5pYHKH1qQ>MNe&sUA;4@h6eaYio zFn^`SO_aTBOeHb0aHnFf%{WAZioFUer}Fkx_*?_W;o_vh$Y5@2xzG=WTY?1%^)FIE zJGw{uu{0+HPuq-g_0B0RyKUegyZ68A1fy)2kdHQ!qbsq&s7ChrSNeohWRGaPuRi<>q^SDg(bx!V&@n_M&d9vyG@ zZ+<)qCM;h%!%JL&q~QM2=Tq$c5L>V#>7-82u~44Tgi(0mYe+=TLYvuSu5Zf^h@<|4 zZV>(tHaK?=MzD>;cAMMD`a+Fq`}KOblE}mP*_eokDMY7HPQ!9z|BD{h@P^IjFs~y%WrEue8C<8kz3s_n=1pkyJ(OMVst>O zpTv=vE_V!tZ@+3B(rO{*Z4gLh$ELoY!XdYEK5;S~V-v%j@RTn#~pRXllKb8^PN{g#_dPH&UY$TgB)R@%|e3+2AIxxaLO z5&K-VMea(v)WK|7YF8`f$}V|1_^w~5mTEU;`af?^Xtx>x*PjGOfP_ z)>@w@cz-oc5%lOghrcqSk+Ro8`PZPta0NWq2tTS5(p9psCWQ+E^YW*5VrQ3at4VS_ z5l-opaC31;IaVc}T*KriWmkvL?`mJRk1*#R6~czu9}|lqY`yvXo~Y%mVil@fNu9<| zGo^dU2{I3NfxCogy)=$8b-75x(=16uMEDdPa-%>E%<${EguFEN0=ox`B3%oz-a(ah zT%s4PQiF6(>%>2Hr%|AxqfXiWI|Ba>hF!SK^G)F}Z=r0_F#rVhZkJGDtA(i?F)+mx zJfmPCf?S?d{Og#L47T^Qw|-@};t=n$_t!Jdc9vhIu386P10qpWkJAgr~JLd{RHmO^pm zCDvMDodjwrgkXbc>hrDs(wduNTnpZ1a5iF+@6q9wOlo(%K zh_j>k0_{eGf7!-=RoNd?Mc5x^GB*+VMBO!Uu8YBRCtf8Hwa2^s*!$bz@QQ?dGIcgB z?%~;Wm_Vk{vjZkP_6r#zS9TH%4BPXMr z+Ojnb<@k<3hb+^h;B(eWT9wbg`f6i-!a5~j7E!-eL62;e~;CdH2vrv_ferpO(mZy zMjDg+6L|SYT1z|litp~gQG}N7>TE$N%kfgrQ`cv|X4L--(hl|+I{!9^<+_+YO%3cSI#b`WyifS zT&pb@we=k{rUz$@XboJYF{3tRWzAoe*0@(5xrY5%OUt%I5Pb%V;Rp3}qy54rhyx|~ zPQthpKFXF?l&)&A71gyL?a-b3)4)-YVb+`zAHoeEv8A6bA2F0C;~p!vCvSKnR;6K7 zwab#AGQ}ePwAMD#6kAZaGJfJFv1q$}Osc*)OxNT(JfRf-n2T*@yZrMI=;QIu`K&d& zY(-epvcBBH=3y5)^8+j*sA>jB7Pl1BSGRwK)^_@4U~MxJ75&P)yt?=b{_RxhZZ zPFzTus%eA@C|U|^vEMcMa;BllhR|5=5zxK>XEV&`e7yT)IyA5)*qVV!a^Ib&i1xV- zvW^^?naA216>#zP8lt3 zT=&iCoC>J=E*FNq+avMbw;7kGe9~r0cG7asVP8m!Z?oDStekmYRCXx&e!Xoa+`qav z*yd4RMfd%1#_ih1-~hyG13Qk9HW`PW9zXsZJ!4o0?kCYE-f0onWKB$MCMj#om@6sM zf~I|CXC*RbWW{|??%rk|Gi8&zYS*_=wb?HzUnjH%^liE3DP9R!+^I@k30 z`WZb9hyPp?zm@-jkzvSK=mVzmy@o;d%?coV*x5ndG!3b-ch}? 
zJz|4NCftC_Y1i+dPbG#9ki$Ll*N5&~~AFLc1Evryv#6?v-~i zkdax=S^mI#GV2>!qcwe@+T%JuS&fwKA-22n8B{NzU{r;Vc<_)e;$?a#o;O+++;|XXcqZKGYuGQc+9ii{?_b7{~izA3sNdM#LYizP0mWqQL1WxrYO#_FJXc z68#L0C1{qxoo3C}4_m(Z#g&f^E3ZGTi1Ke{uR(kP8S(;bd6PJ>~O-k?< z-K7^Ja$W&ur{nntZK`{RI&cN_~v*x_!LW_pb+l{G&jbfb*oYPN+U1#6FJ z2ed0ufkg}lFD*xu0y`l|>PI7ILuWm@$Gb7#Coj&3t-8i4=G>aNuS#*-*>{>(PO$!vE!7sjg%Lc?Mey?xVVu4Q5`R6F})_L zfk{dWWnK2bTSIyQS}uG-@ew(Vg+~h|>+*}OxWrbuvN@RSrB5LEj8)aQzOCGY1Qnwn zfT5liW3E~n_J>&?X?c_G`6PQrB=KnDny5V$v>nWRdW-D?-{fH1mNNy^36kzwfr=9l zVJ%vlMu^MJANwZEW{WyChP*O^KfWSPce}BWi8qI5=lhs;7VVx*-GKO;!GEDW8O9Z$ zLbrM1QmSZm96{@(<_iVoS?&yb#C+wkjs5USjPGkSZnA_XN~C_ywz%Dm z?D6`GI$)=8YQkg1BhD}3_pZPd$?&mz?zVnDq~&DhrFC(Lrf&`Yh2j3l_?6c|uf3)N zvXu3SxE%6}&ARaq1w>pW#K{&J7TsUub=-rOoYSljLrfitPx_d$Dy?5e>Mpjz~bXF5=b36%r%r2_Goawu$-@LZ8 zYFVeh1f4j)Vu8&hXL2_(-VP!&RRJ+A>77rvi7f0tO@(?_hJi2TR{o+F3&86-*BZ-X zx_GTSI6S~axfZ$UtKQos{Br(e{#Wa&&IJGD!$j z=e3Qy0N4-3lG?I;+=RW8_$L=&+p!=*( zHv3v;-%~{MwNlSMceGTN@1Wo|;J28OeJ3Le&)poex8`$<9sm3s^`m4hywbc<-f=jZ zC%*X~b0sdo4=`@C+<32jQ%#uo(EqevKP$3&9T6;)YARH-9lYPp*WGsasa5oP1v%To z&io{tgGBSOyH98yPnfcp@b`_@gZ+4cc82?gP zWQ2&&32>B5Iq*#(+eUAWE60X+E~P^5ezWi&o%ZV^gQ}qjHi*P`-?S{VC z8}%~#Dl#DlFcd#nxuL`4vF zj$2!4TR%(}D|ZK@ld>Bjh#cMyaQx^|o*<3ggZ*XG>wsm2Z0FIBI8 zGX)B4@5YB4C56J8unde%CaqiO)TG>8F@XO!N-`FVo7y3CKIku2{%%%^rZiILD!$p` zo%vVzWuN>QVI0RqlZlS77VX|N8*O0y;W4OhAHi3Vn=2LdirzsyKTv)fkB?bu`2zkk zv{#^?;OjW;Q}g6(yVGLnfLgDuO+b4H&&R5msIaI&<&NhF*W^%yvGwpcj|#Yii(2~4 zwHWmP6m#+>ah$-Duz;wg&Tn>4fH#sv;#%KoyuNS7$ZcqUZ-324q4*4!*!~A6g8z3! zHyxbF05lRR>fcO6mW2=!fI)5$f^;Ie5g1j}**!~!*k1(UMLDs`c%k`Xfl>;(ly9~c z;4h4Arw?6v#37ILo7$qUFfo|PbH!1(^)~t)Uy! zeK`i`;7L;E+(pI3{dreGg2_vNsJV5GE+jwlpP1EUGuh^Vb+mQ(47cG`?IX6BQbh6PGtwhZGVe&fGei45-NVE$>T7lRvND-Q9MG3|F z2#lS|Fz)*?ikwZUr|}#KPY;hbvO}PYdbJ2ATn;jfJHQWQ>R_+XZ--0_S~%y>e*aJ` z6|sA7_L$B}+dHwc@)*b}|8&j@^oN5%c$3_*qb%_}|9tRlVJA&&{}(N=LPguvYEQ!T zsm^1_C4S(y-#cJ2dqX=`DNlrc3uzN2ivxVaXJ%NuRveg^W2frg=Y(bIX7F_W{VMpv#$E`fX{tiJ2L+ye8aoBzU>i5;Fh?vTs_6ZZ#F- zu;E*yC}FpX)V%F3D5E0!)c)lp3>^Q*QZsXTAJzd|Z$37?&lXw37=C^{u+o^eM$*xm zu%|hZnoF-yUkbG1AF=#pCgTGV&{N_5`{nkPV|2vf*e}fhOrB}b?-k|)~<=>C6)P7-T2{<=@srFyen6t2p6!l-% z7$KOjt*xzW9}!JNzl;Iq`KrfuDBfY<62tU!{^9!HU&cU$&CNzdw9W`YBYgcqR7pvR zYOxR%At1!h%VslX>}yL=mL{YK|sc#P*;d708|pLv%w`f#Luw3`nlKrAg;oCJ+S zIKEcP1k?gdp6g!{4TnBj7ZgTm?=hytev=&m#(#8i9lr9_{7JTL(}s;p~G%DG^Wkxp~)@Z*u}LSln?uBVFlpbM8%tipg+vffHhz5v{eeX2uVlh z6wKQ6uyt_n-bdzAKr~m%vY0#2f!bATR38+*5=$Vxn4`1BM6ejrDzL>nRK#vgsJ~Io zAB*>~YuVRBX3C_(cSTp6$a&ajD|6(AA|IGIM~q&paSF9l45&Yp#U{;ttrOpbzKH`C z^i8%Krv3x~5n?*3nrJm@Ef%{-j~T}<^{DPnwt0r9YvDlIZk^VH^;as({(pIy3=Q#1 z?uOlXu2FT>c)zevm@CuWW9Z_5}EMRqK&y(fFdO^^t_Y7!t#)#!=qsQm*WTdoM+qb^3wRt_z z+sDBH`o6!>%fEr9LlI~%*SxiI(=grrs;!Mb_6yWEZJ_aE<*x{474Id&`enXU+kP|6_NHlO zeOqjh2Hl){Olg{;bN?BAWh{PZW7HSIZBAPoR|y+HlXvaoMa+eIn&epgPndb7axbQh ztQ0&qBWZx&ZMj+~WTKO99p4abR<}LIlto>tMO(K1qWQ0vUjNhqbp}={WMbPHx0~0y zzV{P^hDD)l^V&%Izz~5a5kokov`iY@<;v3=BuZ6VPt$mW-i57X}Jy4kZ1(5_W?)|v$d*AdYwy(9C zhS3Um#Br(f`P{kmXQ?a3*V8H~@68?V2vXSh)HXZ7sZT8a$Y@?>Acy&fs^6f1saSS` zB4O(UbgInBJ&_ECd95z~MIR7hYt~*GlHKLH`JSpaCG+80FLm+I0B$?Rxuj;}>vW<> zPor-3jH!ZheD~@x?-mC>&p#$fh#^!ycu879QdvZ%P+iZ#mUHj%iP;V~0srICd?SOR z9SWLkM4A!8?*1)i9?eHQKwPw_%E(UC?9Vz9)UK*!_jAAb-97H;q}^KNzl%i)py1#A zj0+RQX&U*`XZOtkg*Q~aVSVaomMZJ)<+mJSUebhzbe3h9m&#+n3`aQ^+Zk~b0a{}C z&C==qQ4P>A?3dg;Az1qvVZF+Rmn;#1qH&o0qcd=xt>;#U+D$RSv*(cYR8WFx!?D!K z1A|L_4f~bmQdOP>`kp!8g0Q3gKT6r(?U(hoJ<;v*8#dev3pt*_%yB#Ukog7uHz&o61H!UeowgQ4J$M_7Kk6$7s*W-xS6ld&^<_E~hYxI`CJ#bX z)Wz**ZkzSB9*)9jEw&HJe>QH;v{`_k-^C_wE|4%%VE)540CK1}ian3Bp~CSmyzO$o 
zh=g%#6Q9|SD)i=3uqGa5cCiYtCCu4FyAUPI2hb#ov}@ikm4gQNqdY6GuaO;}5T$Be zh5tb{Dhwzmz@pOV?vncVTBkGAft?#PCUrR~bL|2>n=IR6bX|`anU*RQGGjq~*SPfB zRpkO-2bUPOtQnSPx6(8jGU`im2Y}Rki&E+{rkq8^T@cUJ`Bir3x1p-JmXNlJ^gY$y zaG_FNeD5QW+S9R&Zy%80&?55=ws7t#sD#`e(MaU6Ma$m%!v!#j+x=+sKXBZQd6x-40RT($ScmruahD;U>e4d0FN} zZObzO>Lyp^yNJW|Go4bT;s&~oF5Sf@Xmpm{>W0?R+8Bh)B^M?qnQ4vNK5mh`vjrv@ z15GOBXAMSc%p}mGF5f7JC$+cs>*dJV+a_lQyMRrAbAUxl+nM2Z9tq3RG3F(GJ$-HM z9|<7CrNU*fu#Tr<>Im|c$FO!?d>Na&)+MJoP-JnoLebsOM?%4VmGngecZV}OR)6Vy z{h8UxC!n6zJAU<9Eddc7ehuKQWa?1nZy1TkMDzZuAgPF;w5g8deYg-84eiwgT{cLM z)G=E|C(l*-ejk|$*N8A*d6kl%5EyS+zeSR|+E&C%M-G`#YcD?NZn?bxX{nyn6y=4~>lsBdj&p)7Z8S9Rn$R7GxXmvZ%fnWFQLUTXgGl$B^eA!v>767k9Tie;J`VHtm_2x;n zD_=w$t53-wMk@YAeyljCxkw(Is0gnpn!h;9?|C_Sau+!TyNNn$v1MJ(sJ+OTR%f<* z%#@f(6CF1i_I6T#fboZ?9SzJJ>^>yZENKZN)#F4SH}!_4r8~CY5Po{A7^BCp-!>* zqWjyxNC}OeS<^TiM94qHq=XWaJ&NDE4x=9=q^`|4y5tUWh1lW_KxD%$&U*YyOh-YG zn%a+g-yJ_EYQG0Tk0JXQ-Y}0eWrW$vQ91e4&HC86HWw1#e6_ZCEal^{YOH2=051#) zpffXgHg4&mSA5T)4qr+Rk*zKFVsW?f_Ds?x;%QZ%;RkhYp!+Js3n$GA$PW;=%T1YZ zWu$r6IsoMs?$vE=7zFF)Q?x6_tg>gm(!NK%Uz<=Iu?tPMsyI|vXxzK(#l_Viq6)SN zf-rzMbh6BNg%ZaDy+J}nL(SkX8%gICW#8-v2>)X?KLdP%qB7b`LU^oL(W|xDgBvcF zP5PDZxUBn=Hp%h1Q*U7Fku+Ch*%D$SNQ5x4tiy0Ug znaYIo+g1HSjgxvo2YRi$&yMe{1#Ysuowp`Dv#uFQtcH|LCM1BB#F94$3oIm^s)|QY z13@!7_T_qbuc0F1G@C6fSyoOxVy)P)b+E2+<22Tewt5#~0}j#3V&x@XH>xOil4WC= zzK0zdh&>|5mieRip-!lON51-EN?<9((blZ59UmUf3=1{~Y7L;lOYViZ!YaJjKRqMj z-)GdDE*wdr{sp8eK>+7BY_c7qa!UDm$lE{XN_r& zSF~v=xJMh`RFM!27nn2$Yl`b_6UcFd4|*si?lvB%PMw)@5s38|S=Z1h&0scP&B)|w zfoztTNzCoDi28oqpE^%Dr-!HSi!d;Oq$dI^oMoV=r0F5xxjfHnbRhgzBoA`nPX6Kh zdJ|1O7U7@0qbRTufgvn2e|@XlHy@T=YXyMj(5A{=-EAQ6*&OflZejH|j6} zCwf*TD!OiqyWB2(_1+Q^dw4UF)DXq-p4Vf2OLM6_UwaoBjgBv`aP`57$zWmKRQ3Ynk43lC2Bs zkJ;+W@?Pr8z84-p&hWT2AzABr;(sEM&j7jNqBvWJs}8mIdQ}@bqlMGv=JCYzEUv^n zw@x&P=-Cwo#+L0QQ$?pR_;F}2u@>pblZ$u!)?(QTp*D@71@CK`mjM1N>*8$io?1|y z!ueTBMp?_?i2Hhvg1uE&KCJ54;c`8Yh@)OwV7W;j#J=3VQieNMfuk%JB1e+mo-%8w@QYj#6-xT9M}YY!PBsNpz}$~~M*a7-J6E2>L>y-xX!W34 z7EJ=PwQs~TL$v;kKe5|kOoFsL$VWsYior{>#_s;5ZOf9pSIOjpcY`mq7V|M?w?U=* zr=SjpbDe4(J0yQMsJR+dG|@Ht8e_-COPRu)88^5ba$NdyAXQL|F&)%bjb(gSEz3Wv zH6vWF@^G9aG-91cl-TJ*H)rGH(;Q1{d3%4aOH?;<$@Q?LF-|fz95Zeay@{SE*C?$3 zQlww{VLwLe?O67}%pUHkR8?mkZ)MH=w7QpWIAS)<(=a+yo5X|W%!COnN1Dy#=~Iv- zTI(z2Jn6Qz9bG*|u6moWtxYt0WIItkG*@1!9(BIH;;D{vvV4+OhL|m4db>=a8oOv3 zx72LgHgvna*|@MsWXjK}uWc$IBK2CsgNWFbW2S7;gF;E@a(XneaX%0@m^eMa7+9J9 zPQ9*#`KcjIbG#+4aaqG8~Wx^uh?inbJ_1$Q|;c-r(s?CcG$!KjCm1F%HdT9yK zT51WYm8E>3u=~DQ2K8a`!~O)Q@6>^?xNjwC$3ILaLSv2AC14wc5+0;|T6Xzr*(?F(p#KAMiX&OBM; z8s8PDg%kyU=Evu@Qkk zdI1^^Lm@aan*Lf6HKZ*^ynW(xIhvUUAdn+gFspJS9iXZw#m)Rd`mE^~A+|uu9~w=R z#TXP;cQMb@C4Nl{!4ztTu#?N>@0M2U7O>V0ys=lBadPEs1xv46C6CyqaF`s2DX3p) z-83~;cPG&)8OH(oG|<+nSGK$Z1FL6d%l1ka5-%Gx*QF~#b9;NP3l??qk#+EGvzcqn zJ`+MfOY4EL)uXapB4EGXLe0x>OUp(eQQ$!hIQNKoqdp0*^m=79n2EW93tvlM_0%bm zi$-awV64og9VEj0vq33wBPYlcT7+Vfesbi%T12NLcorz`0J%TaD4BEf$>}4gM+=r? zWBKc&Rmq6XqNm3Io64hiY-%t`Z4`KAe)VY0>u!HO>*}zE#n4kB380RC5Tt7A*7De2Q z)j{e?(=F8-tR{kOqn1`$e2&9efr;f6)r>NCPG;)ShV7Q2J-UVE)q$Kt#bs&g>X)Y! z3X(6ejgNTwmt06=y^D0?(G$!Ivg+>~1lzyP{4juT z7&R>&;+`NOs$0jZZ^@zsx!pC*>afPI3C)UA{H6RYz~;V(+gF>yXMlXF@x?wU!w9L? 
z{v%aW-)56#5qDF6tW>q}lW6l#?GYiI1)!(;$WgH8ZFZKk!^?2G){W*mhu(75qp7#J z{2zl9JXG$?(oJRtW+_;ho79dR=2{Yp5cAqf3n05@k~WA1mZr6?BDPx&{5PkSU<21i z#n0KNxil!<#9|`@Xe?MsP!(qooD_8tDJrG9w8AqcpE?yhX^k7$T9>~W!6f{Jgspu)_U=WsSS*w-XbJ#0MSfvCA; z)sQfgn1)lXG|9~{Rhzpoa{~ey4Kb!y)D`W=*B;`pd9bE($T14cH}H*Rzi4hQRGHxD z{v0kwyrid*?la?dzNU@B4ekHv;O&!R;$c~BxW$LTP`|4}j-@q(~hMX*F0#q=Gfq(jMbh zN#hh0h?OroyBiIsb3dj|2(?idx9p6G-mYkndn#-~B_>yGBv;ZH)wmCY#53cRCes_L zAPx#2t?RYE2s~qO7}5flUbI#(X(X9s7qql#W^a^NH%gl!KEK=w*4lrZJb}8KGN`$N zrVGRq49&CjjJccEPeIl`_|E$`5_c+`d4hh0##+t8vcS%!0({0xE|_{^NVd$n|v!x)uO#*H%`!>f|yjx~V-Xvzst(%V&I78NVW$zl^$EnGgMMwD*0@ zHKH{5-cW~8dK5z-9u=J+cVF*u@*K|!3!q}L2p=0-Y!rp7ginO_x@N+l&iTnIqJrf| zm#-(?Ff`K4&|T-ocpl#GJ?Ae z(mC%p*4IdC)nweq&Q_*{Vk>`p0WNf(6c>@nqf&P!s(i2?d1F0BqZnesXRKjRz~Rxe zdh}@8da8e_+2FAMJ(pmAvG%;}LCUl_-PS>?@2L{|E6(!mSw=7KdC+F*XUdNU2OJ}t zomiH8W1FP=bKV#|Rhyy%Gu^$`+TBAENMzF&=39$A3^oz_pz^*!4i5*=q?L9?bF=;r z92R%RT&apx^YM>V<;1kX%YMl(HN%a8louFN-p~QSD9hWWY&C{6` zS)U&dPxK#o{3*gOQoSG_P-@=G1?VJH;5=ulsmU)kT>6DNs1h&XOi^k`rE4g{~=) z9ZWoOvf7jlolaw)s(Xgp3aQBNpZ{bQ+q_jAT>=UouroW*5Xc(>*QqC|%`g(F_l(e8 zIJCB8>mOUYKTfb4*!G&6y*3B(MCGut=@SsH-?(E@$@iF(eQsgT?ugV>4(d-(q?uza z{xux?>|TpTPMda+<`S$cL3XAN>f-1AW&BdDDQ-xMt=3)`U)?!(RAAS7&c#}7j`g3pG{qBQ!4f*`jo*YdpJ!ei6BCp zOT#RJPy4XQ&CRs}CgZe)d;}-sBSknZ=p|gcoTkGdxG;$e*_3{z4^#zzl6ZT;GXvx* z;ruR|V|j%oYh)QTQ$qbW@ zC=ia89yK%=CE7o1wmOt8h&BI-ZzyWBNmFwJoPsLwAv>pw06}7qhRPpoo)6Oql*BdULnhB-*D>pX5?rT6F{@jO4x#Dt$r^B;SaXF)6&lTN9Z`s0rzEl_Cq4jhw4b zK4-2BaSL93_To~j;WiogmUiH?g2X=HZtP|t>smHUN;xg@StpvIi)7AZs;@|>tJH<$ z^N(U%uMv{M>p@eyMsY!x=#tfP;u8u?G{8-hCv?ZZM2K%^&>2Wa|-;xg_gFF zlwUIbY;^9iN{n|OrfpMDw|f}2pI0+YL{lVyHJz;C^Uo=rw|MoF4)5(Ls=nknh;a$y zWv-rKxvoFnDAz6JsCGgG+77aE-dRY7Aya43TrpQ34G8+{6bqQW60A>87VY{n?K)gr zg!2O)Ow2x^w`11$%Ccv{Tv#7QTD^SyPZgVPO}&0a$*AA_Nwe3C9n z%3trWyYS`P`$res)k8l=RrE)VKLp5rw|e527Y;HTor|KtCiZ9vE_MHjc90xV09Ulx z7w_ON>#e@-c+4uS9+%oz9Vz-&>6*%EsomPby+JSgHnE5;?QwnXPCNXm0OEktGjQqIpK z}M&hiJ0>eaH8)vwz5 zFnXw}g1kHluWJECcDX@2^Em-0fR_D4$A?=k$AVhI!&hb0e@amxGllDLxTI^FZwsbF zwJ&hcqpUOIX>@q0``HY`bjTe^-b`hMaD!1~{w%-*CF0x-!s62fvJ$cI`& zqHme0F(DV@=rt8Dvb~SB%?SjgHNc$rkl0+d!D-H^YsP6h_G!I^fCh11^d(W3=YqvP z{?{}k<_oLT)WEPL9o+a6mnruJ^B8s!K#j0n4L5EZe*)2fgU%%q0eRK60(P}nVCR@Jc(7P+a_DKhL zpzLCJ*ukRUg$Xex8VO(w)-{!}ZT=AZ4pWa}C)t-;>P*ol^FBxl)R3&8{##x)dp^ z?Sdl=d~u;%zwUqL62?G-6V~KDvguox&W1TWc^QC#0Ic~h;I$MUz#U#1((f{tnWuNz z3U--D+2+PWEs3`3XJgUs?OdQZXyViGu9HuL!8r|fj;GOkSXle@%mcg!1|9(|Qvoc~ z807YU%G1BrC2v9gFpzv%x9I(hA$gi9q=cZ3lrjS;xno;j-CVp`itxd(LNUVC;Z^vf z_XJgSS|A^ykc+Pl&-7*>`?L|j(%s(M5fjpb6qrVKi;9EHA(@1j;gMPY2E*4D`XGapH7dBL*Rm0Z6wQpy~}6K^lCI- z&islk?mPeitpR|#n(r;L$ssopY1q;e8^p4;-$8X}G7p1Dx- zdcnF!K>4EId+Dg#VaKGhuS;jMesvHU5_8PwW%{2`Bh(#D>+Ac9QD8ox{Fan4Xl=>N z(Br)23~oS_w;7?4Q>tgnTd~nD266FxV>0Nc`kwy9uLor~H!AO3V&dYU$l502QXLPQ z;1Zu$IO!QUGL4$SJIPYpLPuf|+yn-<#!yg_$VBhn4>Yz6W6vf7l$v~eUo#tFckr3SO9u+>qWez%L zmMiTD`a*=l;ghJVYqQ867yM*avm#wn#p)t7UvwLZc<@L~xp79gV&3)Y1Xq#U`09w> zv-(4h&uobhG!vSwWQMe7$vlt>0Cy8hV;6{i0+ZoTEn3atBlCO2S>*v&G4F>$)^&M@ zivzq!EY1A(&(>q@t{f%c)62>-ddcbSguz~GWk6Z5nB^sEw^&KZy{>NUu&K1MQCEXy z{H!m6P1}gYJIj`g`yN%g#L6XGMLTm5TB#GBIpL&G28y$_$l&f{kHz23F&ZSJEzYxbEP zb%);Dc=H*BK|fa{)5J7d)WSwWa@cUv&-ydd0%+dp;n#%#v_lKx$f`QUtkPO|t3&s< zEUgS79IyWBrcK|$14hKEc)7n229WqZc?Q&L@i%|kbbtJZPye`K^dE5Pj)SemNT+u{ z=KO70QO$J6J&A5=YGR;vAobuOi-E$3t)4Bzev=eyb5m!D2q-&JcnitfRa_ zarsQ(=dL^wX2qZ4z|qq%PE@w2;}#09ogr>>oULQoNxNp&TXc2Pw}zIx)dI&MDIce_ z`PBn6`sNE&$NU!EKPmj}{SlA2$KI4HA=opm=3Gw}uKHcg{{7fPO862ai=>6QA<%pPH&4<#-!~Q0Pw}NaH9|O#|ogAKw@iqY;0yD8T98C z>@*yu=*mq#5i~27`WEl$L%%)NS}JQ9165eS0BXsIGGkg|fAZ&v9WGioXa?fU)^hU; 
zk&Nv(b6Ntp=kFff+47Y4;j1c`oQS<)8YNa(D%NglSKx3Sr|JpE$xtfCA6Tq5#f&E~ zNH%|c^Q$eh9~XW$0(-4Nn)0EO2Gr5_LEXrwbxc(yS+<1`Sk{Ek(G`=~0hFuSdiwmy z!oS@(1y4RUPNu2gaS$0-82xJEc$=J{q{UHOWa|&Dv>#wqX`1)7m^Kr2q03_5d5RvRI9%!S$n@6L3BIqNwCK@bg*zP@C@Nh)3Ab|iCdxFt3M$B_ zA)nv4drmCDIF8rty?YsL#?{mUKjhZR%-`2>@e?c*5(%zxM6u3?%?*8kwQ4H&P=C~1jJ`JGICeH9KR8Gi*Unbn3gF?d`u zdpB+au;pN*-UXA;&ebGEkCF>{E1df{TU(N$vfLm|$!e;$Pf7OqplZWL?Gh!ZLxUzw zYgNIduG!w?scIijJgxMMTkX|Hm&&(QXT_w|%?-D#wi<_b={mb^IN?sJfncCc-IH>; z?{(SVX6b_Qr~3nF-9tg}8F6Bj!`~3k61LMcgQ84tydQcn6{ZbK`&8Gx@BVXw5Nfb- zBGJO}vRpS~i>W%()g&p`5AJGa_w5(cthdm(G#D21e?v1H|771aqRcYd+BTAb*}=W3 zoLa6 z);_ILiWPYFP(ux1g{8G~x>Z#kVMn|RE;^320SDf_>*Wwhf^QQ9+w@L~urq}|`m$M- zZRD;vq5Oh|M(MXbS{PF{(_R}E9|$$1pYra3d-E(of$-lvFoms&tLWU362*6=pfg^K zAKbj0z{IjRnz4UOoxi`#^SSu+UH(^=a~pMFx*?wCp$knePEeNfO`9--c7!gLw8Qic z4R4?YoyV{v;Q6(b{~wU-0Xv>Y-e-&BRu*4+f?2Td#}6#isKkFU-hV?-pi>?|ZPOhc z@7&`*0spSZJscR)lnz$XxDu$XraQB>K@&8 zG-!kkCwn1W+AAWRs>ux`jB()|i9`ZIwt^fcUqii}CB3&!6B#sWQX#cAcr9k+B#3(o z3!M{+L!?lFH>Z!Pi}c)Y?zuS24t5Dc(kAL+1bYoE<g$r zgbjOL@O(My=AayMIE8zLaby1-#vYd}Np%6RTBz)%Z~N;PB06eqxTde)^lChoC;Cn5 zlojXgRUBqks*Lo)QWt9U z^?v068Oe)_IhTWQ&q0|6KnK3be>}gv<^Y|dWw@>SF&Rg;#W+MVl9-NudRO9Na7gb~ zozUeBmuE^SkW@H1t~Pp)NH_BO;n#cq9xNBlm9pj!xu{->@&-x?c{b6InIsMNK7}tc zDQ0->gqSzUtqq`p3B8 zjgwFL`2f+wwmXIb3H9+$RO1;89uhzc&!rX$d6IYhZ{VnT^X94t8x~yPOGuB_O>yFy$Bw$@XUgLh$pn9{t6s6s9HrGgsQv&dRc?RY=)4eDJCrb4dG`vd&j zOcoj}r^K5v(KrGX72B9p8d&_Dzn5t04ksIE@oj?&*_xqpaux9()AORj6)F@7X8}!H zN(^8O2MI-ko2A!9-rcXHoPPEkh|vR|W1T`~yK*L*yvN#ka*yQy%qiKl&`-B|x09o+zs=Zlu{bN0E!u`k6^0FjYI*Q+R zB|@A73V&VB%-@_RIJ@F4Ln4-H?zSa$d{Zf??}L5vpq;$T_(7!ylc)sk=E5k;tqxsL zF2Dim;K_sM!N3lLV&d~TS8PuZ$L6(gQl3wl33@uM#@z6&Rf}Zmk@6cRK#XT@Q?Z!H zEDN85B=BtZ4|f2f^I8(XO84HasN5S%M6OIAm_=ucaRq1`p*D!{TXK?h z%gyS8DnAX5*5}?qDuT@9F~Z3AAIkNmoh8`Xxj}!Tey}fu<)&s{?%Z6}qisk>vD5R$ z)mYu`dw>NZ_?DA3^g;R-1YM9@@F%-G1@6}~7+)~Z(<5;9EA=?@n_k&f|&S!JQ%Qnh!jAxSrC?|EwZw*vLWE)g3^GCA}Jb@ zMvwD{Ckd?o;)$W8$HxvD?b=&@3sX7L{oP&ak0dTs2I{ib! zm(p!!om|XK??-t5tBwWm0aZ9YY-N7zn9QTX!ovLv{U|byN9~>md22qOI*|gEGF$CG zo+V{tbiQj1Gtb?#BjkUx3(T4w9UB7}m;&?yB6}7iU?`uftBZvc){O9amx*r?|$J!)+SGH3leHAIVLDHPT7(R9~mVHskyxB>%;rWjlkb{(g@MkHeeXH#vD~JY>;*ZHd35y(eGsF zbcCh-bfvxx;Ag_Q#@96ri!6Ocln5R^>37B_wen09V~*U>R^854`W4woqao6|mwT2< zqzVevq+3mYY?h=ttUAsFkasI{mNl7=NsP-;;@wF5q7W&#v)h?kt(Y61Ld_U?sS9e`uXcvB`Z)@yJ?7tM}m>wmtsR9@y~ zyrzqBy}5v?dwDPtH7{^tb>bj3!$y6xz|dO_%2z$i9ZUbvP!4 zg|ABALtO2mK{&^1*L%^Xva0EL>mTkRvdRm$RGI83G&)#>#3y25Y{L^Oqrt9IDv`E* z65hS^E0IN^*BT8iKN_=Q=jjxCeQk=gVMp9^itaCAB@H`Dxul*Of zY67|m|Iw1S9tr0*O`)t!9IW9-sd#E@o6tC;ea7bC(#WJ$8|JbmpS0B^S#7+fC$t7B zOT)f7a3H+SzIa#PUovIj#qYK&eGrF@_WdbPkd}Ld+Ig!CfaSWi1K))LNruAk>J1mG zj*)hbGyLkWCPGw`+%H)|`n0iW>)^bD2H>nUrK!i`2@z6|qwYj6!pAMl1c&VqBHSWk zCL8ycggg-$HS(G_q=)*Q|nt*!?Ek8u=>f$es+k? 
zziNuk$+!oUvQLuJ0gZOIqE1#gxzu%q&3_&mwx3syG3leP70|nCvsGq_uf)?ltjlxr z9j5WzXLq?$wQf)6vGKchS(=)gJv5au{Cc6y8jsTls)=LT>SK`nw0ny%)h7Ca?-gA-Klf$;@@fXT)C7&?U#+FMN%Fc?HW9M zQ=Iuj0GzBERZOh3f4zB)dK3svR)n7ZX!&@pxuU&71zkrC!{lmqjr z3ES5+@+o<^6sT(AX7D$_`!eUJOO-eXnv0HJ>vYMKN>7>dMR^qVOnJ-_R0Lz^&)^iO zStz#oUbf;fXYN#SnR_P%4|nhSU`)rh!VmXkQdgNeCnw&zVS^Fb7MZQR3K#dULWxHRdpGZA zt*&UQ7JI1N+Gww`)*n?br@i$JJU&S3fcZaGb!u=fXUMOuDePxXWIW^2kbmrY z+ARKr&*>+Q2fCg40y1KyBEUtzFbM(%5eNkKCo2ZGDO>+Spa63~K8{S7G_O|#rllRE z7+h>`I{7@7E)*YX%&gQPWpP>SErg2EIw3G39XmBm8lid6<*}#JqEL`DHJwX4X96p{ zI9pgWnZ=Gw0KSa6-TtTCoOp|p4Z8oFt*|Q^ewkiU-Rw*q?c?VDG5cvt>GgdLgK{^y zdJwr$Rov8Pwwt8)<&#Z@PzM5+f12jg;wYY@UQCu;2y($4qHnG}W5>)_ss=rK4g3w< z!#P!aG9M~5yGf2<_^$0aaXk)??z#2cUCa+jIo&(C{5SU0$-}M~Nu{yF(k+CIJ)-A+ zwf_FTrNZQ<$lA^0Z_O_#ps!K#%oX9Yxp|IBNd??s=k>ShV;n`X!7O0GQ-+HIkSh&9Gb%_og-j8zHD9DJmrbl=nfhd(3* zWmuWUyf&&bs9{XT<$+bAh21iC3)b-o(E9(Fid(A@YJ`~Vu}Zh44}A^EdC(UR_~QY+ z+?vY#+~U@)PfV@FKUM*QQP+cWGGBEn>!UH#@o%f|Yj9EqOXEeAmKT3^f#Zb<@&t>2$&;$~mP1B`Lh8&ZSCFr^q2^^e=Y zrnR^acyMN?k1BqSI5e`n0X2e)r^y&=!Fjc+q2}(zMdUZHWs7Zs={3eW6G9AJdi1Y1 zo1>UYT2fA4f7_syO_L?Wz0Zp@W9elizk{3FKHJE4qmjhoO5%Hhn(2N6?oRQ8x@rl+ z+BJ>WhAmt%@L!NqQ&U&+5qh+K!z1w!d;jI{tpfI5i9AN-+V$Jm7>v-#3wV!X71ZUP z?3;W%#u6(byQC@iZ?bhQ@B$|PrV<}}_c>zOAyyBPFHwxQ4Ots!I_WxRyjcbpId9o`OuHF4!)Y*2iQ6 z9(VPu7b7Juu=;uBEZ#gG737Za)vN@VH}QMC3#CZbR)_Q2g2O(ID?K%LuRJzkx9tU0~XN z5+QDQj9=+tTe|&&Pb4aBBnbVY)4Yiyvd_nlH>;Dr4D&8}J4*OlQXIfA z6d30Jj$!Xw2U*~Q24c<+K*n~*Qv>Nw0j6~1tK!>dh$jiJYJepn_q2aa)B5vJ;WU8i z-L=tS#EE<^LGH4uaz(FbbXOiK1_X0g+O6&}hr55h6&Ni82(#XpC)|+(`1LLAaKq(D zLZt^D^Ze8kA1yNKPt)N|@o)mbBjGYTI4RPiAYM@ggu{UW+IKfG%K=$|6Kuywd|@fC zF&`dMtBP;CHQi{`MZwx+)-`nUzO0YLEgA&pR;{;aPh}JOl?k}3Qhxe&E+RA7t2ENX z13$d8$K;#H3fuYz+9ge|z8UJQzEyjW|KRX#ut8=j0p+PYkqd-)bU&%~##SWCCJ~## zUtOh4+}kXhRHcpdgo{k-wF4ClDrh#p)Ei3_Kpij{|o9*)Qv=7 zfo3R_pCBb+G9mt%Uh>)a02wMOP^1I-?i#}6+`oc$1^%;@@;24vjaJiNR|e_w^#nGB z{s&mA44g_3c=}Q_tH58Ud%M|>0?R<*7pgvt^+HLt1GLn0YS{uhi@roe&5STEyBfUm z5|y5iB6D6f%@Na{?VFmo`0FfnN$0_=&=?XgQyD?k3tsWALcHD@WEs9b8*Ei|m%o zXLdc)x}ks09S^Py)MjYw3gejgGN8)!E+gMM!?(oTFm$MGxNWhI>^B^?Y}BqcN**`% zChJdp-lWF_6hPxNKis@n{;B_{z+R^FvYYr1N|x=~=rrnZ^@%=)-W zy?(V37rnu(xc$l!XcZ-{3wx$lc=9&?5o7t5++(%Yg-;r?b(4Gr<;w1rvrdKOE}%G+ z2T2^0$u$qwf|H~8>#9Msd_T9dQ4viN*)O)Lc(tK>3?3YCb#`>jT_7NXWj^7#bG4Te zIu^1CfVdZT*+IA2&~igj@Z8Pde6-d-Y5!1gyy9~Q?gQTHtbI?M#$K6Jf+_JYDN{9J z)H;-Gxo;NRyZ0B;(sWxrBs)rF#vZ0SxN_UpCAOb?xikiv8Lw1@sM_rKFVyWM7zLF> zI`PJNEj$${7Q0@t0Yleju7~#-<$}=O^kHSdyDXo8eh=n z`(357-@Mp6z)(y%m)=~6ML3e=Vb4@yNK5jcShWC=YkGaKv|H4WSG4kO*5;V4TK$s1 zfiOYXB-g0IcBP6-1KoE9=3>cp>kj*P1(K=o&($?1Ax*z(%FWf=eBPb@vyvt{tg+Er zzttlzV{fj;o|l0shEqQ+jeD!#p_ zUfo(s2>&>-!@*l@Aj&5Iyji?a!p1Y?7OWV2lhxMBvFoYga6wI%6`!Q!v1Rlh0>(V( zywSSbfN0xPX>(_j+>4IQ3nt6&#i@5Iox0fM#_aWnn-mK9Ose5{nbntJ=A@Yi!y_L4 zyFfL9`U=S79@k$&dvb*a&-UB%=8Kk`J%zOC#YJt#`tYpX_&&Ut9eaR#-FMst>-%&R z_x_s-4G&impSNWzgs76Q(l+&NjHg9L!=6s__denUei)Fav8nM12fSyPn{XjDG^!Hm zNWQm~lkVw=68Mw+#nLfB&xn%OpBa~q)Gyw2mu}Sapqpzk+J#y}O2Aur1`|C`m+~{q zX7}R+z8=dCV2Lu0rS#y=$ZB{fY*acLA(KYPLtO8}RY^BNu~E@S^EMNf@&vPc`Fr)dA{aSH61e2qj8n8|u-7%uzKK9@y=yG)T-odSGSPKv?m zQY8Btk#2uT{;hlDUAU3z0m7L)bZpE8HeLGwk2y`i`?x=ueQhwEh|6Ag(ZE-K439cp z$fID{DzjOHYf(&RPUR-HjH?iwv83Vn?7f#pyIheIS-jNH!LTI zqtwbIJmopuHKG07{)LVtFfJ>RqzINsgu`62p;laL_HG@ucwBJw7*--?t+Tw}EbC#3 zplPGIJ}!0e)$G`Nd0$G_4tsTg(b;$>oqE7>_H)BZ+d&~V?wU@e4+o6_==@LU4;qjY zg?=1K4|=DN^lt{j6FdttE>AsX-t_3B|hhrjnULQE!38H_>L$ z!)uVOXAg?B$hFu+IkW5WVsQyIwY7Z)hbVI-mDe-i%b+?HtRfqZeOf^ z`vs)^4hsXh$rlFjt{{!yuDwqn19UThGXSb0IWl9{o*e;SM2?Dp;^_{MW znKSrIU#)-OvlwdeS2Wl6tiaOFliUF^;kQXFE^)(gaqN6!OD<8yQt=fVVTnU5*r&hq 
zhn+S#W5~PW<k-XQ^$>=U!D14)c$BE3yIbl;g2eo?IRU zte6s`#lwBG>XNMqw|AynB_X|6Whqg4uo{x`+pdJIZlgQ$1jvwYw^>$GE0ANd0$mo* z-t%_9-<3c`|FHdk#{0l@>(?VP{HLyWj^lR)@huQRKgPnJ`d3%~mMg-5Y$N{7?#3PX z;`c|$w@C%ni>kXZ_w8E3ShrKF_7=w6?t02!@%`KwdlK*S) z+?|+&{Qs>}_g4fhBX|AAU^v; zn*F^Uw?a_e4kyFH5Y)Sy-rRn7pl=#rMIcN5o_}*ETah*f9!^E-({#79_?NsQKvML7 zc_38Gfxw*7?=WHk0OQM%A?HYm9Qz;R3Pq)|>gyo|a#V)_2uSArAG05kls#%?a%Fmr zjZFC}AVlQJ4-!WcfM%~0`*(Z|AOkrmTBzSG*Wjh)Fw~(v9RzYeKiqgzK}Zyz9)MSx`dYl0^{Jiy#xa#7yPo5cPyn4Fba2MY|KRAAj;{2m8A#$BblM+J*rG2F=<)jIFqmecy5k~i*Cg~|6O-- zG(<{Hai)k11z>aUt&vuZV)tc&+})QMoJ_5^kgQis1zivtb5aJ z^wSK4_IGly?@xd+ZtufE*n7*1!BM%D-&ErR_E)l@jD=y`!zTAjxe{shVmdqxN L5HAxoeEa_ZfajY( literal 0 HcmV?d00001 From b38fc1136ef4f846a54117d8d9f1deb3aebe302e Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 1 May 2022 00:32:11 +0800 Subject: [PATCH 9/9] =?UTF-8?q?1.=E4=BF=AE=E6=94=B9ProgressBart=E5=9C=A8Tr?= =?UTF-8?q?ainer=E4=B8=AD=E7=9A=84=E4=B8=80=E4=B8=AAbug;2=E4=BF=AE?= =?UTF-8?q?=E5=A4=8Dpytest=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callbacks/callback_manager.py | 54 ++++++++++++++----- fastNLP/core/callbacks/progress_callback.py | 2 - fastNLP/core/controllers/trainer.py | 16 ++---- .../test_checkpoint_callback_torch.py | 11 ++-- .../test_load_best_model_callback_torch.py | 4 +- .../callbacks/test_more_evaluate_callback.py | 3 +- tests/helpers/utils.py | 9 ++-- 7 files changed, 61 insertions(+), 38 deletions(-) diff --git a/fastNLP/core/callbacks/callback_manager.py b/fastNLP/core/callbacks/callback_manager.py index 90d2e1b1..f63c6088 100644 --- a/fastNLP/core/callbacks/callback_manager.py +++ b/fastNLP/core/callbacks/callback_manager.py @@ -9,6 +9,8 @@ __all__ = [ from .callback_events import Events from .callback import Callback from fastNLP.core.log import logger +from .progress_callback import ProgressCallback, choose_progress_callback +from fastNLP.envs import rank_zero_call def _transfer(func): @@ -26,6 +28,43 @@ def _transfer(func): return wrapper +def prepare_callbacks(callbacks, progress_bar): + """ + + :param callbacks: + :param progress_bar: + :return: + """ + _callbacks = [] + if callbacks is not None: + if isinstance(callbacks, Callback): + callbacks = [callbacks] + if not isinstance(callbacks, Sequence): + raise ValueError("Parameter `callbacks` should be type 'List' or 'Tuple'.") + callbacks = list(callbacks) + for _callback in callbacks: + if not isinstance(_callback, Callback): + raise TypeError(f"callbacks must be of Callback type, instead of `{type(_callback)}`") + _callbacks += callbacks + + has_no_progress = False + for _callback in _callbacks: + if isinstance(_callback, ProgressCallback): + has_no_progress = True + if not has_no_progress: + callback = choose_progress_callback(progress_bar) + if callback is not None: + _callbacks.append(callback) + elif progress_bar is not None and progress_bar != 'auto': + logger.warning(f"Since you have passed in ProgressBar callback, progress_bar will be ignored.") + + if has_no_progress and progress_bar is None: + rank_zero_call(logger.warning)("No progress bar is provided, there will have no information output " + "during training.") + + return _callbacks + + class CallbackManager: r""" 用来管理训练过程中的所有的 callback 实例; @@ -45,24 +84,13 @@ class CallbackManager: """ self._need_reproducible_sampler = False - _callbacks = [] - if callbacks is not None: - if isinstance(callbacks, Callback): - callbacks = [callbacks] - if not isinstance(callbacks, Sequence): - raise ValueError("Parameter 
diff --git a/fastNLP/core/callbacks/progress_callback.py b/fastNLP/core/callbacks/progress_callback.py
index bacdea48..335345e0 100644
--- a/fastNLP/core/callbacks/progress_callback.py
+++ b/fastNLP/core/callbacks/progress_callback.py
@@ -11,8 +11,6 @@ __all__ = [
 from .has_monitor_callback import HasMonitorCallback
 from fastNLP.core.utils import f_rich_progress
 from fastNLP.core.log import logger
-from fastNLP.core.utils.utils import is_notebook
-
 
 
 class ProgressCallback(HasMonitorCallback):
diff --git a/fastNLP/core/controllers/trainer.py b/fastNLP/core/controllers/trainer.py
index 307901b1..5223c9d8 100644
--- a/fastNLP/core/controllers/trainer.py
+++ b/fastNLP/core/controllers/trainer.py
@@ -19,8 +19,8 @@ from .evaluator import Evaluator
 from fastNLP.core.controllers.utils.utils import TrainerEventTrigger, _TruncatedDataLoader
 from fastNLP.core.callbacks import Callback, CallbackManager, Events, EventsList
 from fastNLP.core.callbacks.callback import _CallbackWrapper
+from fastNLP.core.callbacks.callback_manager import prepare_callbacks
 from fastNLP.core.callbacks.callback_events import _SingleEventState
-from fastNLP.core.callbacks.progress_callback import choose_progress_callback
 from fastNLP.core.drivers import Driver
 from fastNLP.core.drivers.utils import choose_driver
 from fastNLP.core.utils import get_fn_arg_names, match_and_substitute_params, nullcontext
@@ -133,7 +133,7 @@ class Trainer(TrainerEventTrigger):
         ["all", "ignore", "only_error"];当该参数的值不是以上值时,该值应当表示一个文件夹的名字,我们会将其他 rank 的输出流重定向到
         log 文件中,然后将 log 文件保存在通过该参数值设定的文件夹中;默认为 "only_error";
         progress_bar: 以哪种方式显示 progress ,目前支持[None, 'raw', 'rich', 'auto'] 或者 RichCallback, RawTextCallback对象,
-            默认为 auto , auto 表示如果检测到当前 terminal 为交互型 则使用 RichCallback,否则使用 RawTextCallback对象。如果
+            默认为 auto , auto 表示如果检测到当前 terminal 为交互型则使用 RichCallback,否则使用 RawTextCallback对象。如果
             需要定制 progress bar 的参数,例如打印频率等,可以传入 RichCallback, RawTextCallback 对象。
         train_input_mapping: 与 input_mapping 一致,但是只用于 train 中。与 input_mapping 互斥。
         train_output_mapping: 与 output_mapping 一致,但是只用于 train 中。与 output_mapping 互斥。
@@ -212,17 +212,7 @@ class Trainer(TrainerEventTrigger):
         self.driver.set_optimizers(optimizers=optimizers)
 
         # 根据 progress_bar 参数选择 ProgressBarCallback
-        progress_bar_callback = choose_progress_callback(kwargs.get('progress_bar', 'auto'))
-        if progress_bar_callback is not None:
-            if callbacks is None:
-                callbacks = []
-            elif not isinstance(callbacks, Sequence):
-                callbacks = [callbacks]
-
-            callbacks = list(callbacks) + [progress_bar_callback]
-        else:
-            rank_zero_call(logger.warning)("No progress bar is provided, there will have no information output "
-                                           "during training.")
+        callbacks = prepare_callbacks(callbacks, kwargs.get('progress_bar', 'auto'))
         # 初始化 callback manager;
         self.callback_manager = CallbackManager(callbacks)
         # 添加所有的函数式 callbacks;
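The progress_bar values documented in the Trainer docstring above map onto concrete ProgressCallback classes through choose_progress_callback. A small sketch of that mapping follows; it is illustrative only and assumes choose_progress_callback is accessible from fastNLP.core.callbacks.progress_callback, the module this patch imports it from, and that it returns None when given None.

# Illustrative only -- probes the progress_bar -> ProgressCallback mapping described above.
from fastNLP.core.callbacks.progress_callback import choose_progress_callback

for value in (None, 'raw', 'rich', 'auto'):
    cb = choose_progress_callback(value)
    print(f"{value!r:8} -> {None if cb is None else type(cb).__name__}")

# Per the docstring: 'raw' -> RawTextCallback, 'rich' -> RichCallback, and
# 'auto' -> RichCallback in an interactive terminal, otherwise RawTextCallback.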
diff --git a/tests/core/callbacks/test_checkpoint_callback_torch.py b/tests/core/callbacks/test_checkpoint_callback_torch.py
index ca2a3292..0ae9e801 100644
--- a/tests/core/callbacks/test_checkpoint_callback_torch.py
+++ b/tests/core/callbacks/test_checkpoint_callback_torch.py
@@ -73,7 +73,7 @@ def model_and_optimizers(request):
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("version", [0, 1])
 @pytest.mark.parametrize("only_state_dict", [True, False])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_model_checkpoint_callback_1(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -193,7 +193,7 @@ def test_model_checkpoint_callback_1(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
-
+            trainer.driver.barrier()
     finally:
         rank_zero_rm(path)
 
@@ -203,7 +203,7 @@ def test_model_checkpoint_callback_1(
 
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("only_state_dict", [True])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_model_checkpoint_callback_2(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -283,6 +283,7 @@ def test_model_checkpoint_callback_2(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
@@ -295,7 +296,7 @@ def test_model_checkpoint_callback_2(
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 0)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("version", [0, 1])
 @pytest.mark.parametrize("only_state_dict", [True, False])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_trainer_checkpoint_callback_1(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -413,6 +414,7 @@ def test_trainer_checkpoint_callback_1(
 
             trainer.load(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
@@ -661,6 +663,7 @@ def test_trainer_checkpoint_callback_2(
 
             trainer.load(folder, model_load_fn=model_load_fn)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
diff --git a/tests/core/callbacks/test_load_best_model_callback_torch.py b/tests/core/callbacks/test_load_best_model_callback_torch.py
index 0bc63bd5..f5b67f95 100644
--- a/tests/core/callbacks/test_load_best_model_callback_torch.py
+++ b/tests/core/callbacks/test_load_best_model_callback_torch.py
@@ -16,7 +16,6 @@ from fastNLP.core.controllers.trainer import Trainer
 from fastNLP.core.metrics.accuracy import Accuracy
 from fastNLP.core.callbacks.load_best_model_callback import LoadBestModelCallback
 from fastNLP.core import Evaluator
-from fastNLP.core.utils.utils import safe_rm
 from fastNLP.core.drivers.torch_driver import TorchSingleDriver
 from tests.helpers.models.torch_model import TorchNormalModel_Classification_1
 from tests.helpers.datasets.torch_data import TorchArgMaxDataset
@@ -112,7 +111,8 @@ def test_load_best_model_callback(
     results = evaluator.run()
     assert np.allclose(callbacks[0].monitor_value, results['acc#acc#dl1'])
     if save_folder:
-        safe_rm(save_folder)
+        import shutil
+        shutil.rmtree(save_folder, ignore_errors=True)
     if dist.is_initialized():
         dist.destroy_process_group()
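The test above now cleans up with the standard library instead of the removed safe_rm helper. The same pattern in isolation, illustrative only (the file name is a placeholder, not the callback's actual output name):

# Illustrative only -- the stdlib cleanup now used in test_load_best_model_callback.
import os
import shutil
import tempfile

save_folder = tempfile.mkdtemp()
open(os.path.join(save_folder, "dummy.bin"), "wb").close()   # placeholder file

# ignore_errors=True keeps cleanup from raising if the folder is already gone,
# e.g. when several test processes race to delete it.
shutil.rmtree(save_folder, ignore_errors=True)
assert not os.path.exists(save_folder)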
diff --git a/tests/core/callbacks/test_more_evaluate_callback.py b/tests/core/callbacks/test_more_evaluate_callback.py
index 16ee3e17..115f519a 100644
--- a/tests/core/callbacks/test_more_evaluate_callback.py
+++ b/tests/core/callbacks/test_more_evaluate_callback.py
@@ -171,7 +171,7 @@ def test_model_more_evaluate_callback_1(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
-
+            trainer.driver.barrier()
     finally:
         rank_zero_rm(path)
 
@@ -255,6 +255,7 @@ def test_trainer_checkpoint_callback_1(
 
             trainer.load(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
index c0b51a8b..7e02ca0d 100644
--- a/tests/helpers/utils.py
+++ b/tests/helpers/utils.py
@@ -33,6 +33,8 @@ def recover_logger(fn):
 def magic_argv_env_context(fn=None, timeout=600):
     """
     用来在测试时包裹每一个单独的测试函数,使得 ddp 测试正确;
+    会丢掉 pytest 中的 arg 参数。
+
     :param timeout: 表示一个测试如果经过多久还没有通过的话就主动将其 kill 掉,默认为 10 分钟,单位为秒;
     :return:
     """
@@ -46,9 +48,10 @@ def magic_argv_env_context(fn=None, timeout=600):
     env = deepcopy(os.environ.copy())
 
     used_args = []
-    for each_arg in sys.argv[1:]:
-        if "test" not in each_arg:
-            used_args.append(each_arg)
+    # for each_arg in sys.argv[1:]:
+    #     # warning,否则 可能导致 pytest -s . 中的点混入其中,导致多卡启动的 collect tests items 不为 1
+    #     if each_arg.startswith('-'):
+    #         used_args.append(each_arg)
     pytest_current_test = os.environ.get('PYTEST_CURRENT_TEST')
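A short sketch of how the updated decorator is used by the tests in this patch. It is illustrative only: the decorator supports both the bare form and the parameterized form because its first parameter is fn=None, and the snippet assumes it is run from the fastNLP repository root so that tests.helpers.utils is importable.

# Illustrative only -- mirrors the decorator usage introduced in the test changes above.
import pytest
from tests.helpers.utils import magic_argv_env_context

@pytest.mark.parametrize("driver,device", [("torch", "cpu")])
@magic_argv_env_context(timeout=100)   # kill the test if it has not finished after 100 seconds
def test_decorator_smoke(driver, device):
    assert driver == "torch" and device == "cpu"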