From df1db2535aeb654b408834f45e6e45b1223953ae Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Fri, 29 Apr 2022 16:53:02 +0000 Subject: [PATCH 1/9] =?UTF-8?q?=E8=BF=81=E7=A7=BBtransformers=20ver.4.11.3?= =?UTF-8?q?=E7=9A=84bert=20bart=20roberta=20gpt2=E5=92=8Ccpt=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/utils/dummy_class.py | 5 +- fastNLP/transformers/__init__.py | 1 + fastNLP/transformers/torch/__init__.py | 9 + fastNLP/transformers/torch/activations.py | 125 + .../transformers/torch/configuration_utils.py | 777 ++++ fastNLP/transformers/torch/deepspeed.py | 388 ++ .../torch/dependency_versions_check.py | 20 + .../torch/dependency_versions_table.py | 76 + fastNLP/transformers/torch/file_utils.py | 934 +++++ .../torch/generation_beam_search.py | 393 ++ .../torch/generation_logits_process.py | 618 +++ .../torch/generation_stopping_criteria.py | 128 + .../transformers/torch/generation_utils.py | 2579 +++++++++++++ .../transformers/torch/modeling_outputs.py | 816 ++++ fastNLP/transformers/torch/modeling_utils.py | 1888 ++++++++++ fastNLP/transformers/torch/models/__init__.py | 5 + .../torch/models/auto/configuration_auto.py | 541 +++ .../torch/models/auto/tokenization_auto.py | 199 + .../torch/models/bart/__init__.py | 20 + .../torch/models/bart/configuration_bart.py | 177 + .../torch/models/bart/modeling_bart.py | 1834 +++++++++ .../torch/models/bart/tokenization_bart.py | 65 + .../torch/models/bert/__init__.py | 27 + .../torch/models/bert/configuration_bert.py | 158 + .../torch/models/bert/modeling_bert.py | 1806 +++++++++ .../torch/models/bert/tokenization_bert.py | 558 +++ .../transformers/torch/models/cpt/__init__.py | 12 + .../torch/models/cpt/modeling_cpt.py | 1489 ++++++++ .../torch/models/gpt2/__init__.py | 19 + .../torch/models/gpt2/configuration_gpt2.py | 184 + .../torch/models/gpt2/modeling_gpt2.py | 1393 +++++++ .../torch/models/gpt2/tokenization_gpt2.py | 308 ++ .../torch/models/roberta/__init__.py | 21 + .../models/roberta/configuration_roberta.py | 65 + .../torch/models/roberta/modeling_roberta.py | 1584 ++++++++ .../models/roberta/tokenization_roberta.py | 254 ++ .../transformers/torch/tokenization_utils.py | 915 +++++ .../torch/tokenization_utils_base.py | 3351 +++++++++++++++++ fastNLP/transformers/torch/utils/__init__.py | 0 .../torch/utils/model_parallel_utils.py | 54 + fastNLP/transformers/torch/utils/versions.py | 120 + 41 files changed, 23914 insertions(+), 2 deletions(-) create mode 100644 fastNLP/transformers/__init__.py create mode 100644 fastNLP/transformers/torch/__init__.py create mode 100644 fastNLP/transformers/torch/activations.py create mode 100644 fastNLP/transformers/torch/configuration_utils.py create mode 100644 fastNLP/transformers/torch/deepspeed.py create mode 100644 fastNLP/transformers/torch/dependency_versions_check.py create mode 100644 fastNLP/transformers/torch/dependency_versions_table.py create mode 100644 fastNLP/transformers/torch/file_utils.py create mode 100644 fastNLP/transformers/torch/generation_beam_search.py create mode 100644 fastNLP/transformers/torch/generation_logits_process.py create mode 100644 fastNLP/transformers/torch/generation_stopping_criteria.py create mode 100644 fastNLP/transformers/torch/generation_utils.py create mode 100644 fastNLP/transformers/torch/modeling_outputs.py create mode 100644 fastNLP/transformers/torch/modeling_utils.py create mode 100644 fastNLP/transformers/torch/models/__init__.py 
create mode 100644 fastNLP/transformers/torch/models/auto/configuration_auto.py create mode 100644 fastNLP/transformers/torch/models/auto/tokenization_auto.py create mode 100644 fastNLP/transformers/torch/models/bart/__init__.py create mode 100644 fastNLP/transformers/torch/models/bart/configuration_bart.py create mode 100644 fastNLP/transformers/torch/models/bart/modeling_bart.py create mode 100644 fastNLP/transformers/torch/models/bart/tokenization_bart.py create mode 100644 fastNLP/transformers/torch/models/bert/__init__.py create mode 100644 fastNLP/transformers/torch/models/bert/configuration_bert.py create mode 100644 fastNLP/transformers/torch/models/bert/modeling_bert.py create mode 100644 fastNLP/transformers/torch/models/bert/tokenization_bert.py create mode 100644 fastNLP/transformers/torch/models/cpt/__init__.py create mode 100644 fastNLP/transformers/torch/models/cpt/modeling_cpt.py create mode 100644 fastNLP/transformers/torch/models/gpt2/__init__.py create mode 100644 fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py create mode 100644 fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py create mode 100644 fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py create mode 100644 fastNLP/transformers/torch/models/roberta/__init__.py create mode 100644 fastNLP/transformers/torch/models/roberta/configuration_roberta.py create mode 100644 fastNLP/transformers/torch/models/roberta/modeling_roberta.py create mode 100644 fastNLP/transformers/torch/models/roberta/tokenization_roberta.py create mode 100644 fastNLP/transformers/torch/tokenization_utils.py create mode 100644 fastNLP/transformers/torch/tokenization_utils_base.py create mode 100644 fastNLP/transformers/torch/utils/__init__.py create mode 100644 fastNLP/transformers/torch/utils/model_parallel_utils.py create mode 100644 fastNLP/transformers/torch/utils/versions.py diff --git a/fastNLP/core/utils/dummy_class.py b/fastNLP/core/utils/dummy_class.py index 2e97c3e4..2856b656 100644 --- a/fastNLP/core/utils/dummy_class.py +++ b/fastNLP/core/utils/dummy_class.py @@ -1,4 +1,5 @@ - +import functools class DummyClass: - pass \ No newline at end of file + def __call__(self, *args, **kwargs): + return diff --git a/fastNLP/transformers/__init__.py b/fastNLP/transformers/__init__.py new file mode 100644 index 00000000..6403f6b9 --- /dev/null +++ b/fastNLP/transformers/__init__.py @@ -0,0 +1 @@ +"""基于 transformers-4.11.3 版本迁移""" \ No newline at end of file diff --git a/fastNLP/transformers/torch/__init__.py b/fastNLP/transformers/torch/__init__.py new file mode 100644 index 00000000..9ce4fb10 --- /dev/null +++ b/fastNLP/transformers/torch/__init__.py @@ -0,0 +1,9 @@ +""" +为了防止因 https://github.com/huggingface/transformers 版本变化导致代码不兼容,当前 folder 以及子 folder +都复制自 https://github.com/huggingface/transformers 的4.11.3版本。 +In order to avoid the code change of https://github.com/huggingface/transformers to cause version +mismatch, we copy code from https://github.com/huggingface/transformers(version:4.11.3) in this +folder and its subfolder. +""" +__version__ = "4.11.3" +from .models import * \ No newline at end of file diff --git a/fastNLP/transformers/torch/activations.py b/fastNLP/transformers/torch/activations.py new file mode 100644 index 00000000..cf01f2bf --- /dev/null +++ b/fastNLP/transformers/torch/activations.py @@ -0,0 +1,125 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from packaging import version + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + from torch import nn, tanh, sigmoid + from torch.nn.functional import relu +else: + from fastNLP.core.utils.dummy_class import ( + DummyClass as relu, + DummyClass as tanh, + DummyClass as sigmoid, +) + + +def _gelu_python(x): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def gelu_new(x): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.4"): + gelu = _gelu_python + else: + gelu = nn.functional.gelu +else: + from fastNLP.core.utils.dummy_class import DummyClass as gelu + +def gelu_fast(x): + return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) + + +def quick_gelu(x): + return x * torch.sigmoid(1.702 * x) + + +def _silu_python(x): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + return x * torch.sigmoid(x) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.7"): + silu = _silu_python + else: + silu = nn.functional.silu +else: + from fastNLP.core.utils.dummy_class import DummyClass as silu + + +def _mish_python(x): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). 
Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + return x * torch.tanh(nn.functional.softplus(x)) + +if _NEED_IMPORT_TORCH: + if version.parse(torch.__version__) < version.parse("1.9"): + mish = _mish_python + else: + mish = nn.functional.mish +else: + from fastNLP.core.utils.dummy_class import DummyClass as mish + + +def linear_act(x): + return x + + +ACT2FN = { + "relu": relu, + "silu": silu, + "swish": silu, + "gelu": gelu, + "tanh": tanh, + "gelu_new": gelu_new, + "gelu_fast": gelu_fast, + "quick_gelu": quick_gelu, + "mish": mish, + "linear": linear_act, + "sigmoid": sigmoid, +} + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") diff --git a/fastNLP/transformers/torch/configuration_utils.py b/fastNLP/transformers/torch/configuration_utils.py new file mode 100644 index 00000000..9c17f336 --- /dev/null +++ b/fastNLP/transformers/torch/configuration_utils.py @@ -0,0 +1,777 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Configuration base class and utilities.""" + + +import copy +import json +import os +from typing import Any, Dict, Tuple, Union + +from . import __version__ +from .file_utils import ( + CONFIG_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, + is_remote_url, +) +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + + +class PretrainedConfig: + r""" + Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as + methods for loading/downloading/saving configurations. + + Note: + A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to + initialize a model does **not** load the model weights. It only affects the model's configuration. + + Class attributes (overridden by derived classes) + + - **model_type** (:obj:`str`) -- An identifier for the model type, serialized into the JSON file, and used to + recreate the correct object in :class:`~transformers.AutoConfig`. + - **is_composition** (:obj:`bool`) -- Whether the config class is composed of multiple sub-configs. In this + case the config has to be initialized from two or more configs of type + :class:`~transformers.PretrainedConfig` like: :class:`~transformers.EncoderDecoderConfig` or + :class:`~RagConfig`. + - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at + dictionary outputs of the model during inference. + - **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model specific attribute names to the + standardized naming of attributes. 
+ + Common attributes (present in all subclasses) + + - **vocab_size** (:obj:`int`) -- The number of tokens in the vocabulary, which is also the first dimension of + the embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT). + - **hidden_size** (:obj:`int`) -- The hidden size of the model. + - **num_attention_heads** (:obj:`int`) -- The number of attention heads used in the multi-head attention layers + of the model. + - **num_hidden_layers** (:obj:`int`) -- The number of blocks in the model. + + Args: + name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`): + Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or + :func:`~transformers.TFPreTrainedModel.from_pretrained` as ``pretrained_model_name_or_path`` if the + configuration was created with such a method. + output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the model should return all hidden-states. + output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the model should returns all attentions. + return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain + tuple. + is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model is used as an encoder/decoder or not. + is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the model is used as decoder or not (in which case it's used as an encoder). + add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether cross-attention layers should be added to the model. Note, this option is only relevant for models + that can be used as decoder models within the `:class:~transformers.EncoderDecoderModel` class, which + consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``. + tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder + and decoder model to have the exact same parameter names. + prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`): + Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of + heads to prune in said layer. + + For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`): + The chunk size of all feed forward layers in the residual attention blocks. A chunk size of :obj:`0` means + that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes + :obj:`n` < sequence_length embeddings at a time. For more information on feed forward chunking, see `How + does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . + + Parameters for sequence generation + + - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by default in the + :obj:`generate` method of the model. + - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by default in the + :obj:`generate` method of the model. + - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in the + :obj:`generate` method of the model. 
Whether or not to use sampling ; use greedy decoding otherwise. + - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default + in the :obj:`generate` method of the model. Whether to stop the beam search when at least ``num_beams`` + sentences are finished per batch or not. + - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be used by + default in the :obj:`generate` method of the model. 1 means no beam search. + - **num_beam_groups** (:obj:`int`, `optional`, defaults to 1) -- Number of groups to divide :obj:`num_beams` + into in order to ensure diversity among different groups of beams that will be used by default in the + :obj:`generate` method of the model. 1 means no group beam search. + - **diversity_penalty** (:obj:`float`, `optional`, defaults to 0.0) -- Value to control diversity for group + beam search. that will be used by default in the :obj:`generate` method of the model. 0 means no diversity + penalty. The higher the penalty, the more diverse are the outputs. + - **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to module the next token + probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly + positive. + - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to keep + for top-k-filtering that will be used by default in the :obj:`generate` method of the model. + - **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the + :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens with + probabilities that add up to ``top_p`` or higher are kept for generation. + - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty that + will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty. + - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that will + be used by default in the :obj:`generate` method of the model. + - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default in the + :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of that size + can only occur once. + - **encoder_no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by + default in the :obj:`generate` method of the model for ``encoder_no_repeat_ngram_size``. If set to int > 0, + all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the ``decoder_input_ids``. + - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be generated + that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the + words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, + add_prefix_space=True)`. + - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed returned + sequences for each element in the batch that will be used by default in the :obj:`generate` method of the + model. 
+ - **output_scores** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should return the + logits when used for generation + - **return_dict_in_generate** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should + return a :class:`~transformers.file_utils.ModelOutput` instead of a :obj:`torch.LongTensor` + - **forced_bos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the first generated token + after the :obj:`decoder_start_token_id`. Useful for multilingual models like :doc:`mBART + <../model_doc/mbart>` where the first generated token needs to be the target language token. + - **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token + when :obj:`max_length` is reached. + - **remove_invalid_values** (:obj:`bool`, `optional`) -- Whether to remove possible `nan` and `inf` outputs of + the model to prevent the generation method to crash. Note that using ``remove_invalid_values`` can slow down + generation. + + + Parameters for fine-tuning tasks + + - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the model + pretrained weights. + - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be + used when converting from an original (TensorFlow or PyTorch) checkpoint. + - **id2label** (:obj:`Dict[int, str]`, `optional`) -- A map from index (for instance prediction index, or + target index) to label. + - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model. + - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model, + typically for a classification task. + - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for the + current task. + - **problem_type** (:obj:`str`, `optional`) -- Problem type for :obj:`XxxForSequenceClassification` models. Can + be one of (:obj:`"regression"`, :obj:`"single_label_classification"`, :obj:`"multi_label_classification"`). + Please note that this parameter is only available in the following models: `AlbertForSequenceClassification`, + `BertForSequenceClassification`, `BigBirdForSequenceClassification`, `ConvBertForSequenceClassification`, + `DistilBertForSequenceClassification`, `ElectraForSequenceClassification`, `FunnelForSequenceClassification`, + `LongformerForSequenceClassification`, `MobileBertForSequenceClassification`, + `ReformerForSequenceClassification`, `RobertaForSequenceClassification`, + `SqueezeBertForSequenceClassification`, `XLMForSequenceClassification` and `XLNetForSequenceClassification`. + + Parameters linked to the tokenizer + + - **tokenizer_class** (:obj:`str`, `optional`) -- The name of the associated tokenizer class to use (if none is + set, will use the tokenizer associated to the model by default). + - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each text + before calling the model. + - **bos_token_id** (:obj:`int`, `optional`)) -- The id of the `beginning-of-stream` token. + - **pad_token_id** (:obj:`int`, `optional`)) -- The id of the `padding` token. + - **eos_token_id** (:obj:`int`, `optional`)) -- The id of the `end-of-stream` token. + - **decoder_start_token_id** (:obj:`int`, `optional`)) -- If an encoder-decoder model starts decoding with a + different token than `bos`, the id of that token. 
+ - **sep_token_id** (:obj:`int`, `optional`)) -- The id of the `separation` token. + + PyTorch specific parameters + + - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be + used with Torchscript. + - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and + output word embeddings should be tied. Note that this is only relevant if the model has a output word + embedding layer. + - **torch_dtype** (:obj:`str`, `optional`) -- The :obj:`dtype` of the weights. This attribute can be used to + initialize the model to a non-default ``dtype`` (which is normally ``float32``) and thus allow for optimal + storage allocation. For example, if the saved model is ``float16``, ideally we want to load it back using the + minimal amount of memory needed to load ``float16`` weights. Since the config object is stored in plain text, + this attribute contains just the floating type string without the ``torch.`` prefix. For example, for + ``torch.float16`` ``torch_dtype`` is the ``"float16"`` string. + + This attribute is currently not being used during model loading time, but this may change in the future + versions. But we can already start preparing for the future by saving the dtype with save_pretrained. + + TensorFlow specific parameters + + - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use + BFloat16 scalars (only used by some TensorFlow models). + """ + model_type: str = "" + is_composition: bool = False + attribute_map: Dict[str, str] = {} + + def __setattr__(self, key, value): + if key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + super().__setattr__(key, value) + + def __getattribute__(self, key): + if key != "attribute_map" and key in super().__getattribute__("attribute_map"): + key = super().__getattribute__("attribute_map")[key] + return super().__getattribute__(key) + + def __init__(self, **kwargs): + # Attributes with defaults + self.return_dict = kwargs.pop("return_dict", True) + self.output_hidden_states = kwargs.pop("output_hidden_states", False) + self.output_attentions = kwargs.pop("output_attentions", False) + self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models + self.torch_dtype = kwargs.pop("torch_dtype", None) # Only used by PyTorch models + self.use_bfloat16 = kwargs.pop("use_bfloat16", False) + self.pruned_heads = kwargs.pop("pruned_heads", {}) + self.tie_word_embeddings = kwargs.pop( + "tie_word_embeddings", True + ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models. 
+ + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder + self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) + self.is_decoder = kwargs.pop("is_decoder", False) + self.add_cross_attention = kwargs.pop("add_cross_attention", False) + self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False) + + # Parameters for sequence generation + self.max_length = kwargs.pop("max_length", 20) + self.min_length = kwargs.pop("min_length", 0) + self.do_sample = kwargs.pop("do_sample", False) + self.early_stopping = kwargs.pop("early_stopping", False) + self.num_beams = kwargs.pop("num_beams", 1) + self.num_beam_groups = kwargs.pop("num_beam_groups", 1) + self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) + self.temperature = kwargs.pop("temperature", 1.0) + self.top_k = kwargs.pop("top_k", 50) + self.top_p = kwargs.pop("top_p", 1.0) + self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) + self.length_penalty = kwargs.pop("length_penalty", 1.0) + self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) + self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0) + self.bad_words_ids = kwargs.pop("bad_words_ids", None) + self.num_return_sequences = kwargs.pop("num_return_sequences", 1) + self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0) + self.output_scores = kwargs.pop("output_scores", False) + self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False) + self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None) + self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None) + self.remove_invalid_values = kwargs.pop("remove_invalid_values", False) + + # Fine-tuning task arguments + self.architectures = kwargs.pop("architectures", None) + self.finetuning_task = kwargs.pop("finetuning_task", None) + self.id2label = kwargs.pop("id2label", None) + self.label2id = kwargs.pop("label2id", None) + if self.id2label is not None: + kwargs.pop("num_labels", None) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + # Keys are always strings in JSON so convert ids to int here. 
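            # For illustration (hypothetical labels): a config.json carrying `id2label={"0": "negative", "1": "positive"}`
            # arrives with string keys and is normalized to `{0: "negative", 1: "positive"}` here; `num_labels` is then
            # derived from `len(id2label)` by the `num_labels` property below, and any explicit `num_labels` kwarg is
            # dropped in this branch so the two cannot disagree.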
+ else: + self.num_labels = kwargs.pop("num_labels", 2) + + if self.torch_dtype is not None and isinstance(self.torch_dtype, str): + # we will start using self.torch_dtype in v5, but to be consistent with + # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object + if _NEED_IMPORT_TORCH: + import torch + + self.torch_dtype = getattr(torch, self.torch_dtype) + + # Tokenizer arguments TODO: eventually tokenizer and models should share the same config + self.tokenizer_class = kwargs.pop("tokenizer_class", None) + self.prefix = kwargs.pop("prefix", None) + self.bos_token_id = kwargs.pop("bos_token_id", None) + self.pad_token_id = kwargs.pop("pad_token_id", None) + self.eos_token_id = kwargs.pop("eos_token_id", None) + self.sep_token_id = kwargs.pop("sep_token_id", None) + + self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) + + # task specific arguments + self.task_specific_params = kwargs.pop("task_specific_params", None) + + # regression / multi-label classification + self.problem_type = kwargs.pop("problem_type", None) + allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification") + if self.problem_type is not None and self.problem_type not in allowed_problem_types: + raise ValueError( + f"The config parameter `problem_type` was not understood: received {self.problem_type}" + "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." + ) + + # TPU arguments + if kwargs.pop("xla_device", None) is not None: + logger.warning( + "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can " + "safely remove it from your `config.json` file." + ) + + # Name or path to the pretrained checkpoint + self._name_or_path = str(kwargs.pop("name_or_path", "")) + + # Drop the transformers version info + self.transformers_version = kwargs.pop("transformers_version", None) + + # Deal with gradient checkpointing + if kwargs.get("gradient_checkpointing", False): + logger.warn( + "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 " + "Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the " + "`Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`." + ) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error(f"Can't set {key} with value {value} for {self}") + raise err + + @property + def name_or_path(self) -> str: + return self._name_or_path + + @name_or_path.setter + def name_or_path(self, value): + self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) + + @property + def use_return_dict(self) -> bool: + """ + :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples. + """ + # If torchscript is set, force `return_dict=False` to avoid jit errors + return self.return_dict and not self.torchscript + + @property + def num_labels(self) -> int: + """ + :obj:`int`: The number of labels for classification models. 
+ """ + return len(self.id2label) + + @num_labels.setter + def num_labels(self, num_labels: int): + if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels: + self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} + self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): + """ + Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the + :func:`~transformers.PretrainedConfig.from_pretrained` class method. + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. + + kwargs: + Additional key word arguments passed along to the + :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method. + """ + if os.path.isfile(save_directory): + raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") + + os.makedirs(save_directory, exist_ok=True) + # If we save using the predefined names, we can load using `from_pretrained` + output_config_file = os.path.join(save_directory, CONFIG_NAME) + + self.to_json_file(output_config_file, use_diff=True) + logger.info(f"Configuration saved in {output_config_file}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + r""" + Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model + configuration. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + This can be either: + + - a string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a configuration file saved using the + :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``. + - a path or url to a saved configuration JSON `file`, e.g., + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if + they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received file. Attempts to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. 
+ use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final configuration object. + + If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` + is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., + the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. + kwargs (:obj:`Dict[str, Any]`, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the ``return_unused_kwargs`` keyword parameter. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model. + + Examples:: + + # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a + # derived class: BertConfig + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. + config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` + config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') + config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) + assert config.output_attentions == True + config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, + foo=False, return_unused_kwargs=True) + assert config.output_attentions == True + assert unused_kwargs == {'foo': False} + + """ + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warn( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + @classmethod + def get_config_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> Tuple[Dict[str, Any], Dict[str, Any]]: + """ + From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + :class:`~transformers.PretrainedConfig` using ``from_dict``. + + + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + + Returns: + :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
+ + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + else: + config_file = hf_bucket_url( + pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None + ) + + try: + # Load from URL or cache if already cached + resolved_config_file = cached_path( + config_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + # Load config dict + config_dict = cls._dict_from_json_file(resolved_config_file) + + except EnvironmentError as err: + logger.error(err) + msg = ( + f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + except (json.JSONDecodeError, UnicodeDecodeError): + msg = ( + f"Couldn't reach server at '{config_file}' to download configuration file or " + "configuration file is not a valid JSON file. " + f"Please check network or file content here: {resolved_config_file}." + ) + raise EnvironmentError(msg) + + if resolved_config_file == config_file: + logger.info(f"loading configuration file {config_file}") + else: + logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + + return config_dict, kwargs + + @classmethod + def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": + """ + Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters. + + Args: + config_dict (:obj:`Dict[str, Any]`): + Dictionary that will be used to instantiate the configuration object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + :func:`~transformers.PretrainedConfig.get_config_dict` method. + kwargs (:obj:`Dict[str, Any]`): + Additional parameters from which to initialize the configuration object. + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from those parameters. 
+ """ + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + config = cls(**config_dict) + + if hasattr(config, "pruned_heads"): + config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) + + # Update config with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + if key != "torch_dtype": + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info(f"Model config {config}") + if return_unused_kwargs: + return config, kwargs + else: + return config + + @classmethod + def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig": + """ + Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters. + + Args: + json_file (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file containing the parameters. + + Returns: + :class:`PretrainedConfig`: The configuration object instantiated from that JSON file. + + """ + config_dict = cls._dict_from_json_file(json_file) + return cls(**config_dict) + + @classmethod + def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_diff_dict(self) -> Dict[str, Any]: + """ + Removes all attributes from config which correspond to the default config attributes for better readability and + serializes to a Python dictionary. + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance, + """ + config_dict = self.to_dict() + + # get the default config dict + default_config_dict = PretrainedConfig().to_dict() + + # get class specific config dict + class_config_dict = self.__class__().to_dict() if not self.is_composition else {} + + serializable_config_dict = {} + + # only serialize values that differ from the default config + for key, value in config_dict.items(): + if ( + key not in default_config_dict + or key == "transformers_version" + or value != default_config_dict[key] + or (key in class_config_dict and value != class_config_dict[key]) + ): + serializable_config_dict[key] = value + + self.dict_torch_dtype_to_str(serializable_config_dict) + + return serializable_config_dict + + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + output = copy.deepcopy(self.__dict__) + if hasattr(self.__class__, "model_type"): + output["model_type"] = self.__class__.model_type + + # Transformers version when serializing the model + output["transformers_version"] = __version__ + + self.dict_torch_dtype_to_str(output) + + return output + + def to_json_string(self, use_diff: bool = True) -> str: + """ + Serializes this instance to a JSON string. + + Args: + use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, only the difference between the config instance and the default + ``PretrainedConfig()`` is serialized to JSON string. + + Returns: + :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format. 
+ """ + if use_diff is True: + config_dict = self.to_diff_dict() + else: + config_dict = self.to_dict() + return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): + """ + Save this instance to a JSON file. + + Args: + json_file_path (:obj:`str` or :obj:`os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): + If set to ``True``, only the difference between the config instance and the default + ``PretrainedConfig()`` is serialized to JSON file. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string(use_diff=use_diff)) + + def update(self, config_dict: Dict[str, Any]): + """ + Updates attributes of this class with attributes from ``config_dict``. + + Args: + config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. + """ + for key, value in config_dict.items(): + setattr(self, key, value) + + def update_from_string(self, update_str: str): + """ + Updates attributes of this class with attributes from ``update_str``. + + The expected format is ints, floats and strings as is, and for booleans use ``true`` or ``false``. For example: + "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" + + The keys to change have to already exist in the config object. + + Args: + update_str (:obj:`str`): String with attributes that should be updated for this class. + + """ + + d = dict(x.split("=") for x in update_str.split(",")) + for k, v in d.items(): + if not hasattr(self, k): + raise ValueError(f"key {k} isn't in the original config dict") + + old_v = getattr(self, k) + if isinstance(old_v, bool): + if v.lower() in ["true", "1", "y", "yes"]: + v = True + elif v.lower() in ["false", "0", "n", "no"]: + v = False + else: + raise ValueError(f"can't derive true or false from {v} (key {k})") + elif isinstance(old_v, int): + v = int(v) + elif isinstance(old_v, float): + v = float(v) + elif not isinstance(old_v, str): + raise ValueError( + f"You can only update int, float, bool or string values in the config, got {v} for key {k}" + ) + + setattr(self, k, v) + + def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: + """ + Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a + string of just the type. For example, :obj:`torch.float32` get converted into `"float32"` string, which can + then be stored in the json format. + """ + if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): + d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] \ No newline at end of file diff --git a/fastNLP/transformers/torch/deepspeed.py b/fastNLP/transformers/torch/deepspeed.py new file mode 100644 index 00000000..fc3fcc7c --- /dev/null +++ b/fastNLP/transformers/torch/deepspeed.py @@ -0,0 +1,388 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +""" +Integration with Deepspeed +""" + +import importlib.util +import io +import json +import weakref +from copy import deepcopy +from functools import partialmethod + +from .dependency_versions_check import dep_version_check +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + + +def is_deepspeed_available(): + return importlib.util.find_spec("deepspeed") is not None + + +class HfDeepSpeedConfig: + """ + This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. + + A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where + things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``). + Therefore it's important that this object remains alive while the program is still running. + + :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to + sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder + values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way. + + Args: + config_file_or_dict (:obj:`Union[str, Dict]`): path to DeepSpeed config file or dict. + + """ + + def __init__(self, config_file_or_dict): + # set global weakref object + set_hf_deepspeed_config(self) + + dep_version_check("deepspeed") + + if isinstance(config_file_or_dict, dict): + # Don't modify user's data should they want to reuse it (e.g. in tests), because once we + # modified it, it will not be accepted here again, since `auto` values would have been overridden + config = deepcopy(config_file_or_dict) + elif isinstance(config_file_or_dict, str): + with io.open(config_file_or_dict, "r", encoding="utf-8") as f: + config = json.load(f) + else: + raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") + self.config = config + + # zero stage - this is done as early as possible, before model is created, to allow + # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object + # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc. + self._stage = self.get_value("zero_optimization.stage", -1) + + # offload + self._offload = False + if self.is_zero2() or self.is_zero3(): + offload_devices_valid = set(["cpu", "nvme"]) + offload_devices = set( + [ + self.get_value("zero_optimization.offload_optimizer.device"), + self.get_value("zero_optimization.offload_param.device"), + ] + ) + if len(offload_devices & offload_devices_valid) > 0: + self._offload = True + + def find_config_node(self, ds_key_long): + config = self.config + + # find the config node of interest if it exists + nodes = ds_key_long.split(".") + ds_key = nodes.pop() + for node in nodes: + config = config.get(node) + if config is None: + return None, ds_key + + return config, ds_key + + def get_value(self, ds_key_long, default=None): + """ + Returns the set value or ``default`` if no value is set + """ + config, ds_key = self.find_config_node(ds_key_long) + if config is None: + return default + return config.get(ds_key, default) + + def is_true(self, ds_key_long): + """ + Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. 
So use this method to + ask the very specific question of whether the value is set to :obj:`True` (and it's not set to :obj:`False` or + isn't set). + + """ + value = self.get_value(ds_key_long) + return False if value is None else bool(value) + + def is_false(self, ds_key_long): + """ + Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to + ask the very specific question of whether the value is set to :obj:`False` (and it's not set to :obj:`True` or + isn't set). + """ + value = self.get_value(ds_key_long) + return False if value is None else not bool(value) + + def is_zero2(self): + return self._stage == 2 + + def is_zero3(self): + return self._stage == 3 + + def is_offload(self): + return self._offload + + +class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): + """ + The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has + the same lifespan as the latter. + """ + + def __init__(self, config_file_or_dict): + super().__init__(config_file_or_dict) + self._dtype = torch.float16 + self.mismatches = [] + + def dtype(self): + return self._dtype + + def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): + """ + A utility method that massages the config file and can optionally verify that the values match. + + 1. Replace "auto" values with ``TrainingArguments`` value. + + 2. If it wasn't "auto" and ``must_match`` is true, then check that DS config matches Trainer + config values and if mismatched add the entry to ``self.mismatched`` - will assert during + ``trainer_config_finalize`` for one or more mismatches. + + """ + config, ds_key = self.find_config_node(ds_key_long) + if config is None: + return + + if config.get(ds_key) == "auto": + config[ds_key] = hf_val + return + + if not must_match: + return + + ds_val = config.get(ds_key) + if ds_val is not None and ds_val != hf_val: + self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}") + + fill_only = partialmethod(fill_match, must_match=False) + + def trainer_config_process(self, args): + """ + Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object + creation. 
+ """ + # DeepSpeed does: + # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps + train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps + self.fill_match( + "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size" + ) + self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps") + self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)") + self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") + + self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") + self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2") + self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") + self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") + + self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg + self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate") + # total_num_steps - will get set in trainer_config_finalize + + # fp16 + if args.fp16: + fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" + else: + fp16_backend = None + + # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set + # any here unless the user did the work + self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16+fp16_backend(amp)") + + # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any + # ZeRO features + self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") + self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") + + # only if we have an explicit fp16.enabled = False then it's fp32, if it's True or this + # whole config section is missing then the fallback is fp16 + if self.is_false("fp16.enabled"): + self._dtype = torch.float32 + # later there will be other dtypes besides just fp16 and fp32 + # also not quite sure what dtype should be under apex, defaulting to fp16 for now + + def trainer_config_finalize(self, args, model, num_training_steps): + """ + This stage is run after we have the model and know num_training_steps. + + Now we we can complete the configuration process. + """ + # zero + if self.is_zero3(): + # automatically assign the optimal config values based on model config + hidden_size = model.config.hidden_size + self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size) + self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) + self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size) + + # scheduler + self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)") + self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps") + + if len(self.mismatches) > 0: + mismatches = "\n".join(self.mismatches) + raise ValueError( + f"Please correct the following DeepSpeed config values that mismatch TrainingArguments values:\n{mismatches}\n" + "The easiest method is to set these DeepSpeed config values to 'auto'." 
+ ) + + +# keep the config object global to be able to access it anywhere during TrainingArguments life-cycle +_hf_deepspeed_config_weak_ref = None + + +def set_hf_deepspeed_config(hf_deepspeed_config_obj): + # this is a special weakref global object to allow us to get to Deepspeed config from APIs + # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. + global _hf_deepspeed_config_weak_ref + # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) + _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) + + +def is_deepspeed_zero3_enabled(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().is_zero3() + else: + return False + + +def deepspeed_config(): + if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: + return _hf_deepspeed_config_weak_ref().config + else: + return None + + +def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): + """ + Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. + + If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. + + Args: + trainer: Trainer object + num_training_steps: per single gpu + resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load + + Returns: model, optimizer, lr_scheduler + + """ + import deepspeed + from deepspeed.utils import logger as ds_logger + + model = trainer.model + args = trainer.args + + hf_deepspeed_config = args.hf_deepspeed_config + hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) + + # resume config update - some bits like `model` and `num_training_steps` only become available during train + config = hf_deepspeed_config.config + + # Optimizer + Scheduler + # Currently supported combos: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: Yes + # 3. DS scheduler + HF optimizer: Yes + # 4. HF scheduler + DS optimizer: Yes + # + # Unless Offload is enabled in which case it's: + # 1. DS scheduler + DS optimizer: Yes + # 2. HF scheduler + HF optimizer: Mostly* + # 3. DS scheduler + HF optimizer: Mostly* + # 4. HF scheduler + DS optimizer: Yes + # + # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) + + optimizer = None + if "optimizer" in config: + if args.adafactor: + raise ValueError( + "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " + "Only one optimizer can be configured." + ) + else: + if hf_deepspeed_config.is_offload(): + logger.info( + "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)" + ) + + # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. + # But trainer uses AdamW by default. 
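# Illustrative only, not part of the patch: the kind of DeepSpeed config dict that
# deepspeed_init() consumes. The key names follow DeepSpeed's documented schema, but the
# concrete values are hypothetical; every "auto" entry is one that
# trainer_config_process()/trainer_config_finalize() above fill in from TrainingArguments.
ds_config_example = {
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "fp16": {"enabled": "auto"},
    "optimizer": {"type": "AdamW",
                  "params": {"lr": "auto", "betas": "auto", "eps": "auto", "weight_decay": "auto"}},
    "scheduler": {"type": "WarmupLR",
                  "params": {"warmup_min_lr": "auto", "warmup_max_lr": "auto", "warmup_num_steps": "auto"}},
    "zero_optimization": {"stage": 2, "offload_optimizer": {"device": "cpu"}},
}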
+ optimizer = trainer.create_optimizer() + # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` + config["zero_allow_untested_optimizer"] = True + + def _lr_scheduler_callable(optimizer): + return trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + lr_scheduler = None + if "scheduler" not in config: + if optimizer is None: + # Optimizer is not available, so use callable to defer lr_scheduler creation to DS init + lr_scheduler = _lr_scheduler_callable + else: + lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + # keep for quick debug: + # from pprint import pprint; pprint(config) + + # set the Deepspeed log level consistent with the trainer + ds_logger.setLevel(args.get_process_log_level()) + + model_parameters = filter(lambda p: p.requires_grad, model.parameters()) + + model, optimizer, _, lr_scheduler = deepspeed.initialize( + model=model, + model_parameters=model_parameters, + config_params=config, + optimizer=optimizer, + lr_scheduler=lr_scheduler, + ) + + if resume_from_checkpoint is not None: + + # it's possible that the user is trying to resume from model_path, which doesn't necessarily + # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's + # a resume from a checkpoint and not just a local pretrained weight. So we check here if the + # path contains what looks like a deepspeed checkpoint + import glob + + deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) + + if len(deepspeed_checkpoint_dirs) > 0: + logger.info(f"Attempting to resume from {resume_from_checkpoint}") + # this magically updates self.optimizer and self.lr_scheduler + load_path, _ = model.load_checkpoint( + resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True + ) + if load_path is None: + raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") + else: + logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") + + return model, optimizer, lr_scheduler diff --git a/fastNLP/transformers/torch/dependency_versions_check.py b/fastNLP/transformers/torch/dependency_versions_check.py new file mode 100644 index 00000000..30e8f448 --- /dev/null +++ b/fastNLP/transformers/torch/dependency_versions_check.py @@ -0,0 +1,20 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from .dependency_versions_table import deps +from .utils.versions import require_version + +def dep_version_check(pkg, hint=None): + require_version(deps[pkg], hint) diff --git a/fastNLP/transformers/torch/dependency_versions_table.py b/fastNLP/transformers/torch/dependency_versions_table.py new file mode 100644 index 00000000..ef396637 --- /dev/null +++ b/fastNLP/transformers/torch/dependency_versions_table.py @@ -0,0 +1,76 @@ +# THIS FILE HAS BEEN AUTOGENERATED. To update: +# 1. 
modify the `_deps` dict in setup.py +# 2. run `make deps_table_update`` +deps = { + "Pillow": "Pillow", + "black": "black==21.4b0", + "codecarbon": "codecarbon==1.2.0", + "cookiecutter": "cookiecutter==1.7.2", + "dataclasses": "dataclasses", + "datasets": "datasets", + "deepspeed": "deepspeed>=0.5.3", + "docutils": "docutils==0.16.0", + "fairscale": "fairscale>0.3", + "faiss-cpu": "faiss-cpu", + "fastapi": "fastapi", + "filelock": "filelock", + "flake8": "flake8>=3.8.3", + "flax": "flax>=0.3.4", + "fugashi": "fugashi>=1.0", + "GitPython": "GitPython<3.1.19", + "huggingface-hub": "huggingface-hub>=0.0.17", + "importlib_metadata": "importlib_metadata", + "ipadic": "ipadic>=1.0.0,<2.0", + "isort": "isort>=5.5.4", + "jax": "jax>=0.2.8", + "jaxlib": "jaxlib>=0.1.65", + "jieba": "jieba", + "keras2onnx": "keras2onnx", + "nltk": "nltk", + "numpy": "numpy>=1.17", + "onnxconverter-common": "onnxconverter-common", + "onnxruntime-tools": "onnxruntime-tools>=1.4.2", + "onnxruntime": "onnxruntime>=1.4.0", + "optuna": "optuna", + "optax": "optax>=0.0.8", + "packaging": "packaging>=20.0", + "parameterized": "parameterized", + "protobuf": "protobuf", + "psutil": "psutil", + "pyyaml": "pyyaml>=5.1", + "pydantic": "pydantic", + "pytest": "pytest", + "pytest-timeout": "pytest-timeout", + "pytest-xdist": "pytest-xdist", + "python": "python>=3.6.0", + "ray[tune]": "ray[tune]", + "recommonmark": "recommonmark", + "regex": "regex!=2019.12.17", + "requests": "requests", + "rouge-score": "rouge-score", + "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", + "sacremoses": "sacremoses", + "sagemaker": "sagemaker>=2.31.0", + "scikit-learn": "scikit-learn", + "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", + "sigopt": "sigopt", + "soundfile": "soundfile", + "sphinx-copybutton": "sphinx-copybutton", + "sphinx-markdown-tables": "sphinx-markdown-tables", + "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", + "sphinx": "sphinx==3.2.1", + "sphinxext-opengraph": "sphinxext-opengraph==0.4.1", + "sphinx-intl": "sphinx-intl", + "starlette": "starlette", + "tensorflow-cpu": "tensorflow-cpu>=2.3", + "tensorflow": "tensorflow>=2.3", + "timeout-decorator": "timeout-decorator", + "timm": "timm", + "tokenizers": "tokenizers>=0.10.1,<0.11", + "torch": "torch>=1.0", + "torchaudio": "torchaudio", + "tqdm": "tqdm>=4.27", + "unidic": "unidic>=1.0.2", + "unidic_lite": "unidic_lite>=1.0.7", + "uvicorn": "uvicorn", +} diff --git a/fastNLP/transformers/torch/file_utils.py b/fastNLP/transformers/torch/file_utils.py new file mode 100644 index 00000000..2b606b33 --- /dev/null +++ b/fastNLP/transformers/torch/file_utils.py @@ -0,0 +1,934 @@ +import copy +import fnmatch +import importlib.util +import io +import json +import os +import re +import shutil +import sys +import tarfile +import tempfile +import operator +from collections import OrderedDict, UserDict +from contextlib import contextmanager +from dataclasses import fields +from enum import Enum +from functools import partial +from hashlib import sha256 +from pathlib import Path +from typing import Any, BinaryIO, Dict, Optional, Tuple, Union +from urllib.parse import urlparse +from uuid import uuid4 +from zipfile import ZipFile, is_zipfile + +import numpy as np +# from tqdm.auto import tqdm + +import requests + +from . 
import __version__ +from .utils.versions import importlib_metadata +from fastNLP.envs.imports import _NEED_IMPORT_TORCH, _TORCH_GREATER_EQUAL_1_8 +from fastNLP.envs.utils import _compare_version +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + _torch_version = importlib_metadata.version("torch") + +hf_cache_home = os.path.expanduser( + os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) +) +default_cache_path = os.path.join(hf_cache_home, "transformers") + +PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) +PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) +TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) +SESSION_ID = uuid4().hex + +ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} + +DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES + +WEIGHTS_NAME = "pytorch_model.bin" +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + +_staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES +_default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co" + +HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", _default_endpoint) +HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}" + +CONFIG_NAME = "config.json" + +_is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False + +@contextmanager +def filelock(path): + try: + import fcntl + open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC + fd = os.open(path, open_mode) + fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) + except: + pass + + yield + + try: + fcntl.flock(fd, fcntl.LOCK_UN) + os.close(fd) + except: + pass + +def is_offline_mode(): + return _is_offline_mode + +def is_training_run_on_sagemaker(): + return "SAGEMAKER_JOB_NAME" in os.environ + +def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_start_docstrings_to_model_forward(*docstr): + def docstring_decorator(fn): + class_name = f":class:`~transformers.{fn.__qualname__.split('.')[0]}`" + intro = f" The {class_name} forward method, overrides the :func:`__call__` special method." + note = r""" + + .. note:: + Although the recipe for forward pass needs to be defined within this function, one should call the + :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post + processing steps while the latter silently ignores them. + """ + fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") + return fn + + return docstring_decorator + + +def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + "".join(docstr) + return fn + + return docstring_decorator + +PT_RETURN_INTRODUCTION = r""" + Returns: + :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` or a tuple of + :obj:`torch.FloatTensor` (if ``return_dict=False`` is passed or when ``config.return_dict=False``) comprising + various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs. 
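# A minimal sketch, not part of the patch, of what the docstring decorators defined
# above do; ``shared_intro`` and ``forward`` are hypothetical names.
from fastNLP.transformers.torch.file_utils import add_start_docstrings

shared_intro = "Shared intro paragraph.\n\n"

@add_start_docstrings(shared_intro)
def forward(x):
    """Function-specific details."""
    return x

assert forward.__doc__ == "Shared intro paragraph.\n\nFunction-specific details."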
+ +""" + +def _get_indent(t): + """Returns the indentation in the first line of t""" + search = re.search(r"^(\s*)\S", t) + return "" if search is None else search.groups()[0] + + +def _convert_output_args_doc(output_args_doc): + """Convert output_args_doc to display properly.""" + # Split output_arg_doc in blocks argument/description + indent = _get_indent(output_args_doc) + blocks = [] + current_block = "" + for line in output_args_doc.split("\n"): + # If the indent is the same as the beginning, the line is the name of new arg. + if _get_indent(line) == indent: + if len(current_block) > 0: + blocks.append(current_block[:-1]) + current_block = f"{line}\n" + else: + # Otherwise it's part of the description of the current arg. + # We need to remove 2 spaces to the indentation. + current_block += f"{line[2:]}\n" + blocks.append(current_block[:-1]) + + # Format each block for proper rendering + for i in range(len(blocks)): + blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i]) + blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i]) + + return "\n".join(blocks) + +def _prepare_output_docstrings(output_type, config_class): + """ + Prepares the return part of the docstring using `output_type`. + """ + docstrings = output_type.__doc__ + + # Remove the head of the docstring to keep the list of args only + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + docstrings = "\n".join(lines[(i + 1) :]) + docstrings = _convert_output_args_doc(docstrings) + + # Add the return introduction + full_output_type = f"{output_type.__module__}.{output_type.__name__}" + intro = PT_RETURN_INTRODUCTION + intro = intro.format(full_output_type=full_output_type, config_class=config_class) + return intro + docstrings + +PT_TOKEN_CLASSIFICATION_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 + + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_QUESTION_ANSWERING_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + >>> inputs = tokenizer(question, text, return_tensors='pt') + >>> start_positions = torch.tensor([1]) + >>> end_positions = torch.tensor([3]) + + >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions) + >>> loss = outputs.loss + >>> start_scores = outputs.start_logits + >>> end_scores = outputs.end_logits +""" + +PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + >>> outputs = model(**inputs, labels=labels) + >>> loss = 
outputs.loss + >>> logits = outputs.logits +""" + +PT_MASKED_LM_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") + >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] + + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_BASE_MODEL_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state +""" + +PT_MULTIPLE_CHOICE_SAMPLE = r""" + Example:: + + >>> from transformers import {tokenizer_class}, {model_class} + >>> import torch + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> choice0 = "It is eaten with a fork and a knife." + >>> choice1 = "It is eaten while held in the hand." + >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 + + >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True) + >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 + + >>> # the linear classifier still needs to be trained + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_CAUSAL_LM_SAMPLE = r""" + Example:: + + >>> import torch + >>> from transformers import {tokenizer_class}, {model_class} + + >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') + >>> model = {model_class}.from_pretrained('{checkpoint}') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs, labels=inputs["input_ids"]) + >>> loss = outputs.loss + >>> logits = outputs.logits +""" + +PT_SAMPLE_DOCSTRINGS = { + "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE, + "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE, + "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE, + "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE, + "MaskedLM": PT_MASKED_LM_SAMPLE, + "LMHead": PT_CAUSAL_LM_SAMPLE, + "BaseModel": PT_BASE_MODEL_SAMPLE, +} + +def add_code_sample_docstrings( + *docstr, tokenizer_class=None, checkpoint=None, output_type=None, config_class=None, mask=None, model_cls=None +): + def docstring_decorator(fn): + # model_class defaults to function's class if not specified otherwise + model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls + + sample_docstrings = PT_SAMPLE_DOCSTRINGS + + doc_kwargs = dict(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint) + + if "SequenceClassification" in model_class: + code_sample = sample_docstrings["SequenceClassification"] + elif "QuestionAnswering" in model_class: + code_sample = sample_docstrings["QuestionAnswering"] + elif "TokenClassification" in model_class: + code_sample = sample_docstrings["TokenClassification"] + elif 
"MultipleChoice" in model_class: + code_sample = sample_docstrings["MultipleChoice"] + elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]: + doc_kwargs["mask"] = "[MASK]" if mask is None else mask + code_sample = sample_docstrings["MaskedLM"] + elif "LMHead" in model_class or "CausalLM" in model_class: + code_sample = sample_docstrings["LMHead"] + elif "Model" in model_class or "Encoder" in model_class: + code_sample = sample_docstrings["BaseModel"] + else: + raise ValueError(f"Docstring can't be built for model {model_class}") + + output_doc = _prepare_output_docstrings(output_type, config_class) if output_type is not None else "" + built_doc = code_sample.format(**doc_kwargs) + fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + output_doc + built_doc + return fn + + return docstring_decorator + +def replace_return_docstrings(output_type=None, config_class=None): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + lines[i] = _prepare_output_docstrings(output_type, config_class) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https") + +def hf_bucket_url( + model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None +) -> str: + """ + Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting + to Cloudfront (a Content Delivery Network, or CDN) for large files. + + Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our + bandwidth costs). + + Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here + because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront + in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache + can't ever be stale. + + In terms of client-side caching from this library, we base our caching on the objects' ETag. An object' ETag is: + its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0 + are not shared with those new files, because the cached file's name contains a hash of the url (which changed). + """ + if subfolder is not None: + filename = f"{subfolder}/{filename}" + + if mirror: + if mirror in ["tuna", "bfsu"]: + raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.") + legacy_format = "/" not in model_id + if legacy_format: + return f"{mirror}/{model_id}-{filename}" + else: + return f"{mirror}/{model_id}/{filename}" + + if revision is None: + revision = "main" + return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename) + +def url_to_filename(url: str, etag: Optional[str] = None) -> str: + """ + Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, + delimited by a period. 
If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can + identify it as a HDF5 file (see + https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode("utf-8") + filename = sha256(url_bytes).hexdigest() + + if etag: + etag_bytes = etag.encode("utf-8") + filename += "." + sha256(etag_bytes).hexdigest() + + if url.endswith(".h5"): + filename += ".h5" + + return filename + +def cached_path( + url_or_filename, + cache_dir=None, + force_download=False, + proxies=None, + resume_download=False, + user_agent: Union[Dict, str, None] = None, + extract_compressed_file=False, + force_extract=False, + use_auth_token: Union[bool, str, None] = None, + local_files_only=False, +) -> Optional[str]: + """ + Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file + and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and + then return the path + + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-download the file even if it's already cached in the cache dir. + resume_download: if True, resume the download if incompletely received file is found. + user_agent: Optional string or dict that will be appended to the user-agent on remote requests. + use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True, + will get token from ~/.huggingface. + extract_compressed_file: if True and the path point to a zip or tar file, extract the compressed + file in a folder along the archive. + force_extract: if True when extract_compressed_file is True and the archive was already extracted, + re-extract the archive and override the folder where it was extracted. + + Return: + Local path (string) of file or if networking is off, last version of file cached on disk. + + Raises: + In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + if is_remote_url(url_or_filename): + # URL, so get it from the cache (downloading if necessary) + output_path = get_from_cache( + url_or_filename, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + user_agent=user_agent, + use_auth_token=use_auth_token, + local_files_only=local_files_only, + ) + elif os.path.exists(url_or_filename): + # File, and it exists. + output_path = url_or_filename + elif urlparse(url_or_filename).scheme == "": + # File, but it doesn't exist. + raise EnvironmentError(f"file {url_or_filename} not found") + else: + # Something unknown + raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path") + + if extract_compressed_file: + if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): + return output_path + + # Path where we extract compressed archives + # We avoid '.' 
in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" + output_dir, output_file = os.path.split(output_path) + output_extract_dir_name = output_file.replace(".", "-") + "-extracted" + output_path_extracted = os.path.join(output_dir, output_extract_dir_name) + + if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: + return output_path_extracted + + # Prevent parallel extractions + lock_path = output_path + ".lock" + with filelock(lock_path): + shutil.rmtree(output_path_extracted, ignore_errors=True) + os.makedirs(output_path_extracted) + if is_zipfile(output_path): + with ZipFile(output_path, "r") as zip_file: + zip_file.extractall(output_path_extracted) + zip_file.close() + elif tarfile.is_tarfile(output_path): + tar_file = tarfile.open(output_path) + tar_file.extractall(output_path_extracted) + tar_file.close() + else: + raise EnvironmentError(f"Archive format of {output_path} could not be identified") + + return output_path_extracted + + return output_path + +def define_sagemaker_information(): + try: + instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json() + dlc_container_used = instance_data["Image"] + dlc_tag = instance_data["Image"].split(":")[1] + except Exception: + dlc_container_used = None + dlc_tag = None + + sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}")) + runs_distributed_training = True if "sagemaker_distributed_dataparallel_enabled" in sagemaker_params else False + account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None + + sagemaker_object = { + "sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None), + "sm_region": os.getenv("AWS_REGION", None), + "sm_number_gpu": os.getenv("SM_NUM_GPUS", 0), + "sm_number_cpu": os.getenv("SM_NUM_CPUS", 0), + "sm_distributed_training": runs_distributed_training, + "sm_deep_learning_container": dlc_container_used, + "sm_deep_learning_container_tag": dlc_tag, + "sm_account_id": account_id, + } + return sagemaker_object + +def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: + """ + Formats a user-agent string with basic info about a request. + """ + ua = f"transformers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}" + if _NEED_IMPORT_TORCH: + ua += f"; torch/{_torch_version}" + if DISABLE_TELEMETRY: + return ua + "; telemetry/off" + if is_training_run_on_sagemaker(): + ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items()) + # CI will set this value to True + if os.environ.get("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES: + ua += "; is_ci/true" + if isinstance(user_agent, dict): + ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items()) + elif isinstance(user_agent, str): + ua += "; " + user_agent + return ua + +def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None): + """ + Download remote file. Do not gobble up errors. 
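# Hypothetical usage, not part of the patch, of ``hf_bucket_url`` and ``cached_path``
# defined above (the remote case needs network access; the model id is just an example).
from fastNLP.transformers.torch.file_utils import cached_path, hf_bucket_url

url = hf_bucket_url("bert-base-uncased", "config.json")   # .../resolve/main/config.json
local_path = cached_path(url)          # downloaded into TRANSFORMERS_CACHE, local path returned
same_path = cached_path(local_path)    # an existing local path is returned unchanged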
+ """ + headers = copy.deepcopy(headers) + if resume_size > 0: + headers["Range"] = f"bytes={resume_size}-" + r = requests.get(url, stream=True, proxies=proxies, headers=headers) + r.raise_for_status() + content_length = r.headers.get("Content-Length") + total = resume_size + int(content_length) if content_length is not None else None + # progress = tqdm( + # unit="B", + # unit_scale=True, + # unit_divisor=1024, + # total=total, + # initial=resume_size, + # desc="Downloading", + # disable=bool(logging.get_verbosity() == logging.NOTSET), + # ) + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + # progress.update(len(chunk)) + temp_file.write(chunk) + # progress.close() + +def get_from_cache( + url: str, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=10, + resume_download=False, + user_agent: Union[Dict, str, None] = None, + use_auth_token: Union[bool, str, None] = None, + local_files_only=False, +) -> Optional[str]: + """ + Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the + path to the cached file. + + Return: + Local path (string) of file or if networking is off, last version of file cached on disk. + + Raises: + In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + headers = {"user-agent": http_user_agent(user_agent)} + if isinstance(use_auth_token, str): + headers["authorization"] = f"Bearer {use_auth_token}" + elif use_auth_token: + raise RuntimeError("`use_auth_token=True` is not supported in FastNLP now") + # token = HfFolder.get_token() + # if token is None: + # raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.") + # headers["authorization"] = f"Bearer {token}" + + url_to_download = url + etag = None + if not local_files_only: + try: + r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout) + r.raise_for_status() + etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag") + # We favor a custom header indicating the etag of the linked resource, and + # we fallback to the regular etag header. + # If we don't have any of those, raise an error. + if etag is None: + raise OSError( + "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." + ) + # In case of a redirect, + # save an extra redirect on the request.get call, + # and ensure we download the exact atomic version even if it changed + # between the HEAD and the GET (unlikely, but hey). + if 300 <= r.status_code <= 399: + url_to_download = r.headers["Location"] + except (requests.exceptions.SSLError, requests.exceptions.ProxyError): + # Actually raise for those subclasses of ConnectionError + raise + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + # Otherwise, our Internet connection is down. + # etag is None + pass + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # etag is None == we don't have a connection or we passed local_files_only. 
+ # try to get the last downloaded one + if etag is None: + if os.path.exists(cache_path): + return cache_path + else: + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*") + if not file.endswith(".json") and not file.endswith(".lock") + ] + if len(matching_files) > 0: + return os.path.join(cache_dir, matching_files[-1]) + else: + # If files cannot be found and local_files_only=True, + # the models might've been found if local_files_only=False + # Notify the user about that + if local_files_only: + raise FileNotFoundError( + "Cannot find the requested files in the cached path and outgoing traffic has been" + " disabled. To enable model look-ups and downloads online, set 'local_files_only'" + " to False." + ) + else: + raise ValueError( + "Connection error, and we cannot find the requested files in the cached path." + " Please try again or make sure your Internet connection is on." + ) + + # From now on, etag is not None. + if os.path.exists(cache_path) and not force_download: + return cache_path + + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + ".lock" + with filelock(lock_path): + + # If the download just completed while the lock was activated. + if os.path.exists(cache_path) and not force_download: + # Even if returning early like here, the lock will be released. + return cache_path + + if resume_download: + incomplete_path = cache_path + ".incomplete" + + @contextmanager + def _resumable_file_manager() -> "io.BufferedWriter": + with open(incomplete_path, "ab") as f: + yield f + + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + else: + resume_size = 0 + else: + temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False) + resume_size = 0 + + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}") + + http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers) + + logger.info(f"storing {url} in cache at {cache_path}") + os.replace(temp_file.name, cache_path) + + # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it. + umask = os.umask(0o666) + os.umask(umask) + os.chmod(cache_path, 0o666 & ~umask) + + logger.info(f"creating metadata file for {cache_path}") + meta = {"url": url, "etag": etag} + meta_path = cache_path + ".json" + with open(meta_path, "w") as meta_file: + json.dump(meta, meta_file) + + return cache_path + +def is_torch_fx_available(): + return _TORCH_GREATER_EQUAL_1_8 and _compare_version("torch", operator.lt, "1.9.0") + +def is_torch_fx_proxy(x): + if is_torch_fx_available(): + import torch.fx + + return isinstance(x, torch.fx.Proxy) + return False + +def is_sentencepiece_available(): + return importlib.util.find_spec("sentencepiece") is not None + +def is_tokenizers_available(): + return importlib.util.find_spec("tokenizers") is not None + +def is_tensor(x): + """ + Tests if ``x`` is a :obj:`torch.Tensor`, :obj:`tf.Tensor`, obj:`jaxlib.xla_extension.DeviceArray` or + :obj:`np.ndarray`. 
+ """ + if is_torch_fx_proxy(x): + return True + + if isinstance(x, torch.Tensor): + return True + + return isinstance(x, np.ndarray) + +def to_py_obj(obj): + """ + Convert a TensorFlow tensor, PyTorch tensor, Numpy array or python list to a python list. + """ + if isinstance(obj, (dict, UserDict)): + return {k: to_py_obj(v) for k, v in obj.items()} + elif isinstance(obj, (list, tuple)): + return [to_py_obj(o) for o in obj] + elif _NEED_IMPORT_TORCH and _is_torch(obj): + return obj.detach().cpu().tolist() + elif isinstance(obj, np.ndarray): + return obj.tolist() + else: + return obj + +def _is_numpy(x): + return isinstance(x, np.ndarray) + +def _is_torch(x): + import torch + + return isinstance(x, torch.Tensor) + + +def _is_torch_device(x): + import torch + + return isinstance(x, torch.device) + +class ModelOutput(OrderedDict): + """ + Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like + a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular + python dictionary. + + .. warning:: + You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple` + method to convert it to a tuple before. + """ + + def __post_init__(self): + class_fields = fields(self) + + # Safety and consistency checks + assert len(class_fields), f"{self.__class__.__name__} has no fields." + assert all( + field.default is None for field in class_fields[1:] + ), f"{self.__class__.__name__} should not have more than one required field." + + first_field = getattr(self, class_fields[0].name) + other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) + + if other_fields_are_none and not is_tensor(first_field): + if isinstance(first_field, dict): + iterator = first_field.items() + first_field_iterator = True + else: + try: + iterator = iter(first_field) + first_field_iterator = True + except TypeError: + first_field_iterator = False + + # if we provided an iterator as first field and the iterator is a (key, value) iterator + # set the associated fields + if first_field_iterator: + for element in iterator: + if ( + not isinstance(element, (list, tuple)) + or not len(element) == 2 + or not isinstance(element[0], str) + ): + break + setattr(self, element[0], element[1]) + if element[1] is not None: + self[element[0]] = element[1] + elif first_field is not None: + self[class_fields[0].name] = first_field + else: + for field in class_fields: + v = getattr(self, field.name) + if v is not None: + self[field.name] = v + + def __delitem__(self, *args, **kwargs): + raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") + + def setdefault(self, *args, **kwargs): + raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") + + def pop(self, *args, **kwargs): + raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") + + def update(self, *args, **kwargs): + raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") + + def __getitem__(self, k): + if isinstance(k, str): + inner_dict = {k: v for (k, v) in self.items()} + return inner_dict[k] + else: + return self.to_tuple()[k] + + def __setattr__(self, name, value): + if name in self.keys() and value is not None: + # Don't call self.__setitem__ to avoid recursion errors + super().__setitem__(name, value) + super().__setattr__(name, value) + + def __setitem__(self, key, 
value): + # Will raise a KeyException if needed + super().__setitem__(key, value) + # Don't call self.__setattr__ to avoid recursion errors + super().__setattr__(key, value) + + def to_tuple(self) -> Tuple[Any]: + """ + Convert self to a tuple containing all the attributes/keys that are not ``None``. + """ + return tuple(self[k] for k in self.keys()) + + +class ExplicitEnum(Enum): + """ + Enum with more explicit error message for missing values. + """ + + @classmethod + def _missing_(cls, value): + raise ValueError( + f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" + ) + + +class PaddingStrategy(ExplicitEnum): + """ + Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion + in an IDE. + """ + + LONGEST = "longest" + MAX_LENGTH = "max_length" + DO_NOT_PAD = "do_not_pad" + + +class TensorType(ExplicitEnum): + """ + Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for + tab-completion in an IDE. + """ + + PYTORCH = "pt" + NUMPY = "np" diff --git a/fastNLP/transformers/torch/generation_beam_search.py b/fastNLP/transformers/torch/generation_beam_search.py new file mode 100644 index 00000000..117d9a38 --- /dev/null +++ b/fastNLP/transformers/torch/generation_beam_search.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from collections import UserDict +from typing import Optional, Tuple + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + + +PROCESS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. + next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): + Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. 
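# A minimal sketch, not part of the patch, of how the ``ModelOutput`` base class defined
# above behaves; ``ToyOutput`` is a hypothetical subclass.
from dataclasses import dataclass
from typing import Optional
import torch
from fastNLP.transformers.torch.file_utils import ModelOutput

@dataclass
class ToyOutput(ModelOutput):
    loss: Optional["torch.FloatTensor"] = None
    logits: Optional["torch.FloatTensor"] = None

out = ToyOutput(loss=torch.tensor(0.5), logits=torch.ones(2, 3))
out["loss"]        # dict-style access
out.logits         # attribute access
out.to_tuple()     # (loss, logits); fields left as None would be skipped
out[1]             # integer indexing goes through to_tuple()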
+ + Return: + :obj:`UserDict`: A dictionary composed of the fields as defined above: + + - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated + scores of all non-finished beams. + - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens + to be added to the non-finished beam_hypotheses. + - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices + indicating to which beam the next tokens shall be added. + +""" + +FINALIZE_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The final scores of all non-finished beams. + final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The last tokens to be added to the non-finished beam_hypotheses. + final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): + The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + + Return: + :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated + sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all + batches finished early due to the :obj:`eos_token_id`. + +""" + + +class BeamScorer(ABC): + """ + Abstract base class for all beam scorers that are used for :meth:`~transformers.PreTrainedModel.beam_search` and + :meth:`~transformers.PreTrainedModel.beam_sample`. + """ + + @abstractmethod + @add_start_docstrings(PROCESS_INPUTS_DOCSTRING) + def process( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + **kwargs + ) -> Tuple["torch.Tensor"]: + raise NotImplementedError("This is an abstract method.") + + @abstractmethod + @add_start_docstrings(FINALIZE_INPUTS_DOCSTRING) + def finalize( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + max_length: int, + **kwargs + ) -> "torch.LongTensor": + raise NotImplementedError("This is an abstract method.") + + +class BeamSearchScorer(BeamScorer): + r""" + :class:`transformers.BeamScorer` implementing standard beam search decoding. + + Adapted in part from `Facebook's XLM beam search code + `__. + + Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation + `__ + + Args: + batch_size (:obj:`int`): + Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel. + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + num_beams (:obj:`int`): + Number of beams for beam search. 
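# Hypothetical instantiation, not part of the patch, of the scorer documented here.
import torch
from fastNLP.transformers.torch.generation_beam_search import BeamSearchScorer

scorer = BeamSearchScorer(batch_size=2, num_beams=4, device=torch.device("cpu"))
# Each call to scorer.process() receives the top 2 * num_beams candidates per batch
# entry and returns the num_beams continuations to keep; scorer.finalize() then picks
# the best finished hypotheses.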
+ device (:obj:`torch.device`): + Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of + :obj:`BeamSearchScorer` will be allocated. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): + The number of beam hypotheses that shall be returned upon calling + :meth:`~transformer.BeamSearchScorer.finalize`. + num_beam_groups (:obj:`int`): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. See `this paper `__ for more details. + """ + + def __init__( + self, + batch_size: int, + num_beams: int, + device: "torch.device", + length_penalty: Optional[float] = 1.0, + do_early_stopping: Optional[bool] = False, + num_beam_hyps_to_keep: Optional[int] = 1, + num_beam_groups: Optional[int] = 1, + **kwargs, + ): + self.num_beams = num_beams + self.device = device + self.length_penalty = length_penalty + self.do_early_stopping = do_early_stopping + self.num_beam_hyps_to_keep = num_beam_hyps_to_keep + self.num_beam_groups = num_beam_groups + self.group_size = self.num_beams // self.num_beam_groups + + self._is_init = False + self._beam_hyps = [ + BeamHypotheses( + num_beams=self.num_beams, + length_penalty=self.length_penalty, + early_stopping=self.do_early_stopping, + ) + for _ in range(batch_size) + ] + self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device) + + if not isinstance(num_beams, int) or num_beams <= 1: + raise ValueError( + f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1, one should make use of `greedy_search` instead." + ) + + if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): + raise ValueError( + f"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` " + f"has to be divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." + ) + + if "max_length" in kwargs: + logger.warn( + "Passing `max_length` to BeamSearchScorer is deprecated and has no effect." + "`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`" + ",or `group_beam_search(...)`." 
+ ) + + @property + def is_done(self) -> bool: + return self._done.all() + + def process( + self, + input_ids: "torch.LongTensor", + next_scores: "torch.FloatTensor", + next_tokens: "torch.LongTensor", + next_indices: "torch.LongTensor", + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + ) -> Tuple["torch.Tensor"]: + cur_len = input_ids.shape[-1] + batch_size = len(self._beam_hyps) + assert batch_size == (input_ids.shape[0] // self.group_size) + + device = input_ids.device + next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) + next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) + next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) + + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + assert ( + len(beam_hyp) >= self.num_beams + ), f"Batch can only be done if at least {self.num_beams} beams have been generated" + assert ( + eos_token_id is not None and pad_token_id is not None + ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" + # pad the batch + next_beam_scores[batch_idx, :] = 0 + next_beam_tokens[batch_idx, :] = pad_token_id + next_beam_indices[batch_idx, :] = 0 + continue + + # next tokens for this sentence + beam_idx = 0 + for beam_token_rank, (next_token, next_score, next_index) in enumerate( + zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) + ): + batch_beam_idx = batch_idx * self.group_size + next_index + # add to generated hypotheses if end of sentence + if (eos_token_id is not None) and (next_token.item() == eos_token_id): + # if beam_token does not belong to top num_beams tokens, it should not be added + is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size + if is_beam_token_worse_than_top_num_beams: + continue + beam_hyp.add( + input_ids[batch_beam_idx].clone(), + next_score.item(), + ) + else: + # add next predicted token since it is not eos_token + next_beam_scores[batch_idx, beam_idx] = next_score + next_beam_tokens[batch_idx, beam_idx] = next_token + next_beam_indices[batch_idx, beam_idx] = batch_beam_idx + beam_idx += 1 + + # once the beam for next step is full, don't add more tokens to it. + if beam_idx == self.group_size: + break + + if beam_idx < self.group_size: + raise ValueError( + f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." 
+ ) + + # Check if we are done so that we can save a pad step if all(done) + self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( + next_scores[batch_idx].max().item(), cur_len + ) + + return UserDict( + { + "next_beam_scores": next_beam_scores.view(-1), + "next_beam_tokens": next_beam_tokens.view(-1), + "next_beam_indices": next_beam_indices.view(-1), + } + ) + + def finalize( + self, + input_ids: "torch.LongTensor", + final_beam_scores: "torch.FloatTensor", + final_beam_tokens: "torch.LongTensor", + final_beam_indices: "torch.LongTensor", + max_length: int, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + ) -> Tuple["torch.LongTensor"]: + batch_size = len(self._beam_hyps) + + # finalize all open beam hypotheses and add to generated hypotheses + for batch_idx, beam_hyp in enumerate(self._beam_hyps): + if self._done[batch_idx]: + continue + + # all open beam hypotheses are added to the beam hypothesis + # beam hypothesis class automatically keeps the best beams + for beam_id in range(self.num_beams): + batch_beam_idx = batch_idx * self.num_beams + beam_id + final_score = final_beam_scores[batch_beam_idx].item() + final_tokens = input_ids[batch_beam_idx] + beam_hyp.add(final_tokens, final_score) + + # select the best hypotheses + sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) + best = [] + best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) + + # retrieve best hypotheses + for i, beam_hyp in enumerate(self._beam_hyps): + sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) + for j in range(self.num_beam_hyps_to_keep): + best_hyp_tuple = sorted_hyps.pop() + best_score = best_hyp_tuple[0] + best_hyp = best_hyp_tuple[1] + sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) + + # append to lists + best.append(best_hyp) + best_scores[i * self.num_beam_hyps_to_keep + j] = best_score + + # prepare for adding eos + sent_max_len = min(sent_lengths.max().item() + 1, max_length) + decoded: "torch.LongTensor" = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) + # shorter batches are padded if needed + if sent_lengths.min().item() != sent_lengths.max().item(): + assert pad_token_id is not None, "`pad_token_id` has to be defined" + decoded.fill_(pad_token_id) + + # fill with hypotheses and eos_token_id if the latter fits in + for i, hypo in enumerate(best): + decoded[i, : sent_lengths[i]] = hypo + if sent_lengths[i] < max_length: + decoded[i, sent_lengths[i]] = eos_token_id + return UserDict( + { + "sequences": decoded, + "sequence_scores": best_scores, + } + ) + + +class BeamHypotheses: + def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool): + """ + Initialize n-best list of hypotheses. + """ + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.num_beams = num_beams + self.beams = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.beams) + + def add(self, hyp: "torch.LongTensor", sum_logprobs: float): + """ + Add a new hypothesis to the list. 
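# Worked example, not part of the patch, of the hypothesis score computed in ``add``
# below; the numbers are made up.
sum_logprobs = -6.0          # total log-probability of a 5-token hypothesis
length = 5
length_penalty = 1.0
score = sum_logprobs / (length ** length_penalty)   # -1.2; higher (less negative) is better
# A length_penalty > 1.0 divides by a larger factor and so favours longer sequences;
# a value < 1.0 favours shorter ones.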
+ """ + score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) + if len(self) < self.num_beams or score > self.worst_score: + self.beams.append((score, hyp)) + if len(self) > self.num_beams: + sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) + del self.beams[sorted_next_scores[0][1]] + self.worst_score = sorted_next_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: + """ + If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst + one in the heap, then we are done with this sentence. + """ + + if len(self) < self.num_beams: + return False + elif self.early_stopping: + return True + else: + cur_score = best_sum_logprobs / cur_len ** self.length_penalty + ret = self.worst_score >= cur_score + return ret diff --git a/fastNLP/transformers/torch/generation_logits_process.py b/fastNLP/transformers/torch/generation_logits_process.py new file mode 100644 index 00000000..e97b62be --- /dev/null +++ b/fastNLP/transformers/torch/generation_logits_process.py @@ -0,0 +1,618 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import math +from abc import ABC +from typing import Callable, Iterable, List, Optional + +import numpy as np + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +LOGITS_PROCESSOR_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam + search or log softmax for each vocabulary token when using beam search + kwargs: + Additional logits processor specific kwargs. + + Return: + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores. + +""" + + +class LogitsProcessor(ABC): + """Abstract base class for all logit processors that can be applied during generation.""" + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + """Torch method for processing logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." 
+ ) + + +class LogitsWarper(ABC): + """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + """Torch method for warping logits.""" + raise NotImplementedError( + f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." + ) + + +class LogitsProcessorList(list): + """ + This class can be used to create a list of :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from + list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or + :class:`~transformers.LogitsWarper` to the inputs. + """ + + @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> "torch.FloatTensor": + for processor in self: + function_args = inspect.signature(processor.__call__).parameters + if len(function_args) > 2: + assert all( + arg in kwargs for arg in list(function_args.keys())[2:] + ), f"Make sure that all the required parameters: {list(function_args.keys())} for {processor.__class__} are passed to the logits processor." + scores = processor(input_ids, scores, **kwargs) + else: + scores = processor(input_ids, scores) + return scores + + +class MinLengthLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. + + Args: + min_length (:obj:`int`): + The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. + """ + + def __init__(self, min_length: int, eos_token_id: int): + if not isinstance(min_length, int) or min_length < 0: + raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}") + + if not isinstance(eos_token_id, int) or eos_token_id < 0: + raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}") + + self.min_length = min_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len < self.min_length: + scores[:, self.eos_token_id] = -float("inf") + return scores + + +class TemperatureLogitsWarper(LogitsWarper): + r""" + :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution). + + Args: + temperature (:obj:`float`): + The value used to module the logits distribution. + """ + + def __init__(self, temperature: float): + if not isinstance(temperature, float) or not (temperature > 0): + raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}") + + self.temperature = temperature + + def __call__(self, input_ids: "torch.Tensor", scores: "torch.Tensor") -> "torch.FloatTensor": + scores = scores / self.temperature + return scores + + +class RepetitionPenaltyLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences. + + Args: + repetition_penalty (:obj:`float`): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. 
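+
+        A minimal sketch (the token ids, penalty and vocabulary size are illustrative only)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_logits_process import RepetitionPenaltyLogitsProcessor
+            >>> processor = RepetitionPenaltyLogitsProcessor(penalty=1.2)
+            >>> input_ids = torch.tensor([[5, 7]])       # tokens generated so far
+            >>> scores = torch.ones(1, 10)               # fake logits over a vocabulary of size 10
+            >>> scores = processor(input_ids, scores)    # logits of tokens 5 and 7 drop from 1.0 to 1.0 / 1.2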
+ """ + + def __init__(self, penalty: float): + if not isinstance(penalty, float) or not (penalty > 0): + raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") + + self.penalty = penalty + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + score = torch.gather(scores, 1, input_ids) + + # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability + score = torch.where(score < 0, score * self.penalty, score / self.penalty) + + scores.scatter_(1, input_ids, score) + return scores + + +class TopPLogitsWarper(LogitsWarper): + """ + :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= + prob_cut_off. + + Args: + top_p (:obj:`float`): + If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are + kept for generation. + filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimum number of tokens that cannot be filtered. + """ + + def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + top_p = float(top_p) + if top_p < 0 or top_p > 1.0: + raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}") + + self.top_p = top_p + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + sorted_logits, sorted_indices = torch.sort(scores, descending=True) + cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) + + # Remove tokens with cumulative top_p above the threshold (token with 0 are kept) + sorted_indices_to_remove = cumulative_probs > self.top_p + if self.min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0 + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +class TopKLogitsWarper(LogitsWarper): + r""" + :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements. + + Args: + top_k (:obj:`int`): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): + All filtered values will be set to this float value. + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimum number of tokens that cannot be filtered. 
+ """ + + def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): + if not isinstance(top_k, int) or top_k <= 0: + raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}") + + self.top_k = top_k + self.filter_value = filter_value + self.min_tokens_to_keep = min_tokens_to_keep + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None] + scores = scores.masked_fill(indices_to_remove, self.filter_value) + return scores + + +def _get_ngrams(ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int): + generated_ngrams = [{} for _ in range(num_hypos)] + for idx in range(num_hypos): + gen_tokens = prev_input_ids[idx].tolist() + generated_ngram = generated_ngrams[idx] + for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): + prev_ngram_tuple = tuple(ngram[:-1]) + generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] + return generated_ngrams + + +def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): + # Before decoding the next token, prevent decoding of ngrams that have already appeared + start_idx = cur_len + 1 - ngram_size + ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) + return banned_ngrams.get(ngram_idx, []) + + +def _calc_banned_ngram_tokens( + ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int, cur_len: int +) -> List[Iterable[int]]: + """Copied from fairseq for no_repeat_ngram in beam_search""" + if cur_len + 1 < ngram_size: + # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet + return [[] for _ in range(num_hypos)] + + generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) + + banned_tokens = [ + _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) + for hypo_idx in range(num_hypos) + ] + return banned_tokens + + +class NoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq + `__. + + Args: + ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur once. + """ + + def __init__(self, ngram_size: int): + if not isinstance(ngram_size, int) or ngram_size <= 0: + raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") + self.ngram_size = ngram_size + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + num_batch_hypotheses = scores.shape[0] + cur_len = input_ids.shape[-1] + banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores + + +class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids. + See `ParlAI `__. + + Args: + encoder_ngram_size (:obj:`int`): + All ngrams of size :obj:`ngram_size` can only occur within the encoder input ids. + encoder_input_ids (:obj:`int`): + The encoder_input_ids that should not be repeated within the decoder ids. 
+ """ + + def __init__(self, encoder_ngram_size: int, encoder_input_ids: "torch.LongTensor"): + if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0: + raise ValueError( + f"`encoder_ngram_size` has to be a strictly positive integer, but is {encoder_ngram_size}" + ) + self.ngram_size = encoder_ngram_size + if len(encoder_input_ids.shape) == 1: + encoder_input_ids = encoder_input_ids.unsqueeze(0) + self.batch_size = encoder_input_ids.shape[0] + self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size) + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + # B x num_beams + num_hypos = scores.shape[0] + num_beams = num_hypos // self.batch_size + cur_len = input_ids.shape[-1] + banned_batch_tokens = [ + _get_generated_ngrams( + self.generated_ngrams[hypo_idx // num_beams], input_ids[hypo_idx], self.ngram_size, cur_len + ) + for hypo_idx in range(num_hypos) + ] + + for i, banned_tokens in enumerate(banned_batch_tokens): + scores[i, banned_tokens] = -float("inf") + + return scores + + +class NoBadWordsLogitsProcessor(LogitsProcessor): + """ + :class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled. + + Args: + bad_words_ids (:obj:`List[List[int]]`): + List of list of token ids that are not allowed to be generated. In order to get the tokens of the words + that should not appear in the generated text, use :obj:`tokenizer(bad_word, + add_prefix_space=True).input_ids`. + eos_token_id (:obj:`int`): + The id of the `end-of-sequence` token. + """ + + def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): + + if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: + raise ValueError(f"`bad_words_ids` has to be a non-emtpy list, but is {bad_words_ids}.") + if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): + raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") + if any( + any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) + for bad_word_ids in bad_words_ids + ): + raise ValueError( + f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." 
+ ) + + bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) + self.bad_words_id_length_1 = [] + self.bad_words_id_length_greater_than_1 = [] + for word in bad_words_ids: + if len(word) == 1: + self.bad_words_id_length_1.append(word[0]) + else: + self.bad_words_id_length_greater_than_1.append(word) + + self.static_bad_words_mask: Optional[torch.LongTensor] = None + + for banned_token_seq in self.bad_words_id_length_greater_than_1: + assert len(banned_token_seq) > 0, f"Banned words token sequences {bad_words_ids} cannot have an empty list" + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: + self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) + + dynamic_banned_tokens = self._calc_banned_bad_words_ids(input_ids.tolist()) + scores = self._set_scores_to_inf_for_banned_tokens(scores, dynamic_banned_tokens) + + return scores + + def _calc_static_bad_word_mask(self, scores: "torch.FloatTensor") -> "torch.BoolTensor": + static_bad_words_mask = torch.zeros(scores.shape[1]) + static_bad_words_mask[self.bad_words_id_length_1] = 1 + return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() + + def _tokens_match(self, prev_tokens: List[int], tokens: List[int]) -> bool: + if len(tokens) == 0: + # if bad word tokens is just one token always ban it + return True + elif len(tokens) > len(prev_tokens): + # if bad word tokens are longer then prev input_ids they can't be equal + return False + else: + return prev_tokens[-len(tokens) :] == tokens + + def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterable[int]: + banned_tokens = [] + for prev_input_ids_slice in prev_input_ids: + banned_tokens_slice = [] + for banned_token_seq in self.bad_words_id_length_greater_than_1: + if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]): + banned_tokens_slice.append(banned_token_seq[-1]) + + banned_tokens.append(banned_tokens_slice) + + return banned_tokens + + def _set_scores_to_inf_for_banned_tokens( + self, scores: "torch.Tensor", banned_tokens: List[List[int]] + ) -> "torch.Tensor": + """ + Modifies the scores in place by setting the banned token positions to `-inf`. Banned token is expected to be a + list of list of banned tokens to ban in the format [[batch index, vocabulary position],... + + Args: + scores: logits distribution of shape (batch size, vocabulary size) + banned_tokens: list of list of tokens to ban of length (batch_size) + """ + banned_mask_list = [] + for idx, batch_banned_tokens in enumerate(banned_tokens): + for token in batch_banned_tokens: + # Eliminates invalid bad word IDs that are over the vocabulary size. + if token <= scores.shape[1]: + banned_mask_list.append([idx, token]) + else: + logger.error( + f"An invalid bad word ID is defined: {token}. This ID is not contained in the" + f"vocabulary, and is therefore ignored." + ) + if not banned_mask_list and self.static_bad_words_mask is None: + return scores + + else: + if banned_mask_list: + banned_mask = torch.LongTensor(banned_mask_list) + indices = torch.ones(len(banned_mask)) + # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. 
A conversion to dense tensor generates: + # [ 0 1 1 ] + # [ 0 0 0 ] + # [ 1 0 0 ] + + banned_mask = ( + torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) + .to(scores.device) + .to_dense() + .bool() + ) + + if self.static_bad_words_mask is not None: + banned_mask = torch.bitwise_or(banned_mask, self.static_bad_words_mask) + else: + banned_mask = self.static_bad_words_mask + + scores = scores.masked_fill(banned_mask, -float("inf")) + return scores + + +class PrefixConstrainedLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned + constrained generation. See `Autoregressive Entity Retrieval `__ for more + information. + + Args: + prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`): + This function constraints the beam search to allowed tokens only at each step. This function takes 2 + arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed + tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and + the batch ID :obj:`batch_id`. + """ + + def __init__(self, prefix_allowed_tokens_fn: Callable[[int, "torch.Tensor"], List[int]], num_beams: int): + self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn + self._num_beams = num_beams + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + mask = torch.full_like(scores, -math.inf) + for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])): + for beam_id, sent in enumerate(beam_sent): + mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0 + + return scores + mask + + +class HammingDiversityLogitsProcessor(LogitsProcessor): + r""" + :class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only + effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse + Solutions from Neural Sequence Models `__ for more details. + + Args: + diversity_penalty (:obj:`float`): + This value is subtracted from a beam's score if it generates a token same as any beam from other group at a + particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled. + num_beams (:obj:`int`): + Number of beams used for group beam search. See `this paper `__ for + more details. + num_beam_groups (:obj:`int`): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. See `this paper `__ for more details. 
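+
+        A minimal sketch of a single group-beam-search step with batch size 1, ``num_beams=4`` and
+        ``num_beam_groups=2`` (all token ids and scores are illustrative only)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_logits_process import HammingDiversityLogitsProcessor
+            >>> processor = HammingDiversityLogitsProcessor(diversity_penalty=1.0, num_beams=4, num_beam_groups=2)
+            >>> input_ids = torch.tensor([[2, 3], [2, 3]])     # the two beams of the second group
+            >>> scores = torch.zeros(2, 10)                    # scores for the second group only
+            >>> current_tokens = torch.tensor([7, 9, 0, 0])    # the first group already picked tokens 7 and 9
+            >>> scores = processor(input_ids, scores, current_tokens=current_tokens, beam_group_idx=1)
+            >>> # scores[:, 7] and scores[:, 9] are now -1.0, discouraging the second group from repeating them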
+ """ + + def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): + if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): + raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") + self._diversity_penalty = diversity_penalty + if not isinstance(num_beams, int) or num_beams < 2: + raise ValueError("`num_beams` should be an integer strictly larger than 1.") + self._num_beams = num_beams + if not isinstance(num_beam_groups, int) or num_beam_groups < 2: + raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") + if num_beam_groups > num_beams: + raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") + self._num_sub_beams = num_beams // num_beam_groups + + def __call__( + self, + input_ids: "torch.LongTensor", + scores: "torch.FloatTensor", + current_tokens: "torch.LongTensor", + beam_group_idx: int, + ) -> "torch.FloatTensor": + # hamming diversity: penalise using same token in current group which was used in previous groups at + # the same time step + batch_size = current_tokens.shape[0] // self._num_beams + group_start_idx = beam_group_idx * self._num_sub_beams + group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) + group_size = group_end_idx - group_start_idx + vocab_size = scores.shape[-1] + + if group_start_idx == 0: + return scores + + for batch_idx in range(batch_size): + # predicted tokens of last time step of previous groups + previous_group_tokens = current_tokens[ + batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx + ] + token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) + scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_penalty * token_frequency + + return scores + + +class ForcedBOSTokenLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token. + + Args: + bos_token_id (:obj:`int`): + The id of the token to force as the first generated token. + """ + + def __init__(self, bos_token_id: int): + self.bos_token_id = bos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len == 1: + num_tokens = scores.shape[1] + scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf") + scores[:, self.bos_token_id] = 0 + return scores + + +class ForcedEOSTokenLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when + :obj:`max_length` is reached. + + Args: + max_length (:obj:`int`): + The maximum length of the sequence to be generated. + eos_token_id (:obj:`int`): + The id of the token to force as the last generated token when :obj:`max_length` is reached. 
+ """ + + def __init__(self, max_length: int, eos_token_id: int): + self.max_length = max_length + self.eos_token_id = eos_token_id + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + cur_len = input_ids.shape[-1] + if cur_len == self.max_length - 1: + num_tokens = scores.shape[1] + scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf") + scores[:, self.eos_token_id] = 0 + return scores + + +class InfNanRemoveLogitsProcessor(LogitsProcessor): + r""" + :class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to avoid the generation + method to fail. Note that using the logits processor should only be used if necessary since it can slow down the + generation method. :obj:`max_length` is reached. + """ + + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": + # set all nan values to 0.0 + scores[scores != scores] = 0.0 + + # set all inf values to max possible value + scores[scores == float("inf")] = torch.finfo(scores.dtype).max + + return scores diff --git a/fastNLP/transformers/torch/generation_stopping_criteria.py b/fastNLP/transformers/torch/generation_stopping_criteria.py new file mode 100644 index 00000000..179bf7c1 --- /dev/null +++ b/fastNLP/transformers/torch/generation_stopping_criteria.py @@ -0,0 +1,128 @@ +import time +from abc import ABC +from copy import deepcopy +from typing import Optional + +from .file_utils import add_start_docstrings +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): + Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax + or scores for each vocabulary token after SoftMax. + kwargs: + Additional stopping criteria specific kwargs. + + Return: + :obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop. + +""" + + +class StoppingCriteria(ABC): + """Abstract base class for all stopping criteria that can be applied during generation.""" + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + raise NotImplementedError("StoppingCriteria needs to be subclassed") + + +class MaxLengthCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`. + Keep in mind for decoder-only type of transformers, this will include the initial prompted tokens. + + Args: + max_length (:obj:`int`): + The maximum length that the output sequence can have in number of tokens. 
+ """ + + def __init__(self, max_length: int): + self.max_length = max_length + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return input_ids.shape[-1] >= self.max_length + + +class MaxNewTokensCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`. + Keep in mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is + very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens. + + Args: + start_length (:obj:`int`): + The number of initial tokens. + max_new_tokens (:obj:`int`): + The maximum number of tokens to generate. + """ + + def __init__(self, start_length: int, max_new_tokens: int): + self.start_length = start_length + self.max_new_tokens = max_new_tokens + self.max_length = start_length + max_new_tokens + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return input_ids.shape[-1] >= self.max_length + + +class MaxTimeCriteria(StoppingCriteria): + """ + This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the + time will start being counted when you initialize this function. You can override this by passing an + :obj:`initial_time`. + + Args: + max_time (:obj:`float`): + The maximum allowed time in seconds for the generation. + initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`): + The start of the generation allowed time. + """ + + def __init__(self, max_time: float, initial_timestamp: Optional[float] = None): + self.max_time = max_time + self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp + + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return time.time() - self.initial_timestamp > self.max_time + + +class StoppingCriteriaList(list): + @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) + def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: + return any(criteria(input_ids, scores) for criteria in self) + + @property + def max_length(self) -> Optional[int]: + for stopping_criterium in self: + if isinstance(stopping_criterium, MaxLengthCriteria): + return stopping_criterium.max_length + elif isinstance(stopping_criterium, MaxNewTokensCriteria): + return stopping_criterium.max_length + return None + + +def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList: + stopping_max_length = stopping_criteria.max_length + new_stopping_criteria = deepcopy(stopping_criteria) + if stopping_max_length is not None and stopping_max_length != max_length: + logger.warn("You set different `max_length` for stopping criteria and `max_length` parameter", UserWarning) + elif stopping_max_length is None: + new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) + return new_stopping_criteria diff --git a/fastNLP/transformers/torch/generation_utils.py b/fastNLP/transformers/torch/generation_utils.py new file mode 100644 index 00000000..cfc2108c --- /dev/null +++ b/fastNLP/transformers/torch/generation_utils.py @@ -0,0 +1,2579 @@ +# coding=utf-8 +# Copyright 2020 The Google AI 
Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union + +from .file_utils import ModelOutput +from .generation_beam_search import BeamScorer, BeamSearchScorer +from .generation_logits_process import ( + EncoderNoRepeatNGramLogitsProcessor, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + HammingDiversityLogitsProcessor, + InfNanRemoveLogitsProcessor, + LogitsProcessorList, + MinLengthLogitsProcessor, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + PrefixConstrainedLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, +) +from .generation_stopping_criteria import ( + MaxLengthCriteria, + MaxNewTokensCriteria, + MaxTimeCriteria, + StoppingCriteriaList, + validate_stopping_criteria, +) + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.distributed as dist + from torch import nn, no_grad +else: + from fastNLP.core.utils.dummy_class import DummyClass as no_grad + + +@dataclass +class GreedySearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using greedy search. + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` + with each tensor of shape :obj:`(batch_size, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`. 
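+
+        A minimal sketch of reading such an output (the tensors are illustrative only; in practice an object
+        of this type is returned by ``greedy_search`` or by ``generate`` when ``return_dict_in_generate=True``)::
+
+            >>> import torch
+            >>> from fastNLP.transformers.torch.generation_utils import GreedySearchDecoderOnlyOutput
+            >>> out = GreedySearchDecoderOnlyOutput(
+            ...     sequences=torch.tensor([[0, 5, 7, 2]]),
+            ...     scores=(torch.zeros(1, 10), torch.zeros(1, 10), torch.zeros(1, 10)),
+            ... )
+            >>> out.sequences.shape
+            torch.Size([1, 4])
+            >>> out["sequences"] is out.sequences    # ModelOutput also allows dict-style access
+            True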
+ """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class GreedySearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using greedy search. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor + of shape :obj:`(batch_size, config.vocab_size)`). + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`. 
+ """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class SampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using sampling. + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` + with each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length, + sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class SampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using sampling. Hidden states and attention weights of + the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax) + at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor + of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`). 
+ encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape + :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length, + sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSearchDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam search. + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of + shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`). 
+ attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. + hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length, + hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSearchEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam search. Hidden states and attention weights + of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the encoder_hidden_states + attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape + :obj:`(batch_size*num_beams, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`. 
+ decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads, + generated_length, sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. + decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length, + hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSampleDecoderOnlyOutput(ModelOutput): + """ + Base class for outputs of decoder-only generation models using beam sample. + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of + shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`). + attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. 
+ hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +@dataclass +class BeamSampleEncoderDecoderOutput(ModelOutput): + """ + Base class for outputs of encoder-decoder generation models using beam sampling. Hidden states and attention + weights of the decoder (respectively the encoder) can be accessed via the encoder_attentions and the + encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes) + + Args: + sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_beams, sequence_length)`): + The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or + shorter if all batches finished early due to the :obj:`eos_token_id`. + sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Final beam scores of the generated ``sequences``. + scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``): + Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log + softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam + . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape + :obj:`(batch_size*num_beams, config.vocab_size)`). + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size, + num_heads, sequence_length, sequence_length)`. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`. + decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, + sequence_length)`. + cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`. 
+ decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of + :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`. + """ + + sequences: "torch.LongTensor" = None + sequences_scores: Optional["torch.FloatTensor"] = None + scores: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + cross_attentions: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + + +GreedySearchOutput = Union[GreedySearchEncoderDecoderOutput, GreedySearchDecoderOnlyOutput] +SampleOutput = Union[SampleEncoderDecoderOutput, SampleDecoderOnlyOutput] +BeamSearchOutput = Union[BeamSearchEncoderDecoderOutput, BeamSearchDecoderOnlyOutput] +BeamSampleOutput = Union[BeamSampleEncoderDecoderOutput, BeamSampleDecoderOnlyOutput] + + +class GenerationMixin: + """ + A class containing all of the functions supporting generation, to be used as a mixin in + :class:`~transformers.PreTrainedModel`. + """ + + def prepare_inputs_for_generation(self, input_ids: "torch.LongTensor", **kwargs) -> Dict[str, Any]: + """ + Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the + generate method. + """ + return {"input_ids": input_ids} + + def adjust_logits_during_generation(self, logits: "torch.FloatTensor", **kwargs) -> "torch.FloatTensor": + """ + Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in + the generate method. 
+ """ + return logits + + def _prepare_input_ids_for_generation( + self, bos_token_id: Optional[int], encoder_outputs: Optional[ModelOutput] + ) -> "torch.LongTensor": + if self.config.is_encoder_decoder and encoder_outputs is not None: + # make dummy input_ids with value -100, as a sanity check ensuring that they won't be used for encoding + shape = encoder_outputs.last_hidden_state.size()[:-1] + return torch.ones(shape, dtype=torch.long, device=self.device) * -100 + + if bos_token_id is None: + raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.") + return torch.ones((1, 1), dtype=torch.long, device=self.device) * bos_token_id + + def _prepare_attention_mask_for_generation( + self, input_ids: "torch.Tensor", pad_token_id: int, eos_token_id: int + ) -> "torch.LongTensor": + is_pad_token_in_inputs_ids = (pad_token_id is not None) and (pad_token_id in input_ids) + is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( + (eos_token_id is not None) and (pad_token_id != eos_token_id) + ) + if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: + return input_ids.ne(pad_token_id).long() + return input_ids.new_ones(input_ids.shape, dtype=torch.long) + + def _prepare_encoder_decoder_kwargs_for_generation( + self, input_ids: "torch.LongTensor", model_kwargs + ) -> Dict[str, Any]: + if "encoder_outputs" not in model_kwargs: + # retrieve encoder hidden states + encoder = self.get_encoder() + encoder_kwargs = { + argument: value + for argument, value in model_kwargs.items() + if not (argument.startswith("decoder_") or argument.startswith("cross_attn")) + } + model_kwargs["encoder_outputs"]: ModelOutput = encoder(input_ids, return_dict=True, **encoder_kwargs) + return model_kwargs + + def _prepare_decoder_input_ids_for_generation( + self, input_ids: "torch.LongTensor", decoder_start_token_id: int = None, bos_token_id: int = None + ) -> "torch.LongTensor": + decoder_start_token_id = self._get_decoder_start_token_id(decoder_start_token_id, bos_token_id) + decoder_input_ids = ( + torch.ones((input_ids.shape[0], 1), dtype=torch.long, device=input_ids.device) * decoder_start_token_id + ) + return decoder_input_ids + + def _get_pad_token_id(self, pad_token_id: int = None, eos_token_id: int = None) -> int: + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") + pad_token_id = eos_token_id + return pad_token_id + + def _get_decoder_start_token_id(self, decoder_start_token_id: int = None, bos_token_id: int = None) -> int: + decoder_start_token_id = ( + decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id + ) + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + + if decoder_start_token_id is not None: + return decoder_start_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "decoder_start_token_id") + and self.config.decoder.decoder_start_token_id is not None + ): + return self.config.decoder.decoder_start_token_id + elif bos_token_id is not None: + return bos_token_id + elif ( + hasattr(self.config, "decoder") + and hasattr(self.config.decoder, "bos_token_id") + and self.config.decoder.bos_token_id is not None + ): + return self.config.decoder.bos_token_id + raise ValueError( + "`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation." 
+ ) + + @staticmethod + def _expand_inputs_for_generation( + input_ids: "torch.LongTensor", + expand_size: int = 1, + is_encoder_decoder: bool = False, + attention_mask: "torch.LongTensor" = None, + encoder_outputs: ModelOutput = None, + **model_kwargs, + ) -> Tuple["torch.LongTensor", Dict[str, Any]]: + expanded_return_idx = ( + torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device) + ) + input_ids = input_ids.index_select(0, expanded_return_idx) + + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx) + + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) + + if is_encoder_decoder: + assert encoder_outputs is not None + encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( + 0, expanded_return_idx.to(encoder_outputs.last_hidden_state.device) + ) + model_kwargs["encoder_outputs"] = encoder_outputs + return input_ids, model_kwargs + + @staticmethod + def _update_model_kwargs_for_generation( + outputs: ModelOutput, model_kwargs: Dict[str, Any], is_encoder_decoder: bool = False + ) -> Dict[str, Any]: + # update past + if "past_key_values" in outputs: + model_kwargs["past"] = outputs.past_key_values + elif "mems" in outputs: + model_kwargs["past"] = outputs.mems + elif "past_buckets_states" in outputs: + model_kwargs["past"] = outputs.past_buckets_states + else: + model_kwargs["past"] = None + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1) + + # update attention mask + if not is_encoder_decoder: + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + + return model_kwargs + + def _reorder_cache(self, past, beam_idx): + raise NotImplementedError( + f"Make sure that a `_reorder_cache` function is correctly implemented in {self.__class__.__module__} to enable beam search for {self.__class__}" + ) + + def _get_logits_warper( + self, top_k: int = None, top_p: float = None, temperature: float = None, num_beams: int = None + ) -> LogitsProcessorList: + """ + This class returns a :obj:`~transformers.LogitsProcessorList` list object that contains all relevant + :obj:`~transformers.LogitsWarper` instances used for multinomial sampling. 
+ """ + + # init warp parameters + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + temperature = temperature if temperature is not None else self.config.temperature + # instantiate warpers list + warpers = LogitsProcessorList() + + # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files + # all samplers can be found in `generation_utils_samplers.py` + if temperature is not None and temperature != 1.0: + warpers.append(TemperatureLogitsWarper(temperature)) + if top_k is not None and top_k != 0: + warpers.append(TopKLogitsWarper(top_k=top_k, min_tokens_to_keep=(2 if num_beams > 1 else 1))) + if top_p is not None and top_p < 1.0: + warpers.append(TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))) + return warpers + + def _get_logits_processor( + self, + repetition_penalty: float, + no_repeat_ngram_size: int, + encoder_no_repeat_ngram_size: int, + encoder_input_ids: "torch.LongTensor", + bad_words_ids: List[List[int]], + min_length: int, + max_length: int, + eos_token_id: int, + forced_bos_token_id: int, + forced_eos_token_id: int, + prefix_allowed_tokens_fn: Callable[[int, "torch.Tensor"], List[int]], + num_beams: int, + num_beam_groups: int, + diversity_penalty: float, + remove_invalid_values: bool, + ) -> LogitsProcessorList: + """ + This class returns a :obj:`~transformers.LogitsProcessorList` list object that contains all relevant + :obj:`~transformers.LogitsProcessor` instances used to modify the scores of the language model head. + """ + processors = LogitsProcessorList() + + # init warp parameters + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + no_repeat_ngram_size = ( + no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size + ) + encoder_no_repeat_ngram_size = ( + encoder_no_repeat_ngram_size + if encoder_no_repeat_ngram_size is not None + else self.config.encoder_no_repeat_ngram_size + ) + bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids + min_length = min_length if min_length is not None else self.config.min_length + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + diversity_penalty = diversity_penalty if diversity_penalty is not None else self.config.diversity_penalty + forced_bos_token_id = ( + forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id + ) + forced_eos_token_id = ( + forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id + ) + remove_invalid_values = ( + remove_invalid_values if remove_invalid_values is not None else self.config.remove_invalid_values + ) + # instantiate processors list + + # the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files + # all samplers can be found in `generation_utils_samplers.py` + if diversity_penalty is not None and diversity_penalty > 0.0: + processors.append( + HammingDiversityLogitsProcessor( + diversity_penalty=diversity_penalty, num_beams=num_beams, num_beam_groups=num_beam_groups + ) + ) + if repetition_penalty is not None and repetition_penalty != 1.0: + processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty)) + if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0: + 
processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size)) + if encoder_no_repeat_ngram_size is not None and encoder_no_repeat_ngram_size > 0: + if self.config.is_encoder_decoder: + processors.append(EncoderNoRepeatNGramLogitsProcessor(encoder_no_repeat_ngram_size, encoder_input_ids)) + else: + raise ValueError( + "It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture" + ) + if bad_words_ids is not None: + processors.append(NoBadWordsLogitsProcessor(bad_words_ids, eos_token_id)) + if min_length is not None and eos_token_id is not None and min_length > -1: + processors.append(MinLengthLogitsProcessor(min_length, eos_token_id)) + if prefix_allowed_tokens_fn is not None: + processors.append(PrefixConstrainedLogitsProcessor(prefix_allowed_tokens_fn, num_beams // num_beam_groups)) + if forced_bos_token_id is not None: + processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id)) + if forced_eos_token_id is not None: + processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id)) + if remove_invalid_values is True: + processors.append(InfNanRemoveLogitsProcessor()) + return processors + + def _get_stopping_criteria( + self, max_length: Optional[int], max_time: Optional[float], max_new_tokens: Optional[int], start_length: int + ) -> StoppingCriteriaList: + stopping_criteria = StoppingCriteriaList() + if max_length is not None: + stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) + if max_time is not None: + stopping_criteria.append(MaxTimeCriteria(max_time=max_time)) + if max_new_tokens is not None: + stopping_criteria.append(MaxNewTokensCriteria(start_length=start_length, max_new_tokens=max_new_tokens)) + return stopping_criteria + + @no_grad() + def generate( + self, + input_ids: Optional["torch.LongTensor"] = None, + max_length: Optional[int] = None, + min_length: Optional[int] = None, + do_sample: Optional[bool] = None, + early_stopping: Optional[bool] = None, + num_beams: Optional[int] = None, + temperature: Optional[float] = None, + top_k: Optional[int] = None, + top_p: Optional[float] = None, + repetition_penalty: Optional[float] = None, + bad_words_ids: Optional[Iterable[int]] = None, + bos_token_id: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + length_penalty: Optional[float] = None, + no_repeat_ngram_size: Optional[int] = None, + encoder_no_repeat_ngram_size: Optional[int] = None, + num_return_sequences: Optional[int] = None, + max_time: Optional[float] = None, + max_new_tokens: Optional[int] = None, + decoder_start_token_id: Optional[int] = None, + use_cache: Optional[bool] = None, + num_beam_groups: Optional[int] = None, + diversity_penalty: Optional[float] = None, + prefix_allowed_tokens_fn: Optional[Callable[[int, "torch.Tensor"], List[int]]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + forced_bos_token_id: Optional[int] = None, + forced_eos_token_id: Optional[int] = None, + remove_invalid_values: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, SampleOutput, BeamSearchOutput, BeamSampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head. The method currently supports greedy decoding, + multinomial sampling, beam-search decoding, and beam-search multinomial sampling. 
+ + Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the + attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values + indicated are the default values of those config. + + Most of these parameters are explained in more detail in `this blog post + `__. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + The sequence used as a prompt for the generation. If :obj:`None` the method initializes it with + :obj:`bos_token_id` and a batch size of 1. + max_length (:obj:`int`, `optional`, defaults to :obj:`model.config.max_length`): + The maximum length of the sequence to be generated. + max_new_tokens (:obj:`int`, `optional`, defaults to None): + The maximum numbers of tokens to generate, ignore the current number of tokens. Use either + :obj:`max_new_tokens` or :obj:`max_length` but not both, they serve the same purpose. + min_length (:obj:`int`, `optional`, defaults to 10): + The minimum length of the sequence to be generated. + do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use sampling ; use greedy decoding otherwise. + early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. + num_beams (:obj:`int`, `optional`, defaults to 1): + Number of beams for beam search. 1 means no beam search. + temperature (:obj:`float`, `optional`, defaults to 1.0): + The value used to module the next token probabilities. + top_k (:obj:`int`, `optional`, defaults to 50): + The number of highest probability vocabulary tokens to keep for top-k-filtering. + top_p (:obj:`float`, `optional`, defaults to 1.0): + If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or + higher are kept for generation. + repetition_penalty (:obj:`float`, `optional`, defaults to 1.0): + The parameter for repetition penalty. 1.0 means no penalty. See `this paper + `__ for more details. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + bos_token_id (:obj:`int`, `optional`): + The id of the `beginning-of-sequence` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + length_penalty (:obj:`float`, `optional`, defaults to 1.0): + Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the + model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer + sequences. + no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): + If set to int > 0, all ngrams of that size can only occur once. + encoder_no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): + If set to int > 0, all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the + ``decoder_input_ids``. + bad_words_ids(:obj:`List[List[int]]`, `optional`): + List of token ids that are not allowed to be generated. In order to get the tokens of the words that + should not appear in the generated text, use :obj:`tokenizer(bad_word, + add_prefix_space=True).input_ids`. + num_return_sequences(:obj:`int`, `optional`, defaults to 1): + The number of independently computed returned sequences for each element in the batch. + max_time(:obj:`float`, `optional`, defaults to None): + The maximum amount of time you allow the computation to run for in seconds. 
generation will still + finish the current pass after allocated time has been passed. + attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for + tokens that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same + shape as :obj:`input_ids` that masks the pad token. `What are attention masks? + <../glossary.html#attention-mask>`__ + decoder_start_token_id (:obj:`int`, `optional`): + If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. + use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should use the past last key/values attentions (if applicable to the model) to + speed up decoding. + num_beam_groups (:obj:`int`, `optional`, defaults to 1): + Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of + beams. `this paper `__ for more details. + diversity_penalty (:obj:`float`, `optional`, defaults to 0.0): + This value is subtracted from a beam's score if it generates a token same as any beam from other group + at a particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is + enabled. + prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`, `optional`): + If provided, this function constraints the beam search to allowed tokens only at each step. If not + provided no constraint is applied. This function takes 2 arguments: the batch ID :obj:`batch_id` and + :obj:`input_ids`. It has to return a list with the allowed tokens for the next generation step + conditioned on the batch ID :obj:`batch_id` and the previously generated tokens :obj:`inputs_ids`. This + argument is useful for constrained generation conditioned on the prefix, as described in + `Autoregressive Entity Retrieval `__. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + forced_bos_token_id (:obj:`int`, `optional`): + The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`. + Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token + needs to be the target language token. + forced_eos_token_id (:obj:`int`, `optional`): + The id of the token to force as the last generated token when :obj:`max_length` is reached. + remove_invalid_values (:obj:`bool`, `optional`): + Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to + crash. Note that using ``remove_invalid_values`` can slow down generation. 
+ synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the + model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific + kwargs should be prefixed with `decoder_`. + + Return: + :class:`~transformers.file_utils.ModelOutput` or :obj:`torch.LongTensor`: A + :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when + ``config.return_dict_in_generate=True``) or a :obj:`torch.FloatTensor`. + + If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the + possible :class:`~transformers.file_utils.ModelOutput` types are: + + - :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`, + - :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` + + If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible + :class:`~transformers.file_utils.ModelOutput` types are: + + - :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.SampleEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput`, + - :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` + + Examples:: + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM + + >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") + >>> # do greedy decoding without providing a prompt + >>> outputs = model.generate(max_length=40) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + >>> document = ( + ... "at least two people were killed in a suspected bomb attack on a passenger bus " + ... "in the strife-torn southern philippines on monday , the military said." + ... ) + >>> # encode input context + >>> input_ids = tokenizer(document, return_tensors="pt").input_ids + >>> # generate 3 independent sequences using beam search decoding (5 beams) + >>> # with T5 encoder-decoder model conditioned on short news article. 
+ >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3) + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2") + >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2") + >>> input_context = "The dog" + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> # generate 3 candidates using sampling + >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True) + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("ctrl") + >>> model = AutoModelForCausalLM.from_pretrained("ctrl") + >>> # "Legal" is one of the control codes for ctrl + >>> input_context = "Legal My neighbor is" + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + >>> input_context = "My cute dog" + >>> # get tokens of words that should not be generated + >>> bad_words_ids = [tokenizer(bad_word, add_prefix_space=True).input_ids for bad_word in ["idiot", "stupid", "shut up"]] + >>> # encode input context + >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids + >>> # generate sequences without allowing bad_words to be generated + >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids) + >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True)) + """ + + # set init values + if max_length is None and max_new_tokens is None: + # Both are None, default + max_length = self.config.max_length + elif max_length is not None and max_new_tokens is not None: + # Both are set, this is odd, raise a warning + logger.warn( + "Both `max_length` and `max_new_tokens` have been set but they serve the same purpose.", UserWarning + ) + + max_length = max_length if max_length is not None else self.config.max_length + num_beams = num_beams if num_beams is not None else self.config.num_beams + num_beam_groups = num_beam_groups if num_beam_groups is not None else self.config.num_beam_groups + do_sample = do_sample if do_sample is not None else self.config.do_sample + num_return_sequences = ( + num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences + ) + + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + model_kwargs["output_attentions"] = output_attentions + model_kwargs["output_hidden_states"] = 
output_hidden_states + + if input_ids is None and "inputs_embeds" not in model_kwargs: + # init `input_ids` with bos_token_id + input_ids = self._prepare_input_ids_for_generation(bos_token_id, model_kwargs.get("encoder_outputs")) + + if model_kwargs.get("attention_mask", None) is None: + # init `attention_mask` depending on `pad_token_id` + model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( + input_ids, pad_token_id, eos_token_id + ) + + # special case if pad_token_id is not defined + if pad_token_id is None and eos_token_id is not None: + logger.warning(f"Setting `pad_token_id` to `eos_token_id`:{eos_token_id} for open-end generation.") + pad_token_id = eos_token_id + + # Storing encoder_input_ids for logits_processor that could use them + encoder_input_ids = input_ids if self.config.is_encoder_decoder else None + + if self.config.is_encoder_decoder: + # add encoder_outputs to model_kwargs + model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(input_ids, model_kwargs) + + # set input_ids as decoder_input_ids + if "decoder_input_ids" in model_kwargs: + input_ids = model_kwargs.pop("decoder_input_ids") + else: + input_ids = self._prepare_decoder_input_ids_for_generation( + input_ids, decoder_start_token_id=decoder_start_token_id, bos_token_id=bos_token_id + ) + + if "encoder_outputs" not in model_kwargs or not isinstance(model_kwargs["encoder_outputs"], ModelOutput): + raise ValueError("Make sure that `model_kwargs` include `encoder_outputs` of type `ModelOutput`.") + + if input_ids.shape[-1] >= max_length: + input_ids_string = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids" + logger.warning( + f"Input length of {input_ids_string} is {input_ids.shape[-1]}, but ``max_length`` is set to {max_length}." + "This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``." + ) + + # determine generation mode + is_greedy_gen_mode = (num_beams == 1) and (num_beam_groups == 1) and do_sample is False + is_sample_gen_mode = (num_beams == 1) and (num_beam_groups == 1) and do_sample is True + is_beam_gen_mode = (num_beams > 1) and (num_beam_groups == 1) and do_sample is False + is_beam_sample_gen_mode = (num_beams > 1) and (num_beam_groups == 1) and do_sample is True + is_group_beam_gen_mode = (num_beams > 1) and (num_beam_groups > 1) + if num_beam_groups > num_beams: + raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`") + if is_group_beam_gen_mode and do_sample is True: + raise ValueError( + "Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is set to `False`." 
+ ) + + # set model_kwargs + model_kwargs["use_cache"] = use_cache + + # get distribution pre_processing samplers + logits_processor = self._get_logits_processor( + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, + encoder_input_ids=encoder_input_ids, + bad_words_ids=bad_words_ids, + min_length=min_length, + max_length=max_length, + eos_token_id=eos_token_id, + forced_bos_token_id=forced_bos_token_id, + forced_eos_token_id=forced_eos_token_id, + prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, + num_beams=num_beams, + num_beam_groups=num_beam_groups, + diversity_penalty=diversity_penalty, + remove_invalid_values=remove_invalid_values, + ) + + cur_len = input_ids.shape[-1] + stopping_criteria = self._get_stopping_criteria( + max_length=max_length, max_time=max_time, max_new_tokens=max_new_tokens, start_length=cur_len + ) + + if is_greedy_gen_mode: + if num_return_sequences > 1: + raise ValueError( + f"num_return_sequences has to be 1, but is {num_return_sequences} when doing greedy search." + ) + + # greedy search + return self.greedy_search( + input_ids, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_sample_gen_mode: + # get probability distribution warper + logits_warper = self._get_logits_warper( + top_k=top_k, top_p=top_p, temperature=temperature, num_beams=num_beams + ) + + # expand input_ids with `num_return_sequences` additional sequences per batch + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + # sample + return self.sample( + input_ids, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_beam_gen_mode: + batch_size = input_ids.shape[0] + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + if num_return_sequences > num_beams: + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") + + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + ) + # interleave with `num_beams` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs + ) + return self.beam_search( + input_ids, + beam_scorer, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_beam_sample_gen_mode: + logits_warper = 
self._get_logits_warper( + top_k=top_k, top_p=top_p, temperature=temperature, num_beams=num_beams + ) + + batch_size = input_ids.shape[0] * num_return_sequences + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + ) + + # interleave with `num_beams * num_return_sequences` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, + expand_size=num_beams * num_return_sequences, + is_encoder_decoder=self.config.is_encoder_decoder, + **model_kwargs, + ) + + return self.beam_sample( + input_ids, + beam_scorer, + logits_processor=logits_processor, + logits_warper=logits_warper, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + elif is_group_beam_gen_mode: + batch_size = input_ids.shape[0] + + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping + + if num_return_sequences > num_beams: + raise ValueError("`num_return_sequences` has to be smaller or equal to `num_beams`.") + + if num_beams % num_beam_groups != 0: + raise ValueError("`num_beams` should be divisible by `num_beam_groups` for group beam search.") + + if stopping_criteria.max_length is None: + raise ValueError("`max_length` needs to be a stopping_criteria for now.") + + diverse_beam_scorer = BeamSearchScorer( + batch_size=batch_size, + num_beams=num_beams, + max_length=stopping_criteria.max_length, + device=self.device, + length_penalty=length_penalty, + do_early_stopping=early_stopping, + num_beam_hyps_to_keep=num_return_sequences, + num_beam_groups=num_beam_groups, + ) + # interleave with `num_beams` + input_ids, model_kwargs = self._expand_inputs_for_generation( + input_ids, expand_size=num_beams, is_encoder_decoder=self.config.is_encoder_decoder, **model_kwargs + ) + return self.group_beam_search( + input_ids, + diverse_beam_scorer, + logits_processor=logits_processor, + stopping_criteria=stopping_criteria, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + synced_gpus=synced_gpus, + **model_kwargs, + ) + + def greedy_search( + self, + input_ids: "torch.LongTensor", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[GreedySearchOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using greedy decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. 
+ logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the + model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... ) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_logits,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # pre-process distribution + next_tokens_scores = logits_processor(input_ids, next_token_logits) + + # argmax + next_tokens = torch.argmax(next_tokens_scores, dim=-1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." + next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return GreedySearchEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return GreedySearchDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + + def sample( + self, + input_ids: "torch.LongTensor", + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[SampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using 
multinomial sampling. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + logits_warper (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language + modeling head applied before multinomial sampling at each generation step. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`, + :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.SampleDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForCausalLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... 
) + + >>> tokenizer = AutoTokenizer.from_pretrained("gpt2") + >>> model = AutoModelForCausalLM.from_pretrained("gpt2") + + >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token + >>> model.config.pad_token_id = model.config.eos_token_id + + >>> input_prompt = "Today is a beautiful day, and" + >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id), + ... ]) + >>> # instantiate logits processors + >>> logits_warper = LogitsProcessorList([ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ]) + + >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList() + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + # keep track of which sequences are already finished + unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1) + cur_len = input_ids.shape[-1] + + this_peer_finished = False # used by synced_gpus only + # auto-regressive generation + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. 
+ # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # prepare model inputs + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + # forward pass to get next token + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # pre-process distribution + next_token_scores = logits_processor(input_ids, next_token_logits) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # sample + probs = nn.functional.softmax(next_token_scores, dim=-1) + next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1) + + # finished sentences should have their next token be a padding token + if eos_token_id is not None: + assert pad_token_id is not None, "If eos_token_id is defined, make sure that pad_token_id is defined." 
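+                # `unfinished_sequences` holds 1 for rows that are still generating and 0 for rows that
+                # already produced `eos_token_id`; the masking below therefore keeps the sampled token for
+                # active rows and forces finished rows to keep emitting `pad_token_id`.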
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences) + + # update generated ids, model inputs, and length for next step + input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1) + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + cur_len = cur_len + 1 + + # if eos_token was found in one sentence, set sentence to finished + if eos_token_id is not None: + unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long()) + + # stop when each sentence is finished, or if we exceed the maximum length + if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + if return_dict_in_generate: + if self.config.is_encoder_decoder: + return SampleEncoderDecoderOutput( + sequences=input_ids, + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return SampleDecoderOnlyOutput( + sequences=input_ids, + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return input_ids + + def beam_search( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[BeamSearchOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using beam search decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. 
+ output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utilsBeamSearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + if len(stopping_criteria) == 0: + logger.warn("You don't have defined any stopping_criteria, this will likely loop forever", UserWarning) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + assert ( + num_beams * batch_size == batch_beam_size + ), f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. + next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + next_token_scores = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True + ) + + next_indices = (next_tokens / vocab_size).long() + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + 
else: + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + def beam_sample( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + logits_warper: Optional[LogitsProcessorList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ) -> Union[BeamSampleOutput, "torch.LongTensor"]: + r""" + Generates sequences for models with a language modeling head using beam search with multinomial sampling. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + A derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + logits_warper (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language + modeling head applied before multinomial sampling at each generation step. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
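The index arithmetic inside the beam-search loop above (fold beams into the vocab axis, take ``2 * num_beams`` candidates, then recover the beam index and the token id) is easy to misread; a standalone sketch with toy sizes, assuming only ``torch``::

    import torch

    batch_size, num_beams, vocab_size = 2, 3, 5
    scores = torch.randn(batch_size * num_beams, vocab_size)    # log-probs plus accumulated beam scores

    flat = scores.view(batch_size, num_beams * vocab_size)      # fold beams into the vocab axis

    # keep twice as many candidates as beams so hypotheses that end in eos
    # can be set aside without starving the next step
    top_scores, top_ids = torch.topk(flat, 2 * num_beams, dim=1, largest=True, sorted=True)

    next_indices = torch.div(top_ids, vocab_size, rounding_mode="floor")   # which beam it came from
    next_tokens = top_ids % vocab_size                                     # which token to append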
+ synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + model_kwargs: + Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. + + Return: + :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... TopKLogitsWarper, + ... TemperatureLogitsWarper, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + >>> # lets run beam search using 3 beams + >>> num_beams = 3 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id) + ... ]) + >>> # instantiate logits processors + >>> logits_warper = LogitsProcessorList([ + ... TopKLogitsWarper(50), + ... TemperatureLogitsWarper(0.7), + ... ]) + + >>> outputs = model.beam_sample( + ... input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs + ... 
) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + + batch_beam_size, cur_len = input_ids.shape + + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + next_token_logits = outputs.logits[:, -1, :] + + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. 
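The ``synced_gpus`` handshake used by all of these decoding loops follows one pattern: every rank keeps calling ``forward`` until all ranks report they are done, which ZeRO stage 3 requires. A schematic sketch of just that handshake; it assumes an already initialised process group, and ``step`` is a placeholder callable that runs one decoding step and returns True once this rank has finished its own sequences::

    import torch
    import torch.distributed as dist

    def decode_with_synced_gpus(step, max_steps, device):
        this_peer_finished = False
        for _ in range(max_steps):
            # send 1.0 while this rank still has work, 0.0 once it is done
            flag = torch.tensor(0.0 if this_peer_finished else 1.0, device=device)
            dist.all_reduce(flag, op=dist.ReduceOp.SUM)
            if flag.item() == 0.0:
                break                                  # every rank reported "finished"
            finished_here = step()                     # still run forward so ZeRO stage 3 stays in sync
            this_peer_finished = this_peer_finished or finished_here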
+ next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) + + next_token_scores = logits_processor(input_ids, next_token_scores) + next_token_scores = next_token_scores + beam_scores[:, None].expand_as(next_token_scores) + next_token_scores = logits_warper(input_ids, next_token_scores) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if output_scores: + scores += (next_token_scores,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + # reshape for beam search + vocab_size = next_token_scores.shape[-1] + next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size) + + probs = nn.functional.softmax(next_token_scores, dim=-1) + + next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) + next_token_scores = torch.gather(next_token_scores, -1, next_tokens) + + next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1) + next_tokens = torch.gather(next_tokens, -1, _indices) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], beam_idx) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSampleEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return BeamSampleDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + def group_beam_search( + self, + input_ids: "torch.LongTensor", + beam_scorer: BeamScorer, + 
logits_processor: Optional[LogitsProcessorList] = None, + stopping_criteria: Optional[StoppingCriteriaList] = None, + max_length: Optional[int] = None, + pad_token_id: Optional[int] = None, + eos_token_id: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + output_scores: Optional[bool] = None, + return_dict_in_generate: Optional[bool] = None, + synced_gpus: Optional[bool] = None, + **model_kwargs, + ): + r""" + Generates sequences for models with a language modeling head using beam search decoding. + + Parameters: + + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + beam_scorer (:obj:`BeamScorer`): + An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are + constructed, stored and sorted during generation. For more information, the documentation of + :class:`~transformers.BeamScorer` should be read. + logits_processor (:obj:`LogitsProcessorList`, `optional`): + An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from + :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling + head applied at each generation step. + stopping_criteria (:obj:`StoppingCriteriaList`, `optional`): + An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from + :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop. + max_length (:obj:`int`, `optional`, defaults to 20): + **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of + generated tokens. The maximum length of the sequence to be generated. + pad_token_id (:obj:`int`, `optional`): + The id of the `padding` token. + eos_token_id (:obj:`int`, `optional`): + The id of the `end-of-sequence` token. + output_attentions (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more details. + output_hidden_states (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more details. + output_scores (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details. + return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to continue running the while loop until max_length (needed for ZeRO stage 3) + + model_kwargs: + Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If + model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`. 
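``beam_sample`` differs from ``beam_search`` only in how the ``2 * num_beams`` candidates per batch entry are chosen: they are drawn from the warped distribution rather than taken by top-k, then re-sorted by score so the beam scorer sees them in the usual order. A toy sketch of that selection step, values random, shapes only::

    import torch
    import torch.nn.functional as F

    batch_size, num_beams, vocab_size = 2, 3, 5
    next_token_scores = torch.randn(batch_size, num_beams * vocab_size)   # already flattened log-scores

    probs = F.softmax(next_token_scores, dim=-1)
    next_tokens = torch.multinomial(probs, num_samples=2 * num_beams)     # sample candidate ids
    next_scores = torch.gather(next_token_scores, -1, next_tokens)        # fetch their scores

    next_scores, order = torch.sort(next_scores, descending=True, dim=1)  # keep them score-ordered
    next_tokens = torch.gather(next_tokens, -1, order)

    next_indices = next_tokens // vocab_size    # originating beam
    next_tokens = next_tokens % vocab_size      # token id within the vocabulary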
+ + Return: + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`, + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A + :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if + ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a + :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if + ``model.config.is_encoder_decoder=True``. + + Examples:: + + >>> from transformers import ( + ... AutoTokenizer, + ... AutoModelForSeq2SeqLM, + ... LogitsProcessorList, + ... MinLengthLogitsProcessor, + ... HammingDiversityLogitsProcessor, + ... BeamSearchScorer, + ... ) + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("t5-base") + >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base") + + >>> encoder_input_str = "translate English to German: How old are you?" + >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids + + + >>> # lets run diverse beam search using 6 beams + >>> num_beams = 6 + >>> # define decoder start token ids + >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long) + >>> input_ids = input_ids * model.config.decoder_start_token_id + + >>> # add encoder_outputs to model keyword arguments + >>> model_kwargs = { + ... "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True) + ... } + + >>> # instantiate beam scorer + >>> beam_scorer = BeamSearchScorer( + ... batch_size=1, + ... max_length=model.config.max_length, + ... num_beams=num_beams, + ... device=model.device, + ... num_beam_groups=3 + ... ) + + >>> # instantiate logits processors + >>> logits_processor = LogitsProcessorList([ + ... HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3), + ... MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id), + ... 
]) + + >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs) + + >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True)) + """ + # init values + logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() + stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() + if max_length is not None: + logger.warn( + "`max_length` is deprecated in this function, use `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.", + UserWarning, + ) + stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length) + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id + output_scores = output_scores if output_scores is not None else self.config.output_scores + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict_in_generate = ( + return_dict_in_generate if return_dict_in_generate is not None else self.config.return_dict_in_generate + ) + + # init attention / hidden states / scores tuples + scores = () if (return_dict_in_generate and output_scores) else None + decoder_attentions = () if (return_dict_in_generate and output_attentions) else None + cross_attentions = () if (return_dict_in_generate and output_attentions) else None + decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None + + # if model is an encoder-decoder, retrieve encoder attention weights and hidden states + if return_dict_in_generate and self.config.is_encoder_decoder: + encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None + encoder_hidden_states = ( + model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None + ) + + batch_size = len(beam_scorer._beam_hyps) + num_beams = beam_scorer.num_beams + num_beam_groups = beam_scorer.num_beam_groups + num_sub_beams = num_beams // num_beam_groups + device = input_ids.device + + batch_beam_size, cur_len = input_ids.shape + + assert ( + num_beams * batch_size == batch_beam_size + ), f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}." + + beam_scores = torch.full((batch_size, num_beams), -1e9, dtype=torch.float, device=device) + # initialise score of first beam of each group with 0 and the rest with 1e-9. This ensures that the beams in + # the same group don't produce same tokens everytime. + beam_scores[:, ::num_sub_beams] = 0 + beam_scores = beam_scores.view((batch_size * num_beams,)) + + this_peer_finished = False # used by synced_gpus only + while True: + + if synced_gpus: + # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. + # The following logic allows an early break if all peers finished generating their sequence + this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device) + # send 0.0 if we finished, 1.0 otherwise + dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM) + # did all peers finish? 
the reduced sum will be 0.0 then + if this_peer_finished_flag.item() == 0.0: + break + + # predicted tokens in cur_len step + current_tokens = torch.zeros(batch_size * num_beams, dtype=input_ids.dtype, device=device) + + # indices which will form the beams in the next time step + reordering_indices = torch.zeros(batch_size * num_beams, dtype=torch.long, device=device) + + # do one decoder step on all beams of all sentences in batch + model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) + outputs = self( + **model_inputs, + return_dict=True, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + if synced_gpus and this_peer_finished: + cur_len = cur_len + 1 + continue # don't waste resources running the code we don't need + + if output_scores: + processed_score = torch.zeros_like(outputs.logits[:, -1, :]) + + for beam_group_idx in range(num_beam_groups): + group_start_idx = beam_group_idx * num_sub_beams + group_end_idx = min(group_start_idx + num_sub_beams, num_beams) + group_size = group_end_idx - group_start_idx + + # indices of beams of current group among all sentences in batch + batch_group_indices = [] + + for batch_idx in range(batch_size): + batch_group_indices.extend( + [batch_idx * num_beams + idx for idx in range(group_start_idx, group_end_idx)] + ) + group_input_ids = input_ids[batch_group_indices] + + # select outputs of beams of current group only + next_token_logits = outputs.logits[batch_group_indices, -1, :] + + # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` + # cannot be generated both before and after the `nn.functional.log_softmax` operation. + next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * group_size, vocab_size) + vocab_size = next_token_scores.shape[-1] + + next_token_scores = logits_processor( + group_input_ids, next_token_scores, current_tokens=current_tokens, beam_group_idx=beam_group_idx + ) + next_token_scores = next_token_scores + beam_scores[batch_group_indices].unsqueeze(-1).expand_as( + next_token_scores + ) + + if output_scores: + processed_score[batch_group_indices] = next_token_scores + + # reshape for beam search + next_token_scores = next_token_scores.view(batch_size, group_size * vocab_size) + + next_token_scores, next_tokens = torch.topk( + next_token_scores, 2 * group_size, dim=1, largest=True, sorted=True + ) + + next_indices = next_tokens // vocab_size + next_tokens = next_tokens % vocab_size + + # stateless + beam_outputs = beam_scorer.process( + group_input_ids, + next_token_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + ) + beam_scores[batch_group_indices] = beam_outputs["next_beam_scores"] + beam_next_tokens = beam_outputs["next_beam_tokens"] + beam_idx = beam_outputs["next_beam_indices"] + + input_ids[batch_group_indices] = group_input_ids[beam_idx] + group_input_ids = torch.cat([group_input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1) + current_tokens[batch_group_indices] = group_input_ids[:, -1] + + # (beam_idx // group_size) -> batch_idx + # (beam_idx % group_size) -> offset of idx inside the group + reordering_indices[batch_group_indices] = ( + num_beams * (beam_idx // group_size) + group_start_idx + (beam_idx % group_size) + ) + + # Store scores, attentions and hidden_states when required + if return_dict_in_generate: + if 
output_scores: + scores += (processed_score,) + if output_attentions: + decoder_attentions += ( + (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,) + ) + if self.config.is_encoder_decoder: + cross_attentions += (outputs.cross_attentions,) + + if output_hidden_states: + decoder_hidden_states += ( + (outputs.decoder_hidden_states,) + if self.config.is_encoder_decoder + else (outputs.hidden_states,) + ) + + input_ids = torch.cat([input_ids, current_tokens.unsqueeze(-1)], dim=-1) + + model_kwargs = self._update_model_kwargs_for_generation( + outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder + ) + if model_kwargs["past"] is not None: + model_kwargs["past"] = self._reorder_cache(model_kwargs["past"], reordering_indices) + + # increase cur_len + cur_len = cur_len + 1 + + if beam_scorer.is_done or stopping_criteria(input_ids, scores): + if not synced_gpus: + break + else: + this_peer_finished = True + + sequence_outputs = beam_scorer.finalize( + input_ids, + beam_scores, + next_tokens, + next_indices, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + max_length=stopping_criteria.max_length, + ) + + if return_dict_in_generate: + if not output_scores: + sequence_outputs["sequence_scores"] = None + if self.config.is_encoder_decoder: + return BeamSearchEncoderDecoderOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + encoder_attentions=encoder_attentions, + encoder_hidden_states=encoder_hidden_states, + decoder_attentions=decoder_attentions, + cross_attentions=cross_attentions, + decoder_hidden_states=decoder_hidden_states, + ) + else: + return BeamSearchDecoderOnlyOutput( + sequences=sequence_outputs["sequences"], + sequences_scores=sequence_outputs["sequence_scores"], + scores=scores, + attentions=decoder_attentions, + hidden_states=decoder_hidden_states, + ) + else: + return sequence_outputs["sequences"] + + +def top_k_top_p_filtering( + logits: "torch.FloatTensor", + top_k: int = 0, + top_p: float = 1.0, + filter_value: float = -float("Inf"), + min_tokens_to_keep: int = 1, +) -> "torch.FloatTensor": + """ + Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + + Args: + logits: logits distribution shape (batch size, vocabulary size) + top_k (:obj:`int`, `optional`, defaults to 0): + If > 0, only keep the top k tokens with highest probability (top-k filtering) + top_p (:obj:`float`, `optional`, defaults to 1.0): + If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus + filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): + Minimumber of tokens we keep per batch example in the output. + + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + if top_k > 0: + logits = TopKLogitsWarper(top_k=top_k, filter_value=filter_value, min_tokens_to_keep=min_tokens_to_keep)( + None, logits + ) + + if 0 <= top_p <= 1.0: + logits = TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=min_tokens_to_keep)(None, logits) + + return logits diff --git a/fastNLP/transformers/torch/modeling_outputs.py b/fastNLP/transformers/torch/modeling_outputs.py new file mode 100644 index 00000000..ae972a94 --- /dev/null +++ b/fastNLP/transformers/torch/modeling_outputs.py @@ -0,0 +1,816 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
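For reference, ``top_k_top_p_filtering`` defined at the end of ``generation_utils.py`` above simply chains the two warpers. A usage sketch; the import path assumes the new ``fastNLP.transformers.torch`` layout this patch introduces::

    import torch
    from fastNLP.transformers.torch.generation_utils import top_k_top_p_filtering

    logits = torch.randn(1, 50257)                         # (batch, vocab) scores from an LM head
    filtered = top_k_top_p_filtering(logits, top_k=50, top_p=0.9)

    probs = torch.softmax(filtered, dim=-1)                # filtered entries were set to -inf
    next_token = torch.multinomial(probs, num_samples=1)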
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import Optional, Tuple + +from .file_utils import ModelOutput +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + + +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
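All of the dataclasses in this new file inherit ``ModelOutput`` from the copied ``file_utils.py``, which behaves as both a dataclass and a tuple: fields left as ``None`` are skipped when indexing. A small sketch, assuming the copy preserves the upstream 4.11.3 semantics::

    import torch
    from fastNLP.transformers.torch.modeling_outputs import BaseModelOutput

    out = BaseModelOutput(last_hidden_state=torch.zeros(2, 4, 8))   # hidden_states / attentions stay None

    out.last_hidden_state.shape    # attribute access -> torch.Size([2, 4, 8])
    out["last_hidden_state"]       # dict-style access
    out[0]                         # tuple-style indexing, None fields are skipped
    out.to_tuple()                 # -> (tensor,) with the None fields dropped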
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + pooler_output: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + """ + + last_hidden_state: "torch.FloatTensor" = None + pooler_output: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, + 1, hidden_size)` is output. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. 
+ cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. 
+ """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss (for next-token prediction). + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
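The ``past_key_values`` field documented in these causal-LM outputs is what makes incremental decoding cheap: each step feeds only the newest token plus the cached keys and values back into the model. A sketch with the GPT-2 port bundled in this patch; the checkpoint name and exact import path are illustrative, not confirmed by the diff::

    import torch
    from fastNLP.transformers.torch.models.gpt2 import GPT2LMHeadModel, GPT2Tokenizer

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer("The capital of France is", return_tensors="pt").input_ids
    out = model(input_ids, use_cache=True)               # logits + past_key_values
    past = out.past_key_values                           # one (key, value) pair per layer

    next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
    out = model(next_token, past_key_values=past)        # only the new token is re-encoded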
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`torch.FloatTensor` tuples of length :obj:`config.n_layers`, with each tuple containing the + cached key, value states of the self-attention and the cross-attention layers if model is used in + encoder-decoder setting. Only relevant if ``config.is_decoder = True``. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. 
+ + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Masked language modeling (MLM) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Language modeling loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. 
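[Editor's note] These output classes all inherit `ModelOutput`, so the documented fields can be read by attribute, by key, or by position, with `None` fields skipped in the tuple view. A minimal sketch, assuming `MaskedLMOutput` is importable from the migrated `fastNLP.transformers.torch.modeling_outputs` module:

    import torch
    from fastNLP.transformers.torch.modeling_outputs import MaskedLMOutput  # assumed module path

    out = MaskedLMOutput(logits=torch.randn(2, 5, 100))   # loss/hidden_states/attentions stay None
    print(out.logits.shape)       # attribute access
    print(out["logits"].shape)    # dict-style access
    print(out[0].shape)           # positional access; None fields are dropped, so index 0 is logits
    print(out.loss)               # None -- no labels were provided
    print(len(out.to_tuple()))    # 1 -- only the non-None fields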
+ decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class NextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): + Next sequence prediction (classification) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. 
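[Editor's note] As stated above, the `loss` field only appears when `labels` are passed. A minimal sketch with a toy BERT classifier, assuming `BertConfig` and `BertForSequenceClassification` are re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    import torch
    from fastNLP.transformers.torch.models.bert import BertConfig, BertForSequenceClassification  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64, num_labels=3)
    model = BertForSequenceClassification(config).eval()
    input_ids = torch.randint(0, 100, (2, 8))

    out = model(input_ids=input_ids, return_dict=True)                # SequenceClassifierOutput
    print(out.logits.shape, out.loss)                                 # torch.Size([2, 3]) None

    out = model(input_ids=input_ids, labels=torch.tensor([0, 2]), return_dict=True)
    print(out.loss)                                                   # cross-entropy scalar, present because labels were passed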
+ decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). 
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + start_logits: "torch.FloatTensor" = None + end_logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder of the model. 
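[Editor's note] `start_logits` and `end_logits` are per-token span scores; the predicted span is simply their argmax. A minimal sketch, assuming `BertForQuestionAnswering` is re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    import torch
    from fastNLP.transformers.torch.models.bert import BertConfig, BertForQuestionAnswering  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64)
    model = BertForQuestionAnswering(config).eval()
    input_ids = torch.randint(0, 100, (1, 12))

    out = model(input_ids=input_ids, return_dict=True)    # QuestionAnsweringModelOutput
    print(out.start_logits.shape, out.end_logits.shape)   # torch.Size([1, 12]) torch.Size([1, 12])
    span = (out.start_logits.argmax(-1).item(), out.end_logits.argmax(-1).item())
    print(span)   # predicted (start, end) token positions -- meaningless here, the model is untrained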
+ encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + start_logits: "torch.FloatTensor" = None + end_logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None + cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None + encoder_last_hidden_state: Optional["torch.FloatTensor"] = None + encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None diff --git a/fastNLP/transformers/torch/modeling_utils.py b/fastNLP/transformers/torch/modeling_utils.py new file mode 100644 index 00000000..d1d5c2f3 --- /dev/null +++ b/fastNLP/transformers/torch/modeling_utils.py @@ -0,0 +1,1888 @@ +import inspect +import os +import re +from contextlib import contextmanager +from functools import partial +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union + +from .activations import get_activation +from .configuration_utils import PretrainedConfig +from .deepspeed import deepspeed_config, is_deepspeed_zero3_enabled +from .utils.versions import require_version_core +from .file_utils import ( + DUMMY_INPUTS, + WEIGHTS_NAME, + cached_path, + hf_bucket_url, + is_offline_mode, + is_remote_url, +) +from .generation_utils import GenerationMixin +from fastNLP.core.log import logger +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + from torch import Tensor, device, nn, save as torch_save + from torch.nn import Module + try: + from torch.nn import Identity + except ImportError: + # Older PyTorch compatibility + class Identity(nn.Module): + r"""A placeholder identity operator that is argument-insensitive.""" + + def __init__(self, *args, **kwargs): + super().__init__() + + def forward(self, input): + return input +else: + from fastNLP.core.utils.dummy_class import( + DummyClass as Module, + DummyClass as torch_save, + ) + +_init_weights = True + +@contextmanager +def no_init_weights(_enable=True): + """ + Context manager to globally disable weight initialization to speed up loading large models. + + TODO(Patrick): Delete safety argument `_enable=True` at next major version. . + """ + global _init_weights + if _enable: + _init_weights = False + try: + yield + finally: + _init_weights = True + +def find_pruneable_heads_and_indices( + heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int] +) -> Tuple[Set[int], "torch.LongTensor"]: + """ + Finds the heads and their indices taking :obj:`already_pruned_heads` into account. 
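[Editor's note] The bookkeeping done by `find_pruneable_heads_and_indices` is easiest to see on concrete numbers. A minimal sketch, assuming the function is importable from the new `fastNLP.transformers.torch.modeling_utils` module added by this patch:

    import torch
    from fastNLP.transformers.torch.modeling_utils import find_pruneable_heads_and_indices  # assumed import path

    # 12 heads of size 64; head 0 was pruned in an earlier call, now heads {0, 2} are requested again
    heads, index = find_pruneable_heads_and_indices(
        heads=[0, 2], n_heads=12, head_size=64, already_pruned_heads={0})
    print(heads)         # {2} -- head 0 is dropped from the request because it is already gone
    print(index.shape)   # torch.Size([704]) -- flattened dimensions of the 11 surviving heads (11 * 64)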
+ + Args: + heads (:obj:`List[int]`): List of the indices of heads to prune. + n_heads (:obj:`int`): The number of heads in the model. + head_size (:obj:`int`): The size of each head. + already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads. + + Returns: + :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices. + """ + mask = torch.ones(n_heads, head_size) + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in already_pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index: torch.LongTensor = torch.arange(len(mask))[mask].long() + return heads, index + +def get_parameter_device(parameter: Union[Module, GenerationMixin, "ModuleUtilsMixin"]): + try: + return next(parameter.parameters()).device + except StopIteration: + # For nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: Union["nn.Module", GenerationMixin, "ModuleUtilsMixin"]): + try: + return next(parameter.parameters()).dtype + except StopIteration: + # For nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].dtype + +class ModuleUtilsMixin: + """ + A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin. + """ + + @staticmethod + def _hook_rss_memory_pre_forward(module, *args, **kwargs): + try: + import psutil + except (ImportError): + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_pre_forward = mem.rss + return None + + @staticmethod + def _hook_rss_memory_post_forward(module, *args, **kwargs): + try: + import psutil + except (ImportError): + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_post_forward = mem.rss + mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward + module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) + return None + + def add_memory_hooks(self): + """ + Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. + + Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to + zero with :obj:`model.reset_memory_hooks_state()`. 
+ """ + for module in self.modules(): + module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) + module.register_forward_hook(self._hook_rss_memory_post_forward) + self.reset_memory_hooks_state() + + def reset_memory_hooks_state(self): + """ + Reset the :obj:`mem_rss_diff` attribute of each module (see + :func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`). + """ + for module in self.modules(): + module.mem_rss_diff = 0 + module.mem_rss_post_forward = 0 + module.mem_rss_pre_forward = 0 + + @property + def device(self) -> "device": + """ + :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> "torch.dtype": + """ + :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). + """ + return get_parameter_dtype(self) + + def invert_attention_mask(self, encoder_attention_mask: "Tensor") -> "Tensor": + """ + Invert an attention mask (e.g., switches 0. and 1.). + + Args: + encoder_attention_mask (:obj:`torch.Tensor`): An attention mask. + + Returns: + :obj:`torch.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + + if self.dtype == torch.float16: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4 + elif self.dtype == torch.float32: + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 + else: + raise ValueError( + f"{self.dtype} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`" + ) + + return encoder_extended_attention_mask + + def get_extended_attention_mask(self, attention_mask: "Tensor", input_shape: Tuple[int], device: "device") -> "Tensor": + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def get_head_mask( + self, head_mask: Optional["Tensor"], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> "Tensor": + """ + Prepare the head mask if needed. + + Args: + head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (:obj:`int`): + The number of hidden layers in the model. + is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the attentions scores are computed by chunks or not. + + Returns: + :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or + list with :obj:`[None]` for each layer. 
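[Editor's note] The decoder branch above combines the causal triangle with the padding mask before converting to the additive -10000.0 form. The same arithmetic in isolation, plain torch only:

    import torch

    batch_size, seq_length = 1, 4
    attention_mask = torch.tensor([[1, 1, 1, 0]])          # last position is padding
    seq_ids = torch.arange(seq_length)
    causal = (seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]).float()
    extended = causal[:, None, :, :] * attention_mask[:, None, None, :].float()
    extended = (1.0 - extended) * -10000.0
    print(extended[0, 0])
    # row i holds the bias for query position i: 0.0 for visible keys, -10000.0 for future keys and padding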
+ """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (optionally, trainable or non-embeddings) parameters in the module. + + Args: + only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of trainable parameters + + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return only the number of non-embeddings parameters + + Returns: + :obj:`int`: The number of parameters. + """ + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) + ] + non_embedding_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable) + else: + return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable) + + def estimate_tokens(self, input_dict: Dict[str, Union["torch.Tensor", Any]]) -> int: + """ + Helper function to estimate the total number of tokens from the model inputs. + + Args: + inputs (:obj:`dict`): The model inputs. + + Returns: + :obj:`int`: The total number of tokens. + """ + token_inputs = [tensor for key, tensor in input_dict.items() if "input" in key] + if token_inputs: + return sum([token_input.numel() for token_input in token_inputs]) + else: + logger.warn( + "Could not estimate the number of tokens of the input, floating-point operations will not be computed" + ) + return 0 + + def floating_point_ops( + self, input_dict: Dict[str, Union["torch.Tensor", Any]], exclude_embeddings: bool = True + ) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a + batch with this transformer model. Default approximation neglects the quadratic dependency on the number of + tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper + `__ section 2.1. Should be overridden for transformers with parameter + re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths. + + Args: + batch_size (:obj:`int`): + The batch size for the forward pass. + + sequence_length (:obj:`int`): + The number of tokens in each line of the batch. + + exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to count embedding and softmax operations. + + Returns: + :obj:`int`: The number of floating-point operations. 
+ """ + + return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) + + +class PreTrainedModel(Module, ModuleUtilsMixin, GenerationMixin): + r""" + Base class for all models. + + :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods + for loading, downloading and saving models as well as a few methods common to all models to: + + * resize the input embeddings, + * prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + + - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of + :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. + - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch + model, taking as arguments: + + - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the + TensorFlow checkpoint. + - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to + the model. + - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint. + + - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in + derived classes of the same architecture adding modules on top of the base model. + - **is_parallelizable** (:obj:`bool`) -- A flag indicating whether this model supports model parallelization. + """ + config_class = None + base_model_prefix = "" + # a list of re pattern of tensor names to ignore from the model when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_missing = None + # a list of re pattern of tensor names to ignore from the weights when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_unexpected = None + # a list of of tensor names to ignore when saving the model (useful for keys that aren't + # trained, but which are deterministic, or tied variables) + _keys_to_ignore_on_save = None + + is_parallelizable = False + supports_gradient_checkpointing = False + + @property + def dummy_inputs(self) -> Dict[str, "torch.Tensor"]: + """ + :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network. + """ + return {"input_ids": torch.tensor(DUMMY_INPUTS)} + + def __init__(self, config: PretrainedConfig, *inputs, **kwargs): + super().__init__() + if not isinstance(config, PretrainedConfig): + raise ValueError( + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " + "`PretrainedConfig`. To create a model from a pretrained model use " + f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + # Save config and origin of the pretrained weights if given in model + self.config = config + self.name_or_path = config.name_or_path + if getattr(self.config, "gradient_checkpointing", False): + self.gradient_checkpointing_enable() + # Remove the attribute now that is has been consumed, so it's no saved in the config. + delattr(self.config, "gradient_checkpointing") + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + + Args: + torch_dtype (:obj:`torch.dtype`, `optional`): + Override the default ``torch.dtype`` and load the model under this dtype. 
+ """ + torch_dtype = kwargs.pop("torch_dtype", None) + + # override default dtype if needed + dtype_orig = None + if torch_dtype is not None: + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + if is_deepspeed_zero3_enabled(): + import deepspeed + + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()): + model = cls(config, **kwargs) + else: + model = cls(config, **kwargs) + + # restore default dtype if it was modified + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + return model + + @classmethod + def _set_default_torch_dtype(cls, dtype: "torch.dtype") -> "torch.dtype": + """ + Change the default dtype and return the previous one. This is needed when wanting to instantiate the model + under specific dtype. + + Args: + dtype (:obj:`torch.dtype`): + a floating dtype to set to. + + Returns: + :obj:`torch.dtype`: the original ``dtype`` that can be used to restore ``torch.set_default_dtype(dtype)`` + if it was modified. If it wasn't, returns :obj:`None`. + + Note ``set_default_dtype`` currently only works with floating-point types and asserts if for example, + ``torch.int64`` is passed. So if a non-float ``dtype`` is passed this functions will throw an exception. + """ + if not dtype.is_floating_point: + raise ValueError( + f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype" + ) + + logger.info(f"Instantiating {cls.__name__} model under default dtype {dtype}.") + dtype_orig = torch.get_default_dtype() + torch.set_default_dtype(dtype) + return dtype_orig + + @property + def base_model(self) -> "nn.Module": + """ + :obj:`torch.nn.Module`: The main body of the model. + """ + return getattr(self, self.base_model_prefix, self) + + def get_input_embeddings(self) -> "nn.Module": + """ + Returns the model's input embeddings. + + Returns: + :obj:`nn.Module`: A torch module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + else: + raise NotImplementedError + + def set_input_embeddings(self, value: "nn.Module"): + """ + Set model's input embeddings. + + Args: + value (:obj:`nn.Module`): A module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + base_model.set_input_embeddings(value) + else: + raise NotImplementedError + + def get_output_embeddings(self) -> "nn.Module": + """ + Returns the model's output embeddings. + + Returns: + :obj:`nn.Module`: A torch module mapping hidden states to vocabulary. + """ + return None # Overwrite for models with output embeddings + + def _init_weights(self, module): + """ + Initialize the weights. This method should be overridden by derived class. + """ + raise NotImplementedError(f"Make sure `_init_weigths` is implemented for {self.__class__}") + + def tie_weights(self): + """ + Tie the weights between the input embeddings and the output embeddings. + + If the :obj:`torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning + the weights instead. 
+ """ + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None and self.config.tie_word_embeddings: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + if self.config.is_encoder_decoder and self.config.tie_encoder_decoder: + if hasattr(self, self.base_model_prefix): + self = getattr(self, self.base_model_prefix) + self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) + + for module in self.modules(): + if hasattr(module, "_tie_weights"): + module._tie_weights() + + @staticmethod + def _tie_encoder_decoder_weights(encoder: "nn.Module", decoder: "nn.Module", base_model_prefix: str): + uninitialized_encoder_weights: List[str] = [] + if decoder.__class__ != encoder.__class__: + logger.info( + f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized." + ) + + def tie_encoder_to_decoder_recursively( + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + depth=0, + ): + assert isinstance(decoder_pointer, nn.Module) and isinstance( + encoder_pointer, nn.Module + ), f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module" + if hasattr(decoder_pointer, "weight"): + assert hasattr(encoder_pointer, "weight") + encoder_pointer.weight = decoder_pointer.weight + if hasattr(decoder_pointer, "bias"): + assert hasattr(encoder_pointer, "bias") + encoder_pointer.bias = decoder_pointer.bias + return + + encoder_modules = encoder_pointer._modules + decoder_modules = decoder_pointer._modules + if len(decoder_modules) > 0: + assert ( + len(encoder_modules) > 0 + ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" + + all_encoder_weights = set([module_name + "/" + sub_name for sub_name in encoder_modules.keys()]) + encoder_layer_pos = 0 + for name, module in decoder_modules.items(): + if name.isdigit(): + encoder_name = str(int(name) + encoder_layer_pos) + decoder_name = name + if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len( + encoder_modules + ) != len(decoder_modules): + # this can happen if the name corresponds to the position in a list module list of layers + # in this case the decoder has added a cross-attention that the encoder does not have + # thus skip this step and subtract one layer pos from encoder + encoder_layer_pos -= 1 + continue + elif name not in encoder_modules: + continue + elif depth > 500: + raise ValueError( + "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model." 
+ ) + else: + decoder_name = encoder_name = name + tie_encoder_to_decoder_recursively( + decoder_modules[decoder_name], + encoder_modules[encoder_name], + module_name + "/" + name, + uninitialized_encoder_weights, + depth=depth + 1, + ) + all_encoder_weights.remove(module_name + "/" + encoder_name) + + uninitialized_encoder_weights += list(all_encoder_weights) + + # tie weights recursively + tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights) + if len(uninitialized_encoder_weights) > 0: + logger.warning( + f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" + ) + + def _tie_or_clone_weights(self, output_embeddings, input_embeddings): + """Tie or clone module weights depending of whether we are using TorchScript or not""" + if self.config.torchscript: + output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) + else: + output_embeddings.weight = input_embeddings.weight + + if getattr(output_embeddings, "bias", None) is not None: + output_embeddings.bias.data = nn.functional.pad( + output_embeddings.bias.data, + ( + 0, + output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0], + ), + "constant", + 0, + ) + if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): + output_embeddings.out_features = input_embeddings.num_embeddings + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> "nn.Embedding": + """ + Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method. + + Arguments: + new_num_tokens (:obj:`int`, `optional`): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`, + just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing + anything. + + Return: + :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. + """ + model_embeds = self._resize_token_embeddings(new_num_tokens) + if new_num_tokens is None: + return model_embeds + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + self.vocab_size = new_num_tokens + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.get_input_embeddings() + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.set_input_embeddings(new_embeddings) + + # if word embeddings are not tied, make sure that lm head is resized as well + if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: + old_lm_head = self.get_output_embeddings() + new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) + self.set_output_embeddings(new_lm_head) + + return self.get_input_embeddings() + + def _get_resized_embeddings( + self, old_embeddings: "nn.Embedding", new_num_tokens: Optional[int] = None + ) -> "nn.Embedding": + """ + Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly + initialized vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_embeddings (:obj:`torch.nn.Embedding`): + Old embeddings to be resized. 
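[Editor's note] `resize_token_embeddings` updates `config.vocab_size` and re-ties any output embeddings to the new matrix. A minimal sketch, assuming `BertForMaskedLM` is re-exported from the migrated `fastNLP.transformers.torch.models.bert` package:

    from fastNLP.transformers.torch.models.bert import BertConfig, BertForMaskedLM  # assumed re-exports

    config = BertConfig(vocab_size=100, hidden_size=32, num_hidden_layers=2,
                        num_attention_heads=2, intermediate_size=64)
    model = BertForMaskedLM(config)

    new_emb = model.resize_token_embeddings(108)         # e.g. after adding 8 new tokens to the tokenizer
    print(new_emb.weight.shape)                          # torch.Size([108, 32])
    print(model.config.vocab_size)                       # 108
    print(model.get_output_embeddings().weight.shape)    # torch.Size([108, 32]) -- re-tied by tie_weights()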
+ new_num_tokens (:obj:`int`, `optional`): + New number of tokens in the embedding matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens + :obj:`torch.nn.Embedding`` module of the model without doing anything. + + Return: + :obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if + :obj:`new_num_tokens` is :obj:`None` + """ + if new_num_tokens is None: + return old_embeddings + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=None): + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + else: + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + + if old_num_tokens == new_num_tokens: + return old_embeddings + + if not isinstance(old_embeddings, nn.Embedding): + raise TypeError( + f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}." + f"You should either use a different resize function or make sure that `old_embeddings` are an instance of {nn.Embedding}." + ) + + # Build new embeddings + new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim).to( + self.device, dtype=old_embeddings.weight.dtype + ) + + # initialize all new embeddings (in particular added tokens) + self._init_weights(new_embeddings) + + # Copy token embeddings from the previous weights + + # numbers of tokens to copy + n = min(old_num_tokens, new_num_tokens) + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + else: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + + return new_embeddings + + def _get_resized_lm_head( + self, old_lm_head: "nn.Linear", new_num_tokens: Optional[int] = None, transposed: Optional[bool] = False + ) -> "nn.Linear": + """ + Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_lm_head (:obj:`torch.nn.Linear`): + Old lm head liner layer to be resized. + new_num_tokens (:obj:`int`, `optional`): + New number of tokens in the linear matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens + :obj:`torch.nn.Linear`` module of the model without doing anything. + transposed (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether ``old_lm_head`` is transposed or not. If True ``old_lm_head.size()`` is ``lm_head_dim, + vocab_size`` else ``vocab_size, lm_head_dim``. 
+ + Return: + :obj:`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if + :obj:`new_num_tokens` is :obj:`None` + """ + if new_num_tokens is None: + return old_lm_head + + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=None): + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + else: + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + + if old_num_tokens == new_num_tokens: + return old_lm_head + + if not isinstance(old_lm_head, nn.Linear): + raise TypeError( + f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}." + f"You should either use a different resize function or make sure that `old_lm_head` are an instance of {nn.Linear}." + ) + + # Build new lm head + new_lm_head_shape = (old_lm_head_dim, new_num_tokens) if not transposed else (new_num_tokens, old_lm_head_dim) + has_new_lm_head_bias = old_lm_head.bias is not None + new_lm_head = nn.Linear(*new_lm_head_shape, bias=has_new_lm_head_bias).to(self.device) + + # initialize new lm head (in particular added tokens) + self._init_weights(new_lm_head) + + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + + # XXX: put the long block of code in a wrapper + if is_deepspeed_zero3_enabled(): + import deepspeed + + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=0): + if torch.distributed.get_rank() == 0: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[ + :num_tokens_to_copy, : + ] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[ + :, :num_tokens_to_copy + ] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + else: + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + + return new_lm_head + + def resize_position_embeddings(self, new_num_position_embeddings: int): + raise NotImplementedError( + f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def get_position_embeddings(self) -> Union["nn.Embedding", Tuple["nn.Embedding"]]: + raise NotImplementedError( + f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def init_weights(self): + """ + If needed prunes and maybe initializes weights. + """ + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + + if _init_weights: + # Initialize weights + self.apply(self._init_weights) + + # Tie weights should be skipped when not initializing all weights + # since from_pretrained(...) 
calls tie weights anyways + self.tie_weights() + + def prune_heads(self, heads_to_prune: Dict[int, List[int]]): + """ + Prunes heads of the base model. + + Arguments: + heads_to_prune (:obj:`Dict[int, List[int]]`): + Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of + heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads + 0 and 2 on layer 1 and heads 2 and 3 on layer 2. + """ + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads + for layer, heads in heads_to_prune.items(): + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + self.base_model._prune_heads(heads_to_prune) + + def gradient_checkpointing_enable(self, flag: bool = True): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def gradient_checkpointing_disable(self, flag: bool = True): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + save_config: bool = True, + state_dict: Optional[dict] = None, + save_function: Callable = torch_save, + **kwargs, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `:func:`~transformers.PreTrainedModel.from_pretrained`` class method. + + Arguments: + save_directory (:obj:`str` or :obj:`os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + save_config (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to save the config of the model. Useful when in distributed training like TPUs and need + to call this function on all processes. In this case, set :obj:`save_config=True` only on the main + process to avoid race conditions. + state_dict (nested dictionary of :obj:`torch.Tensor`): + The state dictionary of the model to save. Will default to :obj:`self.state_dict()`, but can be used to + only save parts of the model or if special precautions need to be taken when recovering the state + dictionary of a model (like when using model parallelism). + save_function (:obj:`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace :obj:`torch.save` by another method. + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. 
+ + kwargs: + Additional key word arguments passed along to the + :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method. + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + # Only save the model itself if we are using distributed training + model_to_save = unwrap_model(self) + + # save the string version of dtype to the config, e.g. convert torch.float32 => "float32" + # we currently don't use this setting automatically, but may start to use with v5 + dtype = get_parameter_dtype(model_to_save) + model_to_save.config.torch_dtype = str(dtype).split(".")[1] + + # Attach architecture to the config + model_to_save.config.architectures = [model_to_save.__class__.__name__] + + # Save the config + if save_config: + model_to_save.config.save_pretrained(save_directory) + + # Save the model + if state_dict is None: + state_dict = model_to_save.state_dict() + + # Handle the case where some state_dict keys shouldn't be saved + if self._keys_to_ignore_on_save is not None: + state_dict = {k: v for k, v in state_dict.items() if k not in self._keys_to_ignore_on_save} + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_file = os.path.join(save_directory, WEIGHTS_NAME) + save_function(state_dict, output_model_file) + + logger.info(f"Model weights saved in {output_model_file}") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + r""" + Instantiate a pretrained pytorch model from a pre-trained model configuration. + + The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To + train the model, you should first set it back in training mode with ``model.train()``. + + The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, `optional`): + Can be either: + + - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under + a user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing model weights saved using + :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``. + - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In + this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided + as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in + a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a `flax checkpoint file` in `.msgpack` format (e.g, + ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set + to :obj:`True`. + - :obj:`None` if you are both providing the configuration and state dictionary (resp. 
with keyword + arguments ``config`` and ``state_dict``). + model_args (sequence of positional arguments, `optional`): + All remaining positional arguments will be passed to the underlying model's ``__init__`` method. + config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`): + Can be either: + + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, + - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the `model id` string of a pretrained + model). + - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded + by supplying the save directory. + - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a + configuration JSON file named `config.json` is found in the directory. + state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using + :func:`~transformers.PreTrainedModel.save_pretrained` and + :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + cache_dir (:obj:`Union[str, os.PathLike]`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + ``pretrained_model_name_or_path`` argument). + from_flax (:obj:`bool`, `optional`, defaults to :obj:`False`): + Load the model weights from a Flax checkpoint save file (see docstring of + ``pretrained_model_name_or_path`` argument). + ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to raise an error if some of the weights from the checkpoint do not have the same size + as the weights of the model (if for instance, you are instantiating a model with 10 labels from a + checkpoint with 3 labels). + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str], `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. 
If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + mirror(:obj:`str`, `optional`): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + _fast_init(:obj:`bool`, `optional`, defaults to `:obj:`True`): + Whether or not to disable fast initialization. + low_cpu_mem_usage(:obj:`bool`, `optional`, defaults to `:obj:`False`): + Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + This is an experimental feature and a subject to change at any moment. + torch_dtype (:obj:`str` or :obj:`torch.dtype`, `optional`): + Override the default ``torch.dtype`` and load the model under this dtype. If ``"auto"`` is passed the + dtype will be automatically derived from the model's weights. + + .. warning:: + + One should only disable `_fast_init` to ensure backwards compatibility with + ``transformers.__version__ < 4.6.0`` for seeded model initialization. This argument will be removed + at the next major version. See `pull request 11471 + `__ for more information. + + kwargs (remaining dictionary of keyword arguments, `optional`): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or + automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the + underlying model's ``__init__`` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class + initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of + ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute + with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration + attribute will be passed to the underlying model's ``__init__`` function. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + .. note:: + + Activate the special `"offline-mode" + `__ to use this method in a firewalled + environment. + + Examples:: + + >>> from transformers import BertConfig, BertModel + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BertModel.from_pretrained('bert-base-uncased') + >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable). + >>> model = BertModel.from_pretrained('./test/saved_model/') + >>> # Update configuration during loading. + >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True) + >>> assert model.config.output_attentions == True + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). 
+ >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) + >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower) + >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True) + + """ + config = kwargs.pop("config", None) + state_dict = kwargs.pop("state_dict", None) + cache_dir = kwargs.pop("cache_dir", None) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + mirror = kwargs.pop("mirror", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + _fast_init = kwargs.pop("_fast_init", True) + torch_dtype = kwargs.pop("torch_dtype", None) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) + + user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + *model_args, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + revision=revision, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {[WEIGHTS_NAME]} found in " + f"directory {pretrained_model_name_or_path}" + ) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + raise ValueError( + f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, which is not supported" + ) + else: + # set correct filename + filename = WEIGHTS_NAME + + archive_file = hf_bucket_url( + pretrained_model_name_or_path, + filename=filename, + revision=revision, + mirror=mirror, + ) + + try: + # Load from URL or cache if already cached + resolved_archive_file = cached_path( + archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + except EnvironmentError as err: + 
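+ # Log the original download/cache error, then re-raise it below as an
+ # EnvironmentError whose message lists the likely causes: a wrong model id,
+ # a wrong local path, or (when given) an invalid `revision`.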
logger.error(err) + msg = ( + f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + if resolved_archive_file == archive_file: + logger.info(f"loading weights file {archive_file}") + else: + logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # load pt weights early so that we know which dtype to init the model under + if state_dict is None: + try: + state_dict = torch.load(resolved_archive_file, map_location="cpu") + except Exception as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{pretrained_model_name_or_path}' " + f"at '{resolved_archive_file}'" + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. " + ) + + # set dtype to instantiate the model under: + # 1. If torch_dtype is not None, we use that dtype + # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first + # weights entry - we assume all weights are of the same dtype + # we also may have config.torch_dtype available, but we won't rely on it till v5 + dtype_orig = None + if torch_dtype is not None: + if isinstance(torch_dtype, str): + if torch_dtype == "auto": + torch_dtype = next(iter(state_dict.values())).dtype + else: + raise ValueError( + f"`torch_dtype` can be either a `torch.dtype` or `auto`, but received {torch_dtype}" + ) + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + if low_cpu_mem_usage: + # save the keys + loaded_state_dict_keys = [k for k in state_dict.keys()] + del state_dict # free CPU memory - will reload again later + + config.name_or_path = pretrained_model_name_or_path + + # Instantiate model. 
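+ # Under DeepSpeed ZeRO-3 the constructor runs inside `deepspeed.zero.Init` so that
+ # parameters are partitioned across devices as they are created; in both branches
+ # `no_init_weights(_enable=_fast_init)` skips random initialization of weights that
+ # the checkpoint is about to overwrite.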
+ if is_deepspeed_zero3_enabled(): + import deepspeed + + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()): + with no_init_weights(_enable=_fast_init): + model = cls(config, *model_args, **model_kwargs) + else: + with no_init_weights(_enable=_fast_init): + model = cls(config, *model_args, **model_kwargs) + + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + if low_cpu_mem_usage: + cls._load_state_dict_into_model_low_mem(model, loaded_state_dict_keys, resolved_archive_file) + else: + model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_state_dict_into_model( + model, + state_dict, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _fast_init=_fast_init, + ) + + # make sure token embedding weights are still tied if needed + model.tie_weights() + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + + if output_loading_info: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + return model, loading_info + + return model + + @classmethod + def _load_state_dict_into_model( + cls, model, state_dict, pretrained_model_name_or_path, ignore_mismatched_sizes=False, _fast_init=True + ): + + # Convert old format to new format if needed from a PyTorch state_dict + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + # Retrieve missing & unexpected_keys + model_state_dict = model.state_dict() + expected_keys = list(model_state_dict.keys()) + loaded_keys = list(state_dict.keys()) + prefix = model.base_model_prefix + + has_prefix_module = any(s.startswith(prefix) for s in loaded_keys) + expects_prefix_module = any(s.startswith(prefix) for s in expected_keys) + + # key re-naming operations are never done on the keys + # that are loaded, but always on the keys of the newly initialized model + remove_prefix = not has_prefix_module and expects_prefix_module + add_prefix = has_prefix_module and not expects_prefix_module + + if remove_prefix: + expected_keys_not_prefixed = [s for s in expected_keys if not s.startswith(prefix)] + expected_keys = [".".join(s.split(".")[1:]) if s.startswith(prefix) else s for s in expected_keys] + elif add_prefix: + expected_keys = [".".join([prefix, s]) for s in expected_keys] + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not + # matching the weights in the model. 
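+ # They are only collected when `ignore_mismatched_sizes=True`; the offending entries
+ # are removed from `state_dict` so the corresponding model weights keep their fresh
+ # initialization instead of failing to load.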
+ mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + model_key = checkpoint_key + if remove_prefix and checkpoint_key.startswith(prefix): + model_key = ".".join(checkpoint_key.split(".")[1:]) + elif add_prefix: + model_key = f"{prefix}.{checkpoint_key}" + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + + # Some models may have keys that are not in the state by design, removing them before needlessly warning + # the user. + if cls._keys_to_ignore_on_load_missing is not None: + for pat in cls._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if _fast_init: + # retrieve unintialized modules and initialize + uninitialized_modules = model.retrieve_modules_from_names( + missing_keys, add_prefix=add_prefix, remove_prefix=remove_prefix + ) + for module in uninitialized_modules: + model._init_weights(module) + + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, "_metadata", None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + error_msgs = [] + + # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants + # so we need to apply the function recursively. + def load(module: nn.Module, prefix=""): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) + if is_deepspeed_zero3_enabled(): + import deepspeed + + # because zero3 puts placeholders in model params, this context + # manager gathers (unpartitions) the params of the current layer, then loads from + # the state dict and then re-partitions them again + with deepspeed.zero.GatheredParameters(list(module.parameters(recurse=False)), modifier_rank=0): + if torch.distributed.get_rank() == 0: + module._load_from_state_dict(*args) + else: + module._load_from_state_dict(*args) + + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + ".") + + # Make sure we are able to load base models as well as derived models (with heads) + start_prefix = "" + model_to_load = model + if not hasattr(model, cls.base_model_prefix) and has_prefix_module: + start_prefix = cls.base_model_prefix + "." + if hasattr(model, cls.base_model_prefix) and not has_prefix_module: + model_to_load = getattr(model, cls.base_model_prefix) + if any(key in expected_keys_not_prefixed for key in loaded_keys): + raise ValueError( + "The state dictionary of the model you are training to load is corrupted. Are you sure it was " + "properly saved?" 
+ ) + + load(model_to_load, prefix=start_prefix) + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when " + f"initializing {model.__class__.__name__}: {unexpected_keys}\n" + f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task " + f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n" + f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect " + f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized: {missing_keys}\n" + f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n" + f"If your task is similar to the task the model of the checkpoint was trained on, " + f"you can already use {model.__class__.__name__} for predictions without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} " + f"and are newly initialized because the shapes did not match:\n{mismatched_warning}\n" + f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + + def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False): + module_keys = set([".".join(key.split(".")[:-1]) for key in names]) + + # torch.nn.ParameterList is a special case where two parameter keywords + # are appended to the module name, *e.g.* bert.special_embeddings.0 + module_keys = module_keys.union(set([".".join(key.split(".")[:-2]) for key in names if key[-1].isdigit()])) + + retrieved_modules = [] + # retrieve all modules that has at least one missing weight name + for name, module in self.named_modules(): + if remove_prefix: + name = ".".join(name.split(".")[1:]) if name.startswith(self.base_model_prefix) else name + elif add_prefix: + name = ".".join([self.base_model_prefix, name]) if len(name) > 0 else self.base_model_prefix + + if name in module_keys: + retrieved_modules.append(module) + + return retrieved_modules + + @classmethod + def _load_state_dict_into_model_low_mem(cls, model, loaded_state_dict_keys, resolved_archive_file): + """ + This is an experimental function that loads the model using ~1.x model size CPU memory + + Before it gets called we do: + + 1. 
save which state_dict keys we have + 2. drop state_dict before model is created, since the latter takes 1x model size memory + + Here then we continue: + + 3. switch to the meta device all params/buffers that are going to be replaced from the loaded state_dict + 4. load state_dict 2nd time + 5. replace the params/buffers from the state_dict + + Currently, it doesn't handle missing_keys, unexpected_keys, mismatched_keys. It can't handle deepspeed. + """ + + require_version_core("torch>=1.9") + if is_deepspeed_zero3_enabled(): + raise ValueError("low_cpu_mem_usage arg cannot be used with DeepSpeed ZeRO-3") + + # a helper util to find the last sub-module and the param/buffer name + def find_submodule_and_param_name(model, long_key): + split_key = long_key.split(".") + submodule = model + while len(split_key) > 1: + if hasattr(submodule, split_key[0]): + submodule = getattr(submodule, split_key[0]) + del split_key[0] + else: + submodule = None + break + return submodule, split_key[0] + + # dematerialize param storage for keys that are going to be replaced by state_dict, by + # putting those on the meta device + for k in loaded_state_dict_keys: + submodule, param_name = find_submodule_and_param_name(model, k) + if submodule is not None: + # selectively switch to the meta device only those params/buffers that will + # be next replaced from state_dict. This a complex way to do p.to_("meta") + # since we have no in-place to_ for tensors. + new_val = getattr(submodule, param_name) + if isinstance(new_val, torch.nn.Parameter): + # isinstance returns False for Params on meta device, so switch after the check + new_val = torch.nn.Parameter(new_val.to("meta")) + else: + new_val = new_val.to("meta") + setattr(submodule, param_name, new_val) + + # only now can load state_dict + state_dict = torch.load(resolved_archive_file, map_location="cpu") + + # materialize state_dict entries one by one on CPU + for k in loaded_state_dict_keys: + submodule, param_name = find_submodule_and_param_name(model, k) + if submodule is not None: + new_val = state_dict[k] + if isinstance(getattr(submodule, param_name), torch.nn.Parameter): + new_val = torch.nn.Parameter(new_val) + setattr(submodule, param_name, new_val) + + del state_dict + +class Conv1D(Module): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + + Basically works like a linear layer but the weights are transposed. + + Args: + nf (:obj:`int`): The number of output features. + nx (:obj:`int`): The number of input features. + """ + + def __init__(self, nf, nx): + super().__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = nn.Parameter(w) + self.bias = nn.Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class SequenceSummary(Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config (:class:`~transformers.PretrainedConfig`): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (:obj:`str`) -- The method to use to make this summary. 
Accepted values are: + + - :obj:`"last"` -- Take the last token hidden state (like XLNet) + - :obj:`"first"` -- Take the first token hidden state (like Bert) + - :obj:`"mean"` -- Take the mean of all tokens hidden states + - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - :obj:`"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to + :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`). + - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the + output, another string or :obj:`None` will add no activation. + - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and + activation. + - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and + activation. + """ + + def __init__(self, config: PretrainedConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else Identity() + + self.first_dropout = Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + def forward( + self, hidden_states: "torch.FloatTensor", cls_index: Optional["torch.LongTensor"] = None + ) -> "torch.FloatTensor": + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`): + Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification + token. + + Returns: + :obj:`torch.FloatTensor`: The summary of the sequence hidden states. 
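+ 
+ Example (a minimal sketch, assuming ``config`` is a GPT-2 style config and
+ ``hidden_states`` / ``cls_index`` are tensors with the shapes described above)::
+ 
+ >>> summary = SequenceSummary(config)
+ >>> summary_vector = summary(hidden_states, cls_index=cls_index)
+ >>> # shape: (batch_size, num_labels) if a projection to labels is configured,
+ >>> # otherwise (batch_size, hidden_size)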
+ """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +def unwrap_model(model: "nn.Module") -> "nn.Module": + """ + Recursively unwraps a model from potential containers (as used in distributed training). + + Args: + model (:obj:`torch.nn.Module`): The model to unwrap. + """ + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + +def prune_linear_layer(layer: "nn.Linear", index: "torch.LongTensor", dim: int = 0) -> "nn.Linear": + """ + Prune a linear layer to keep only entries in index. + + Used to remove heads. + + Args: + layer (:obj:`torch.nn.Linear`): The layer to prune. + index (:obj:`torch.LongTensor`): The indices to keep in the layer. + dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices. + + Returns: + :obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if layer.bias is not None: + if dim == 1: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + +def prune_conv1d_layer(layer: Conv1D, index: "torch.LongTensor", dim: int = 1) -> Conv1D: + """ + Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights + are transposed. + + Used to remove heads. + + Args: + layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune. + index (:obj:`torch.LongTensor`): The indices to keep in the layer. + dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices. + + Returns: + :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`. 
+ """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).clone().detach() + if dim == 0: + b = layer.bias.clone().detach() + else: + b = layer.bias[index].clone().detach() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + +def apply_chunking_to_forward( + forward_fn: Callable[..., "torch.Tensor"], chunk_size: int, chunk_dim: int, *input_tensors +) -> "torch.Tensor": + """ + This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the + dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory. + + If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as + directly applying :obj:`forward_fn` to :obj:`input_tensors`. + + Args: + forward_fn (:obj:`Callable[..., torch.Tensor]`): + The forward function of the model. + chunk_size (:obj:`int`): + The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`. + chunk_dim (:obj:`int`): + The dimension over which the :obj:`input_tensors` should be chunked. + input_tensors (:obj:`Tuple[torch.Tensor]`): + The input tensors of ``forward_fn`` which will be chunked + + Returns: + :obj:`torch.Tensor`: A tensor with the same shape as the :obj:`forward_fn` would have given if applied`. + + + Examples:: + + # rename the usual forward() fn to forward_chunk() + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + # implement a chunked forward function + def forward(self, hidden_states): + return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + """ + + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return torch.cat(output_chunks, dim=chunk_dim) + + 
return forward_fn(*input_tensors) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/__init__.py b/fastNLP/transformers/torch/models/__init__.py new file mode 100644 index 00000000..ddf3005f --- /dev/null +++ b/fastNLP/transformers/torch/models/__init__.py @@ -0,0 +1,5 @@ +from .bart import * +from .bert import * +from .cpt import * +from .gpt2 import * +from .roberta import * \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/auto/configuration_auto.py b/fastNLP/transformers/torch/models/auto/configuration_auto.py new file mode 100644 index 00000000..bcd7576c --- /dev/null +++ b/fastNLP/transformers/torch/models/auto/configuration_auto.py @@ -0,0 +1,541 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Config class. """ +import importlib +import re +from collections import OrderedDict +from typing import List, Union + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.transformers.torch.file_utils import CONFIG_NAME +from fastNLP.core.log import logger + + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + # Add configs here + ("fnet", "FNetConfig"), + ("gptj", "GPTJConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("beit", "BeitConfig"), + ("rembert", "RemBertConfig"), + ("visual_bert", "VisualBertConfig"), + ("canine", "CanineConfig"), + ("roformer", "RoFormerConfig"), + ("clip", "CLIPConfig"), + ("bigbird_pegasus", "BigBirdPegasusConfig"), + ("deit", "DeiTConfig"), + ("luke", "LukeConfig"), + ("detr", "DetrConfig"), + ("gpt_neo", "GPTNeoConfig"), + ("big_bird", "BigBirdConfig"), + ("speech_to_text_2", "Speech2Text2Config"), + ("speech_to_text", "Speech2TextConfig"), + ("vit", "ViTConfig"), + ("wav2vec2", "Wav2Vec2Config"), + ("m2m_100", "M2M100Config"), + ("convbert", "ConvBertConfig"), + ("led", "LEDConfig"), + ("blenderbot-small", "BlenderbotSmallConfig"), + ("retribert", "RetriBertConfig"), + ("ibert", "IBertConfig"), + ("mt5", "MT5Config"), + ("t5", "T5Config"), + ("mobilebert", "MobileBertConfig"), + ("distilbert", "DistilBertConfig"), + ("albert", "AlbertConfig"), + ("bert-generation", "BertGenerationConfig"), + ("camembert", "CamembertConfig"), + ("xlm-roberta", "XLMRobertaConfig"), + ("pegasus", "PegasusConfig"), + ("marian", "MarianConfig"), + ("mbart", "MBartConfig"), + ("megatron-bert", "MegatronBertConfig"), + ("mpnet", "MPNetConfig"), + ("bart", "BartConfig"), + ("blenderbot", "BlenderbotConfig"), + ("reformer", "ReformerConfig"), + ("longformer", "LongformerConfig"), + ("roberta", "RobertaConfig"), + ("deberta-v2", "DebertaV2Config"), + ("deberta", "DebertaConfig"), + ("flaubert", "FlaubertConfig"), + ("fsmt", "FSMTConfig"), + ("squeezebert", "SqueezeBertConfig"), + ("hubert", "HubertConfig"), + ("bert", "BertConfig"), + ("openai-gpt", "OpenAIGPTConfig"), + ("gpt2", "GPT2Config"), + ("transfo-xl", "TransfoXLConfig"), + ("xlnet", "XLNetConfig"), + ("xlm-prophetnet", "XLMProphetNetConfig"), + ("prophetnet", "ProphetNetConfig"), 
+ ("xlm", "XLMConfig"), + ("ctrl", "CTRLConfig"), + ("electra", "ElectraConfig"), + ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"), + ("encoder-decoder", "EncoderDecoderConfig"), + ("funnel", "FunnelConfig"), + ("lxmert", "LxmertConfig"), + ("dpr", "DPRConfig"), + ("layoutlm", "LayoutLMConfig"), + ("rag", "RagConfig"), + ("tapas", "TapasConfig"), + ("splinter", "SplinterConfig"), + ] +) + +CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( + [ + # Add archive maps here + ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gpt_neo", "GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("megatron-bert", "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("speech_to_text", "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("speech_to_text_2", "SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("m2m_100", "M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("led", "LED_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mbart", "MBART_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm-roberta", "XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("retribert", "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("funnel", "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("lxmert", "LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("squeezebert", "SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("xlm-prophetnet", 
"XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("prophetnet", "PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"), + ] +) + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + ("fnet", "FNet"), + ("gptj", "GPT-J"), + ("beit", "BeiT"), + ("rembert", "RemBERT"), + ("layoutlmv2", "LayoutLMv2"), + ("visual_bert", "VisualBert"), + ("canine", "Canine"), + ("roformer", "RoFormer"), + ("clip", "CLIP"), + ("bigbird_pegasus", "BigBirdPegasus"), + ("deit", "DeiT"), + ("luke", "LUKE"), + ("detr", "DETR"), + ("gpt_neo", "GPT Neo"), + ("big_bird", "BigBird"), + ("speech_to_text_2", "Speech2Text2"), + ("speech_to_text", "Speech2Text"), + ("vit", "ViT"), + ("wav2vec2", "Wav2Vec2"), + ("m2m_100", "M2M100"), + ("convbert", "ConvBERT"), + ("led", "LED"), + ("blenderbot-small", "BlenderbotSmall"), + ("retribert", "RetriBERT"), + ("ibert", "I-BERT"), + ("t5", "T5"), + ("mobilebert", "MobileBERT"), + ("distilbert", "DistilBERT"), + ("albert", "ALBERT"), + ("bert-generation", "Bert Generation"), + ("camembert", "CamemBERT"), + ("xlm-roberta", "XLM-RoBERTa"), + ("pegasus", "Pegasus"), + ("blenderbot", "Blenderbot"), + ("marian", "Marian"), + ("mbart", "mBART"), + ("megatron-bert", "MegatronBert"), + ("bart", "BART"), + ("reformer", "Reformer"), + ("longformer", "Longformer"), + ("roberta", "RoBERTa"), + ("flaubert", "FlauBERT"), + ("fsmt", "FairSeq Machine-Translation"), + ("squeezebert", "SqueezeBERT"), + ("bert", "BERT"), + ("openai-gpt", "OpenAI GPT"), + ("gpt2", "OpenAI GPT-2"), + ("transfo-xl", "Transformer-XL"), + ("xlnet", "XLNet"), + ("xlm", "XLM"), + ("ctrl", "CTRL"), + ("electra", "ELECTRA"), + ("encoder-decoder", "Encoder decoder"), + ("speech-encoder-decoder", "Speech Encoder decoder"), + ("funnel", "Funnel Transformer"), + ("lxmert", "LXMERT"), + ("deberta-v2", "DeBERTa-v2"), + ("deberta", "DeBERTa"), + ("layoutlm", "LayoutLM"), + ("dpr", "DPR"), + ("rag", "RAG"), + ("xlm-prophetnet", "XLMProphetNet"), + ("prophetnet", "ProphetNet"), + ("mt5", "mT5"), + ("mpnet", "MPNet"), + ("tapas", "TAPAS"), + ("hubert", "Hubert"), + ("barthez", "BARThez"), + ("phobert", "PhoBERT"), + ("cpm", "CPM"), + ("bertweet", "Bertweet"), + ("bert-japanese", "BertJapanese"), + ("byt5", "ByT5"), + ("mbart50", "mBART-50"), + ("splinter", "Splinter"), + ] +) + +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) + + +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + return key.replace("-", "_") + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. 
+ """ + + def __init__(self, mapping): + self._mapping = mapping + self._modules = {} + + def __getitem__(self, key): + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + return getattr(self._modules[module_name], value) + + def keys(self): + return self._mapping.keys() + + def values(self): + return [self[k] for k in self._mapping.keys()] + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + + def __iter__(self): + return iter(self._mapping.keys()) + + def __contains__(self, item): + return item in self._mapping + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + +class _LazyLoadAllMappings(OrderedDict): + """ + A mapping that will load all pairs of key values at the first access (either by indexing, requestions keys, values, + etc.) + + Args: + mapping: The mapping to load. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._initialized = False + self._data = {} + + def _initialize(self): + if self._initialized: + return + logger.warn( + "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP is deprecated and will be removed in v5 of Transformers. " + "It does not contain all available model checkpoints, far from it. Checkout hf.co/models for that.", + FutureWarning, + ) + + for model_type, map_name in self._mapping.items(): + module_name = model_type_to_module_name(model_type) + module = importlib.import_module(f".{module_name}", "transformers.models") + mapping = getattr(module, map_name) + self._data.update(mapping) + + self._initialized = True + + def __getitem__(self, key): + self._initialize() + return self._data[key] + + def keys(self): + self._initialize() + return self._data.keys() + + def values(self): + self._initialize() + return self._data.values() + + def items(self): + self._initialize() + return self._data.keys() + + def __iter__(self): + self._initialize() + return iter(self._data) + + def __contains__(self, item): + self._initialize() + return item in self._data + + +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = _LazyLoadAllMappings(CONFIG_ARCHIVE_MAP_MAPPING_NAMES) + + +def _get_class_name(model_class: Union[str, List[str]]): + if isinstance(model_class, (list, tuple)): + return " or ".join([f":class:`~transformers.{c}`" for c in model_class if c is not None]) + return f":class:`~transformers.{model_class}`" + + +def _list_model_options(indent, config_to_class=None, use_model_types=True): + if config_to_class is None and not use_model_types: + raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") + if use_model_types: + if config_to_class is None: + model_type_to_name = { + model_type: f":class:`~transformers.{config}`" for model_type, config in CONFIG_MAPPING_NAMES.items() + } + else: + model_type_to_name = { + model_type: _get_class_name(model_class) + for model_type, model_class in config_to_class.items() + if model_type in MODEL_NAMES_MAPPING + } + lines = [ + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" + for model_type in sorted(model_type_to_name.keys()) + ] + else: + config_to_name = { + CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) + for config, clas in config_to_class.items() + if config in CONFIG_MAPPING_NAMES + } + config_to_model_name = { + config: MODEL_NAMES_MAPPING[model_type] for model_type, config in 
CONFIG_MAPPING_NAMES.items() + } + lines = [ + f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" + for config_name in sorted(config_to_name.keys()) + ] + return "\n".join(lines) + + +def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): + def docstring_decorator(fn): + docstrings = fn.__doc__ + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] + if use_model_types: + indent = f"{indent} " + lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + + +class AutoConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the :meth:`~transformers.AutoConfig.from_pretrained` class method. + + This class cannot be instantiated directly using ``__init__()`` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + def for_model(cls, model_type: str, *args, **kwargs): + if model_type in CONFIG_MAPPING: + config_class = CONFIG_MAPPING[model_type] + return config_class(*args, **kwargs) + raise ValueError( + f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" + ) + + @classmethod + @replace_list_option_in_docstrings() + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a pretrained model configuration. + + The configuration class to instantiate is selected based on the :obj:`model_type` property of the config object + that is loaded, or when it's missing, by falling back to using pattern matching on + :obj:`pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + Can be either: + + - A string, the `model id` of a pretrained model configuration hosted inside a model repo on + huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or + namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing a configuration file saved using the + :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the + :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. + - A path or url to a saved configuration JSON `file`, e.g., + ``./my_model_directory/configuration.json``. + cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. 
+ resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Will attempt to resume the download if such a + file exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): + If :obj:`False`, then this function returns just the final configuration object. + + If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` + is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., + the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. + kwargs(additional keyword arguments, `optional`): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the ``return_unused_kwargs`` keyword parameter. + + Examples:: + + >>> from transformers import AutoConfig + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained('bert-base-uncased') + + >>> # Download configuration from huggingface.co (user-uploaded) and cache. + >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased') + + >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`). + >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/') + + >>> # Load a specific configuration file. + >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') + + >>> # Change some config attributes when loading a pretrained config. + >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) + >>> config.output_attentions + True + >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True) + >>> config.output_attentions + True + >>> config.unused_kwargs + {'foo': False} + """ + kwargs["_from_auto"] = True + config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict: + config_class = CONFIG_MAPPING[config_dict["model_type"]] + return config_class.from_dict(config_dict, **kwargs) + else: + # Fallback: use pattern matching on the string. + for pattern, config_class in CONFIG_MAPPING.items(): + if pattern in str(pretrained_model_name_or_path): + return config_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized model in {pretrained_model_name_or_path}. 
" + f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings " + f"in its name: {', '.join(CONFIG_MAPPING.keys())}" + ) diff --git a/fastNLP/transformers/torch/models/auto/tokenization_auto.py b/fastNLP/transformers/torch/models/auto/tokenization_auto.py new file mode 100644 index 00000000..e275579f --- /dev/null +++ b/fastNLP/transformers/torch/models/auto/tokenization_auto.py @@ -0,0 +1,199 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Auto Tokenizer class. """ + +from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +from ...file_utils import ( + is_sentencepiece_available, + is_tokenizers_available, +) + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. + TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), + ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), + ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), + ( + "t5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mt5", + ( + "MT5Tokenizer" if is_sentencepiece_available() else None, + "MT5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), + ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "albert", + ( + "AlbertTokenizer" if is_sentencepiece_available() else None, + "AlbertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "camembert", + ( + "CamembertTokenizer" if is_sentencepiece_available() else None, + "CamembertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "pegasus", + ( + "PegasusTokenizer" if is_sentencepiece_available() else None, + "PegasusTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mbart", + ( + "MBartTokenizer" if is_sentencepiece_available() else None, + "MBartTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "xlm-roberta", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), + ("blenderbot", ("BlenderbotTokenizer", None)), + ("bart", ("BartTokenizer", "BartTokenizerFast")), + ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() 
else None)), + ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ( + "reformer", + ( + "ReformerTokenizer" if is_sentencepiece_available() else None, + "ReformerTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), + ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), + ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), + ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), + ( + "dpr", + ( + "DPRQuestionEncoderTokenizer", + "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "squeezebert", + ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), + ), + ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)), + ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("transfo-xl", ("TransfoXLTokenizer", None)), + ( + "xlnet", + ( + "XLNetTokenizer" if is_sentencepiece_available() else None, + "XLNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("flaubert", ("FlaubertTokenizer", None)), + ("xlm", ("XLMTokenizer", None)), + ("ctrl", ("CTRLTokenizer", None)), + ("fsmt", ("FSMTTokenizer", None)), + ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), + ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), + ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)), + ("rag", ("RagTokenizer", None)), + ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), + ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), + ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), + ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), + ("prophetnet", ("ProphetNetTokenizer", None)), + ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), + ("tapas", ("TapasTokenizer", None)), + ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "big_bird", + ( + "BigBirdTokenizer" if is_sentencepiece_available() else None, + "BigBirdTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), + ("hubert", ("Wav2Vec2CTCTokenizer", None)), + ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("luke", ("LukeTokenizer", None)), + ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), + ("canine", ("CanineTokenizer", None)), + ("bertweet", ("BertweetTokenizer", None)), + ("bert-japanese", ("BertJapaneseTokenizer", None)), + ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), + ("byt5", ("ByT5Tokenizer", None)), + ( 
+ "cpm", + ( + "CpmTokenizer" if is_sentencepiece_available() else None, + "CpmTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), + ("phobert", ("PhobertTokenizer", None)), + ( + "barthez", + ( + "BarthezTokenizer" if is_sentencepiece_available() else None, + "BarthezTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mbart50", + ( + "MBart50Tokenizer" if is_sentencepiece_available() else None, + "MBart50TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "rembert", + ( + "RemBertTokenizer" if is_sentencepiece_available() else None, + "RemBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "clip", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ] + ) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bart/__init__.py b/fastNLP/transformers/torch/models/bart/__init__.py new file mode 100644 index 00000000..127f95b6 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/__init__.py @@ -0,0 +1,20 @@ +__all__ = [ + "BartConfig", + "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", + + "BART_PRETRAINED_MODEL_ARCHIVE_LIST", + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPretrainedModel", + "PretrainedBartModel", + + "BartTokenizer", +] + +from .configuration_bart import BartConfig, BART_PRETRAINED_CONFIG_ARCHIVE_MAP +from .tokenization_bart import BartTokenizer +from .modeling_bart import BartForCausalLM, BartForConditionalGeneration, BartModel, BartForQuestionAnswering, \ + BartForSequenceClassification, BartPretrainedModel, PretrainedBartModel, BART_PRETRAINED_MODEL_ARCHIVE_LIST \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bart/configuration_bart.py b/fastNLP/transformers/torch/models/bart/configuration_bart.py new file mode 100644 index 00000000..3b52bc81 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/configuration_bart.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BART model configuration """ +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.core.log import logger + +__all__ = [ + "BartConfig", + "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", +] + +BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", + # See all BART models at https://huggingface.co/models?filter=bart +} + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to + instantiate a BART model according to the specified arguments, defining the model architecture. 
Instantiating a + configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large + `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50265): + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or + :class:`~transformers.TFBartModel`. + d_model (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of encoder layers. + decoder_layers (:obj:`int`, `optional`, defaults to 12): + Number of decoder layers. + encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + dropout (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the encoder. See the `LayerDrop paper `__ for more details. + decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0): + The LayerDrop probability for the decoder. See the `LayerDrop paper `__ for more details. + scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). + num_labels: (:obj:`int`, `optional`, defaults to 3): + The number of labels to use in :class:`~transformers.BartForSequenceClassification`. 
+ forced_eos_token_id (:obj:`int`, `optional`, defaults to 2): + The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to + :obj:`eos_token_id`. + + Example:: + + >>> from transformers import BartModel, BartConfig + + >>> # Initializing a BART facebook/bart-large style configuration + >>> configuration = BartConfig() + + >>> # Initializing a model from the facebook/bart-large style configuration + >>> model = BartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + num_labels=3, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + is_encoder_decoder=True, + decoder_start_token_id=2, + forced_eos_token_id=2, + **kwargs + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + # ensure backward compatibility for BART CNN models + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + logger.warn( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions." + "The config can simply be saved and uploaded again to be fixed." + ) diff --git a/fastNLP/transformers/torch/models/bart/modeling_bart.py b/fastNLP/transformers/torch/models/bart/modeling_bart.py new file mode 100644 index 00000000..7219f49a --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/modeling_bart.py @@ -0,0 +1,1834 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BART model. """ +import copy +import math +import random +from typing import Optional, Tuple + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import PreTrainedModel +from .configuration_bart import BartConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, MSELoss, Module, Embedding +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module, DummyClass as Embedding + +__all__ = [ + "BART_PRETRAINED_MODEL_ARCHIVE_LIST", + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPretrainedModel", + "PretrainedBartModel", +] + +_CHECKPOINT_FOR_DOC = "facebook/bart-large" +_CONFIG_FOR_DOC = "BartConfig" +_TOKENIZER_FOR_DOC = "BartTokenizer" + + +BART_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "facebook/bart-large", + # See all BART models at https://huggingface.co/models?filter=bart +] + + +def shift_tokens_right(input_ids: "torch.Tensor", pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: "torch.Size", dtype: "torch.dtype", past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: "torch.Tensor", dtype: "torch.dtype", tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
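    A minimal worked sketch of the intended behaviour (illustrative only; the values below are
    made up and simply follow the expansion/inversion described above)::

        >>> mask = torch.tensor([[1, 1, 0]])                  # one padded key position
        >>> additive = _expand_mask(mask, torch.float32)      # shape (1, 1, 3, 3)
        >>> bool((additive == 0).sum() == 6)                  # attended positions contribute 0.0
        True
        >>> bool((additive[..., -1] == torch.finfo(torch.float32).min).all())   # padded key is masked out
        True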
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + + +class BartLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: "torch.Size", past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class BartAttention(Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." + self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: "torch.Tensor", seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: "torch.Tensor", + key_value_states: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + output_attentions: bool = False, + ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = 
torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. 
+ # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BartEncoderLayer(Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: "torch.Tensor", + layer_head_mask: "torch.Tensor", + output_attentions: bool = False, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. 
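        Shape-level sketch (illustrative only; the tiny hyper-parameters are made up, and the layer
        is simply expected to preserve the shape of its input)::

            >>> config = BartConfig(d_model=16, encoder_attention_heads=4, encoder_ffn_dim=32)
            >>> layer = BartEncoderLayer(config)
            >>> hidden = torch.randn(2, 5, 16)
            >>> (out,) = layer(hidden, attention_mask=None, layer_head_mask=None)
            >>> out.shape
            torch.Size([2, 5, 16])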
+ """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BartDecoderLayer(Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + encoder_hidden_states: Optional["torch.Tensor"] = None, + encoder_attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + cross_attn_layer_head_mask: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (:obj:`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. 
+ past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BartClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: "torch.Tensor"): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BartPretrainedModel(PreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = 
[r"encoder\.version", r"decoder\.version"] + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (BartDecoder, BartEncoder)): + module.gradient_checkpointing = value + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class PretrainedBartModel(BartPretrainedModel): + def __init_subclass__(self): + logger.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.", + FutureWarning, + ) + + +BART_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BartConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BART_GENERATION_EXAMPLE = r""" + Summarization example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn') + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn') + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt') + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True) + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]) + + Mask filling example:: + + >>> from transformers import BartTokenizer, BartForConditionalGeneration + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') + >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. 
+ + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are decoder input IDs? <../glossary.html#decoder-input-ids>`__ + + Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + + If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in ``[0, + 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. 
+ past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors + of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +class BartEncoder(BartPretrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + :class:`BartEncoderLayer`. 
+ + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.init_weights() + self.gradient_checkpointing = False + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(encoder_layers, encoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
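        Shape-level sketch (illustrative only; the small configuration below is made up)::

            >>> config = BartConfig(vocab_size=100, d_model=16, encoder_layers=2, decoder_layers=2,
            ...                     encoder_attention_heads=4, decoder_attention_heads=4,
            ...                     encoder_ffn_dim=32, decoder_ffn_dim=32)
            >>> encoder = BartEncoder(config)
            >>> out = encoder(input_ids=torch.tensor([[0, 5, 7, 2]]))
            >>> out.last_hidden_state.shape
            torch.Size([1, 4, 16])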
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(encoder_layer), + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BartDecoder(BartPretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. 
Each layer is a :class:`BartDecoderLayer` + + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.init_weights() + self.gradient_checkpointing = False + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. 
Mask values + selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 + tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + assert attn_mask.size()[0] == ( + len(self.layers) + ), f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class BartModel(BartPretrainedModel): + def __init__(self, config: BartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, 
self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BART Model with a language modeling head. 
Can be used for summarization.", BART_START_DOCSTRING +) +class BartForConditionalGeneration(BartPretrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"] + + def __init__(self, config: BartConfig): + super().__init__(config) + self.model = BartModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> "nn.Embedding": + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. 
+ + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def prepare_decoder_input_ids_from_labels(self, labels: "torch.Tensor"): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
+ """, + BART_START_DOCSTRING, +) +class BartForSequenceClassification(BartPretrainedModel): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self.classification_head = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.classification_head.dense) + self.model._init_weights(self.classification_head.out_proj) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + if self.config.num_labels == 1: + # regression + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model 
with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BART_START_DOCSTRING, +) +class BartForQuestionAnswering(BartPretrainedModel): + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.model._init_weights(self.qa_outputs) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartDecoderWrapper(BartPretrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the :class:`~transformers.EncoderDecoderModel` framework. 
+ """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BartDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class BartForCausalLM(BartPretrainedModel): + def __init__(self, config): + super().__init__(config) + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + self.model = BartDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.init_weights() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using :class:`~transformers.BartTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (:obj:`torch.Tensor` of shape :obj:`(decoder_layers, decoder_attention_heads)`, `optional`): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+ + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 + tensors of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional + tensors of shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two + additional tensors are only required when the model is used as a decoder in a Sequence to Sequence + model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential + decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids`` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., + config.vocab_size]``. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + + Returns: + + Example:: + + >>> from transformers import BartTokenizer, BartForCausalLM + + >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large') + >>> model = BartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." 
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + """ + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + if past: + input_ids = input_ids[:, -1:] + # first step, decoder_cached_states are empty + return { + "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed + "attention_mask": attention_mask, + "past_key_values": past, + "use_cache": use_cache, + } + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past diff --git a/fastNLP/transformers/torch/models/bart/tokenization_bart.py b/fastNLP/transformers/torch/models/bart/tokenization_bart.py new file mode 100644 index 00000000..fe6c1d04 --- /dev/null +++ b/fastNLP/transformers/torch/models/bart/tokenization_bart.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
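+
+# NOTE (editorial sketch, not part of the upstream transformers file): a minimal
+# usage example, assuming the standard ``from_pretrained`` API of transformers
+# 4.11.3 is preserved unchanged in this copy:
+#
+#     from fastNLP.transformers.torch.models.bart.tokenization_bart import BartTokenizer
+#
+#     tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+#     batch = tokenizer(["Hello world!"], return_tensors="pt", padding=True)
+#     # ``batch["input_ids"]`` and ``batch["attention_mask"]`` can be fed to BartModel.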
+ +from ..roberta.tokenization_roberta import RobertaTokenizer +from fastNLP.core.log import logger + +__all__ = [ + "BartTokenizer", +] + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +# See all BART models at https://huggingface.co/models?filter=bart +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json", + }, + "merges_file": { + "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt", + "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", + "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt", + "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt", + "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt", + "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "facebook/bart-base": 1024, + "facebook/bart-large": 1024, + "facebook/bart-large-mnli": 1024, + "facebook/bart-large-cnn": 1024, + "facebook/bart-large-xsum": 1024, + "yjernite/bart_eli5": 1024, +} + + +class BartTokenizer(RobertaTokenizer): + r""" + Construct a BART tokenizer. + + :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass + :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization + parameters and other methods. 
+ """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/fastNLP/transformers/torch/models/bert/__init__.py b/fastNLP/transformers/torch/models/bert/__init__.py new file mode 100644 index 00000000..0edc1d6c --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/__init__.py @@ -0,0 +1,27 @@ +__all__ = [ + "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BertConfig", + + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + + "BasicTokenizer", + "BertTokenizer", + "WordpieceTokenizer", +] + +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer +from .modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForPreTraining, \ + BertForNextSentencePrediction, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, \ + BertLayer, BertLMHeadModel, BertModel, BertPreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/bert/configuration_bert.py b/fastNLP/transformers/torch/models/bert/configuration_bert.py new file mode 100644 index 00000000..f8be6082 --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/configuration_bert.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" BERT model configuration """ + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig +from fastNLP.core.log import logger + +__all__ = [ + "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", + "BertConfig", +] + +BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json", + "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json", + "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json", + "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json", + "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json", + "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json", + "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json", + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json", + # See all BERT models at https://huggingface.co/models?filter=bert +} + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a + :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the BERT `bert-base-uncased `__ architecture. 
+ + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + hidden_size (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (:obj:`int`, `optional`, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. + hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (:obj:`int`, `optional`, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (:obj:`int`, `optional`, defaults to 2): + The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or + :class:`~transformers.TFBertModel`. + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`): + Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`, + :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on + :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.) + `__. For more information on :obj:`"relative_key_query"`, please refer to + `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.) + `__. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if ``config.is_decoder=True``. + classifier_dropout (:obj:`float`, `optional`): + The dropout ratio for the classification head. 
+ + Examples:: + + >>> from transformers import BertModel, BertConfig + + >>> # Initializing a BERT bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model from the bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout diff --git a/fastNLP/transformers/torch/models/bert/modeling_bert.py b/fastNLP/transformers/torch/models/bert/modeling_bert.py new file mode 100644 index 00000000..b95da0df --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/modeling_bert.py @@ -0,0 +1,1806 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. 
""" + + +import math +from dataclasses import dataclass +from typing import Optional, Tuple + +from packaging import version + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .configuration_bert import BertConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +__all__ = [ + "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", +] + +_CHECKPOINT_FOR_DOC = "bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" +_TOKENIZER_FOR_DOC = "BertTokenizer" + +BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "bert-base-uncased", + "bert-large-uncased", + "bert-base-cased", + "bert-large-cased", + "bert-base-multilingual-uncased", + "bert-base-multilingual-cased", + "bert-base-chinese", + "bert-base-german-cased", + "bert-large-uncased-whole-word-masking", + "bert-large-cased-whole-word-masking", + "bert-large-uncased-whole-word-masking-finetuned-squad", + "bert-large-cased-whole-word-masking-finetuned-squad", + "bert-base-cased-finetuned-mrpc", + "bert-base-german-dbmdz-cased", + "bert-base-german-dbmdz-uncased", + "cl-tohoku/bert-base-japanese", + "cl-tohoku/bert-base-japanese-whole-word-masking", + "cl-tohoku/bert-base-japanese-char", + "cl-tohoku/bert-base-japanese-char-whole-word-masking", + "TurkuNLP/bert-base-finnish-cased-v1", + "TurkuNLP/bert-base-finnish-uncased-v1", + "wietsedv/bert-base-dutch-cased", + # See all BERT models at https://huggingface.co/models?filter=bert +] + +class BertEmbeddings(Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, 
"position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a 
cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
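+        # (the additive mask precomputed in BertModel.forward via get_extended_attention_mask
+        # holds large negative values at masked positions, so the softmax below drives
+        # their attention weights toward zero)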
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(Module): + def __init__(self, config): + super().__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = 
nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + 
attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = BertConfig + base_model_prefix = "bert" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, BertEncoder): + module.gradient_checkpointing = value + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of :class:`~transformers.BertForPreTraining`. + + Args: + loss (`optional`, returned when ``labels`` is provided, ``torch.FloatTensor`` of shape :obj:`(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional["torch.FloatTensor"] = None + prediction_logits: "torch.FloatTensor" = None + seq_relationship_logits: "torch.FloatTensor" = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+ Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.BertTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need `__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. 
To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
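A minimal usage sketch for this forward pass, assuming the `bert-base-uncased` weights are available and that `fastNLP.transformers.torch.models.bert` re-exports these classes (otherwise import them from `modeling_bert` / `tokenization_bert` directly):

    >>> from fastNLP.transformers.torch.models.bert import BertTokenizer, BertModel
    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    >>> model = BertModel.from_pretrained('bert-base-uncased')
    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> outputs.last_hidden_state.shape    # (batch_size, sequence_length, hidden_size)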
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + next_sentence_label=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`): + Labels for computing the masked language modeling loss. 
Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForPreTraining.from_pretrained('bert-base-uncased') + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning. 
""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> config.is_decoder = True + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + assert self.config.pad_token_id is not None, "The PAD token should be defined for generation" + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top. 
""", + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see ``input_ids`` docstring). Indices should be in ``[0, 1]``: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example:: + + >>> from transformers import BertTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + """ + + if "next_sentence_label" in kwargs: + logger.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
+ """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) diff --git a/fastNLP/transformers/torch/models/bert/tokenization_bert.py b/fastNLP/transformers/torch/models/bert/tokenization_bert.py new file mode 
100644 index 00000000..26edd70d --- /dev/null +++ b/fastNLP/transformers/torch/models/bert/tokenization_bert.py @@ -0,0 +1,558 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Bert.""" + + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from fastNLP.transformers.torch.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from fastNLP.core.log import logger + +__all__ = [ + "BasicTokenizer", + "BertTokenizer", + "WordpieceTokenizer", +] + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt", + "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt", + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + 
"bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. 
It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: ``[CLS] X [SEP]`` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 
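+
+            For example, for a pair of sequences of lengths 2 and 3 (and ``already_has_special_tokens=False``),
+            the returned mask is ``[1, 0, 0, 1, 0, 0, 0, 1]``: ``[CLS]``, two sequence tokens, ``[SEP]``,
+            three sequence tokens, ``[SEP]``.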
+ """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + :: + + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. 
+ + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
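+        # For example, ord("中") == 0x4E2D falls inside the 0x4E00-0x9FFF block checked below, so this
+        # method returns True for it, while Hangul (e.g. U+AC00) and Katakana (e.g. U+30A2) code points
+        # fall outside every listed range and return False.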
+ if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens diff --git a/fastNLP/transformers/torch/models/cpt/__init__.py b/fastNLP/transformers/torch/models/cpt/__init__.py new file mode 100644 index 00000000..58d9f918 --- /dev/null +++ b/fastNLP/transformers/torch/models/cpt/__init__.py @@ -0,0 +1,12 @@ +__all__ = [ + "CPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CPTForConditionalGeneration", + "CPTForSequenceClassification", + "CPTForMaskedLM", + "CPTForQuestionAnswering", + "CPTModel", + "CPTPretrainedModel", +] + +from .modeling_cpt import CPT_PRETRAINED_MODEL_ARCHIVE_LIST, CPTForConditionalGeneration, CPTForSequenceClassification, \ + CPTForMaskedLM, CPTForQuestionAnswering, CPTModel, CPTPretrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/cpt/modeling_cpt.py b/fastNLP/transformers/torch/models/cpt/modeling_cpt.py new file mode 100644 index 00000000..2910cc26 --- /dev/null +++ b/fastNLP/transformers/torch/models/cpt/modeling_cpt.py @@ -0,0 +1,1489 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CPT model. modified from transformers==4.4.1""" +import math +import random +from typing import Optional, Tuple + +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import PreTrainedModel +from ..bart import BartConfig as CPTConfig +from ..bert import BertModel, BertConfig +from fastNLP.core.log import logger +from fastNLP.envs.imports import _NEED_IMPORT_TORCH + +if _NEED_IMPORT_TORCH: + import torch + import torch.nn.functional as F + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, LayerNorm, Module, Embedding +else: + from fastNLP.core.utils.dummy_class import( + DummyClass as Module, + DummyClass as Embedding + ) + +__all__ = [ + "CPT_PRETRAINED_MODEL_ARCHIVE_LIST", + "CPTForConditionalGeneration", + "CPTForSequenceClassification", + "CPTForMaskedLM", + "CPTForQuestionAnswering", + "CPTModel", + "CPTPretrainedModel", +] + +_CHECKPOINT_FOR_DOC = "fnlp/cpt-large" +_CONFIG_FOR_DOC = "CPTConfig" +_TOKENIZER_FOR_DOC = "CPTTokenizer" + + +CPT_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "fnlp/cpt-large", +] + + +def shift_tokens_right(input_ids: "torch.Tensor", pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: "torch.Size", dtype: "torch.dtype", past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), float("-inf")) + mask_cond = torch.arange(mask.size(-1)) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +def _expand_mask(mask: "torch.Tensor", dtype: "torch.dtype", tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. 
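+
+    For example, a padding mask ``[[1, 1, 0]]`` with ``dtype=torch.float32`` is expanded to shape
+    ``[1, 1, tgt_len, 3]``, where the two attended positions become ``0.0`` and the masked position is
+    filled with ``torch.finfo(torch.float32).min``.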
+ """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min) + +def attention_mask_func(attention_scores, attention_mask): + return attention_scores + attention_mask + +def init_method(std): + def init_(tensor): + return torch.nn.init.normal_(tensor, mean=0.0, std=std) + + return init_ + +class CPTLearnedPositionalEmbedding(Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # CPT is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models dont have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids_shape: "torch.Size", past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions + self.offset) + + +class CPTAttention(Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + assert ( + self.head_dim * num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {num_heads})." 
+ self.scaling = self.head_dim ** -0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + + def _shape(self, tensor: "torch.Tensor", seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: "torch.Tensor", + key_value_states: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + output_attentions: bool = False, + ) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + assert attn_weights.size() == ( + bsz * self.num_heads, + tgt_len, + src_len, + ), f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + + if attention_mask is not None: + assert attention_mask.size() == ( + bsz, + 1, + tgt_len, + src_len, + ), f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = F.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + assert layer_head_mask.size() == ( + self.num_heads, + ), f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}" + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit akward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + # with mpu.get_cuda_rng_tracker().fork(): + attn_probs = F.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + assert attn_output.size() == ( + bsz * self.num_heads, + tgt_len, + self.head_dim, + ), f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}" + + attn_output = ( + attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + .transpose(1, 2) + .reshape(bsz, tgt_len, embed_dim) + ) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + +class CPTDecoderLayer(Module): + def __init__(self, config: CPTConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = CPTAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = LayerNorm(self.embed_dim) + self.encoder_attn = CPTAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + ) + self.encoder_attn_layer_norm = LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = 
LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + encoder_hidden_states: Optional["torch.Tensor"] = None, + encoder_attention_mask: Optional["torch.Tensor"] = None, + layer_head_mask: Optional["torch.Tensor"] = None, + encoder_layer_head_mask: Optional["torch.Tensor"] = None, + past_key_value: Optional[Tuple["torch.Tensor"]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ): + """ + Args: + hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)` + attention_mask (:obj:`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (:obj:`torch.FloatTensor`): cross attention input to the layer of shape `(seq_len, batch, embed_dim)` + encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (:obj:`torch.FloatTensor`): mask for attention heads in a given layer of size + `(config.encoder_attention_heads,)`. + encoder_layer_head_mask (:obj:`torch.FloatTensor`): mask for encoder attention heads in a given layer of + size `(config.encoder_attention_heads,)`. + past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=encoder_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = 
F.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class CPTClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: "torch.Tensor"): + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class CPTPretrainedModel(PreTrainedModel): + config_class = CPTConfig + base_model_prefix = "model" + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + +CPT_START_DOCSTRING = r""" + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + Parameters: + config (:class:`~transformers.CPTConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +CPT_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? 
<../glossary.html#attention-mask>`__ + decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Indices of decoder input sequence tokens in the vocabulary. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + `What are input IDs? <../glossary.html#input-ids>`__ + CPT uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If + :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see + :obj:`past_key_values`). + For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no + :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to + the right for denoising pre-training following the paper. + decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`): + Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will + also be used by default. + If you want to change padding behavior, you should read :func:`modeling_cpt._prepare_decoder_inputs` and + modify to your needs. See diagram 1 in `the paper `__ for more + information on the default strategy. + head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + decoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`): + Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`: + :obj:`attentions`) :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`, + `optional`) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. 
+ decoder_inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, target_sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded + representation. If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_inputs_embeds` + have to be input (see :obj:`past_key_values`). This is useful if you want more control over how to convert + :obj:`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset, :obj:`decoder_inputs_embeds` + takes the value of :obj:`inputs_embeds`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + +class CPTDecoder(CPTPretrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a :class:`CPTDecoderLayer` + Args: + config: CPTConfig + embed_tokens (torch.nn.Embedding): output embedding + """ + + def __init__(self, config: CPTConfig, embed_tokens: Optional["nn.Embedding"] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = CPTLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([CPTDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = LayerNorm(config.d_model) + + self.init_weights() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length + ).to(self.device) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + encoder_head_mask=None, + 
past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + Indices can be obtained using :class:`~transformers.CPTTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` + for details. + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, encoder_sequence_length)`, `optional`): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + `What are attention masks? <../glossary.html#attention-mask>`__ + head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + encoder_head_mask (:obj:`torch.Tensor` of shape :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention + on hidden heads. Mask values selected in ``[0, 1]``: + - 1 indicates the head is **not masked**, + - 0 indicates the heas is **masked**. + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last + :obj:`decoder_input_ids` (those that don't have their past key value states given to this model) of + shape :obj:`(batch_size, 1)` instead of all :obj:`decoder_input_ids`` of shape :obj:`(batch_size, + sequence_length)`. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded + representation. This is useful if you want more control over how to convert :obj:`input_ids` indices + into associated vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under + returned tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. 
See ``hidden_states`` under returned tensors + for more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + assert head_mask.size()[0] == ( + len(self.layers) + ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}." + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + if self.training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, use_cache) + + return custom_forward + + # layer_outputs = mpu.checkpoint( + layer_outputs = torch.utils.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + encoder_head_mask[idx] if encoder_head_mask is not None else None, + None, + ) + else: + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + encoder_layer_head_mask=(encoder_head_mask[idx] if encoder_head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare CPT Model outputting raw hidden-states without any specific head on top.", + CPT_START_DOCSTRING, +) +class CPTModel(CPTPretrainedModel): + def __init__(self, config: CPTConfig): + super().__init__(config) + encoder_config = BertConfig( + vocab_size=config.vocab_size, + hidden_size=config.d_model, + num_hidden_layers=config.encoder_layers, + num_attention_heads=config.encoder_attention_heads, + intermediate_size=config.encoder_ffn_dim, + hidden_dropout_prob=config.activation_dropout, + attention_probs_dropout_prob=config.attention_dropout, + ) + config.vocab_size = encoder_config.vocab_size + self.encoder = BertModel(encoder_config, add_pooling_layer=False) + self.shared = self.encoder.get_input_embeddings() + self.decoder = CPTDecoder(config, self.shared) + self.num_decoder_layers = config.decoder_layers + self.init_weights() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.set_input_embeddings(self.shared) + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + class _Encoder(torch.nn.Module): + def __init__(self, encoder): + super().__init__() + self.encoder = encoder + + def forward(self, *args, **kwargs): + kwargs['output_hidden_states'] = True + return self.encoder(*args, **kwargs) + return _Encoder(self.encoder) + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + 
attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + + # different to other models, CPT automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + # mpu.reset_checkpointed_activations_memory_buffer() + use_cache = False + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=torch.ones_like(input_ids), + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and isinstance(encoder_outputs, (tuple, list)): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + if isinstance(encoder_outputs, (torch.Tensor)): + encoder_hidden_states = encoder_outputs + else: + encoder_hidden_states = encoder_outputs[1][-self.num_decoder_layers - 1] + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + encoder_head_mask=head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state if isinstance(encoder_outputs, dict) else None, + encoder_hidden_states=encoder_outputs.hidden_states if isinstance(encoder_outputs, dict) else None, + encoder_attentions=encoder_outputs.attentions if isinstance(encoder_outputs, dict) else None, + ) + + +@add_start_docstrings( + "The CPT Model with a language modeling head. 
Can be used for summarization.", CPT_START_DOCSTRING +) +class CPTForConditionalGeneration(CPTPretrainedModel): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + + def __init__(self, config): + super().__init__(config) + self.model = CPTModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings(self, new_num_tokens: int) -> "nn.Embedding": + new_embeddings = super().resize_token_embeddings(new_num_tokens) + self._resize_final_logits_bias(new_num_tokens) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., + config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``. 
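A minimal generation sketch for the conditional-generation head described above. It is illustrative only: "path/to/cpt-checkpoint" is a placeholder, and it assumes the cpt and bert subpackages created by this patch re-export these classes (CPT ships with a BERT-style tokenizer).

from fastNLP.transformers.torch.models.bert import BertTokenizer
from fastNLP.transformers.torch.models.cpt import CPTForConditionalGeneration

tokenizer = BertTokenizer.from_pretrained("path/to/cpt-checkpoint")   # placeholder path
model = CPTForConditionalGeneration.from_pretrained("path/to/cpt-checkpoint")

inputs = tokenizer("北京是中国的首都，也是一座历史悠久的城市。", return_tensors="pt")
# decoder_input_ids are built internally by shifting the target right when not provided
summary_ids = model.generate(inputs["input_ids"], attention_mask=inputs["attention_mask"],
                             num_beams=4, max_length=32)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True))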
+ Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past=None, + attention_mask=None, + head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs + ): + # cut decoder_input_ids if past is used + if past is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + return { + "input_ids": None, # encoder_outputs is defined. 
input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "head_mask": head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + @staticmethod + def _expand_inputs_for_generation( + input_ids: "torch.LongTensor", + expand_size: int = 1, + is_encoder_decoder: bool = False, + attention_mask: "torch.LongTensor" = None, + encoder_outputs = None, + **model_kwargs, + ): + expanded_return_idx = ( + torch.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1).to(input_ids.device) + ) + input_ids = input_ids.index_select(0, expanded_return_idx) + + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + model_kwargs["token_type_ids"] = token_type_ids.index_select(0, expanded_return_idx) + + if attention_mask is not None: + model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx) + + if is_encoder_decoder: + assert encoder_outputs is not None + device = encoder_outputs.last_hidden_state.device + encoder_outputs["hidden_states"] = tuple(h.index_select(0, expanded_return_idx.to(device)) \ + for h in encoder_outputs["hidden_states"]) + model_kwargs["encoder_outputs"] = encoder_outputs + return input_ids, model_kwargs + + def prepare_decoder_input_ids_from_labels(self, labels: "torch.Tensor"): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past, beam_idx): + reordered_past = () + for layer_past in past: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + CPT model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. 
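The cls_mode argument of the class below selects which representation feeds the classification head (1: encoder, 2: decoder, 3: both concatenated). A construction sketch with a placeholder checkpoint path:

from fastNLP.transformers.torch.models.cpt import CPTForSequenceClassification

# cls_mode=3 concatenates the encoder [CLS] state with the decoder end-of-sequence state
model = CPTForSequenceClassification.from_pretrained("path/to/cpt-checkpoint",
                                                     num_labels=2, cls_mode=3)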
+ """, + CPT_START_DOCSTRING, +) +class CPTForSequenceClassification(CPTPretrainedModel): + def __init__(self, config: CPTConfig, cls_mode=1, **kwargs): + super().__init__(config, **kwargs) + self.model = CPTModel(config) + cls_mode = getattr(config, 'cls_mode', cls_mode) + if cls_mode == 1: + logger.info('Encoder for classification.') + cls_dim = config.d_model + elif cls_mode == 2: + logger.info('Decoder for classification.') + cls_dim = config.d_model + elif cls_mode == 3: + logger.info('Both encoder & decoder for classification.') + cls_dim = config.d_model * 2 + else: + raise NotImplementedError + + self.cls_head = CPTClassificationHead( + cls_dim, + cls_dim, + config.num_labels, + config.classifier_dropout, + ) + self.model._init_weights(self.cls_head.dense) + self.model._init_weights(self.cls_head.out_proj) + self.cls_mode = cls_mode + config.cls_mode = cls_mode + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + enc_rep = enc_hidden_states[:, 0] + + eos_mask = input_ids.eq(self.config.eos_token_id) + + if len(torch.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + dec_rep = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + + if self.cls_mode == 1: + logits = self.cls_head(enc_rep) + elif self.cls_mode == 2: + logits = self.cls_head(dec_rep) + elif self.cls_mode == 3: + rep = torch.cat([enc_rep, dec_rep], dim=-1) + logits = self.cls_head(rep) + else: + raise NotImplementedError + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + CPT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). 
+ """, + CPT_START_DOCSTRING, +) +class CPTForQuestionAnswering(CPTPretrainedModel): + def __init__(self, config: CPTConfig, cls_mode=1, **kwargs): + super().__init__(config, **kwargs) + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = CPTModel(config) + + cls_mode = getattr(config, 'cls_mode', cls_mode) + if cls_mode == 1: + logger.info('Encoder for classification.') + cls_dim = config.d_model + elif cls_mode == 2: + logger.info('Decoder for classification.') + cls_dim = config.d_model + elif cls_mode == 3: + logger.info('Both encoder & decoder for classification.') + cls_dim = config.d_model * 2 + else: + raise NotImplementedError + + self.qa_outputs = nn.Linear(cls_dim, config.num_labels) + self.model._init_weights(self.qa_outputs) + + self.cls_mode = cls_mode + config.cls_mode = cls_mode + + @add_start_docstrings_to_model_forward(CPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + start_positions=None, + end_positions=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + + if self.cls_mode == 1: + logits = self.qa_outputs(enc_hidden_states) + elif self.cls_mode == 2: + logits = self.qa_outputs(hidden_states) + elif self.cls_mode == 3: + rep = torch.cat([enc_hidden_states, hidden_states], dim=-1) + logits = self.qa_outputs(rep) + else: + raise NotImplementedError + + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class CPTForMaskedLM(CPTPretrainedModel): + _keys_to_ignore_on_load_missing = [ + r"final_logits_bias", + r"encoder\.version", + r"decoder\.version", + r"lm_head\.weight", + ] + def __init__(self, config, **kwargs): + super().__init__(config) + self.model = CPTModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + self.init_weights() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def get_output_embeddings(self): + return self.lm_head + + def forward( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + encoder_outputs=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + 
use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + enc_hidden_states = outputs.encoder_last_hidden_state + + dec_logits = self.lm_head(hidden_states) + self.final_logits_bias + enc_logits = self.lm_head(enc_hidden_states) + self.final_logits_bias + + if not return_dict: + logits = (enc_logits, dec_logits) + output = (logits,) + outputs[1:] + return output + + return Seq2SeqLMOutput( + loss=None, + logits=(enc_logits, dec_logits), + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/gpt2/__init__.py b/fastNLP/transformers/torch/models/gpt2/__init__.py new file mode 100644 index 00000000..70f24bfa --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/__init__.py @@ -0,0 +1,19 @@ +__all__ = [ + "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPT2Config", + + "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPT2DoubleHeadsModel", + "GPT2ForSequenceClassification", + "GPT2ForTokenClassification", + "GPT2LMHeadModel", + "GPT2Model", + "GPT2PreTrainedModel", + + "GPT2Tokenizer", +] + +from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config +from .tokenization_gpt2 import GPT2Tokenizer +from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, \ + GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, GPT2PreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py b/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py new file mode 100644 index 00000000..c0794e5a --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/configuration_gpt2.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
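As the gpt2 __init__.py above shows, the migrated GPT-2 classes are imported from the fastNLP namespace rather than from transformers, for example:

from fastNLP.transformers.torch.models.gpt2 import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel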
+""" OpenAI GPT-2 configuration """ + +from fastNLP.transformers.torch.configuration_utils import PretrainedConfig + +__all__ = [ + "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GPT2Config", +] + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", +} + + +class GPT2Config(PretrainedConfig): + """ + This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a + :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments, + defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration + to that of the GPT-2 `small `__ architecture. + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + + Args: + vocab_size (:obj:`int`, `optional`, defaults to 50257): + Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the + :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or + :class:`~transformers.TFGPT2Model`. + n_positions (:obj:`int`, `optional`, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + n_ctx (:obj:`int`, `optional`, defaults to 1024): + Dimensionality of the causal mask (usually same as n_positions). + n_embd (:obj:`int`, `optional`, defaults to 768): + Dimensionality of the embeddings and hidden states. + n_layer (:obj:`int`, `optional`, defaults to 12): + Number of hidden layers in the Transformer encoder. + n_head (:obj:`int`, `optional`, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + n_inner (:obj:`int`, `optional`, defaults to None): + Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd + activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`): + Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`. + resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + embd_pdrop (:obj:`int`, `optional`, defaults to 0.1): + The dropout ratio for the embeddings. + attn_pdrop (:obj:`float`, `optional`, defaults to 0.1): + The dropout ratio for the attention. + layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): + The epsilon to use in the layer normalization layers + initializer_range (:obj:`float`, `optional`, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Has to be one of the following options: + + - :obj:`"last"`: Take the last token hidden state (like XLNet). 
+ - :obj:`"first"`: Take the first token hidden state (like BERT). + - :obj:`"mean"`: Take the mean of all tokens hidden states. + - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). + - :obj:`"attn"`: Not implemented now, use multi-head attention. + summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Whether or not to add a projection after the vector extraction. + summary_activation (:obj:`str`, `optional`): + Argument used when doing sequence summary. Used in for the multiple choice head in + :class:`~transformers.GPT2DoubleHeadsModel`. + + Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation. + summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes. + summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1): + Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` + and :class:`~transformers.TFGPT2DoubleHeadsModel`. + + The dropout ratio to be used after the projection and activation. + scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`): + Scale attention weights by dividing by sqrt(hidden_size).. + use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not the model should return the last key/values attentions (not used by all models). 
+ + Example:: + + >>> from transformers import GPT2Model, GPT2Config + + >>> # Initializing a GPT2 configuration + >>> configuration = GPT2Config() + + >>> # Initializing a model from the configuration + >>> model = GPT2Model(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + + model_type = "gpt2" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "hidden_size": "n_embd", + "max_position_embeddings": "n_positions", + "num_attention_heads": "n_head", + "num_hidden_layers": "n_layer", + } + + def __init__( + self, + vocab_size=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + n_inner=None, + activation_function="gelu_new", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + summary_type="cls_index", + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + scale_attn_weights=True, + use_cache=True, + bos_token_id=50256, + eos_token_id=50256, + **kwargs + ): + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.n_inner = n_inner + self.activation_function = activation_function + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + self.scale_attn_weights = scale_attn_weights + self.use_cache = use_cache + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py b/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py new file mode 100644 index 00000000..27626f0d --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/modeling_gpt2.py @@ -0,0 +1,1393 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
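A small sketch of how the attribute_map in the configuration class above resolves the library-generic attribute names to the GPT-2 specific ones:

from fastNLP.transformers.torch.models.gpt2 import GPT2Config

config = GPT2Config(n_embd=256, n_layer=6, n_head=8)
# generic names map onto the GPT-2 specific attributes via `attribute_map`
assert config.hidden_size == config.n_embd == 256
assert config.num_hidden_layers == config.n_layer == 6
assert config.num_attention_heads == config.n_head == 8
assert config.max_position_embeddings == config.n_positions == 1024   # default value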
+"""PyTorch OpenAI GPT-2 model.""" + +from dataclasses import dataclass +from typing import Optional, Tuple + +from .configuration_gpt2 import GPT2Config +from fastNLP.transformers.torch.activations import ACT2FN +from fastNLP.transformers.torch.file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + Conv1D, + PreTrainedModel, + SequenceSummary, + find_pruneable_heads_and_indices, + prune_conv1d_layer, +) +from fastNLP.transformers.torch.utils.model_parallel_utils import assert_device_map, get_device_map + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +__all__ = [ + "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST", + "GPT2DoubleHeadsModel", + "GPT2ForSequenceClassification", + "GPT2ForTokenClassification", + "GPT2LMHeadModel", + "GPT2Model", + "GPT2PreTrainedModel", +] + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +_CHECKPOINT_FOR_DOC = "gpt2" +_CONFIG_FOR_DOC = "GPT2Config" +_TOKENIZER_FOR_DOC = "GPT2Tokenizer" + +GPT2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "gpt2", + "gpt2-medium", + "gpt2-large", + "gpt2-xl", + "distilgpt2", + # See all GPT-2 models at https://huggingface.co/models?filter=gpt2 +] + +class GPT2Attention(Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + + max_positions = config.max_position_embeddings + self.register_buffer( + "bias", + torch.tril(torch.ones((max_positions, max_positions), dtype=torch.uint8)).view( + 1, 1, max_positions, max_positions + ), + ) + self.register_buffer("masked_bias", torch.tensor(-1e4)) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + self.split_size = self.embed_dim + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ ) + + self.scale_attn_weights = config.scale_attn_weights + self.is_cross_attention = is_cross_attention + + if self.is_cross_attention: + self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim) + self.q_attn = Conv1D(self.embed_dim, self.embed_dim) + else: + self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim) + self.c_proj = Conv1D(self.embed_dim, self.embed_dim) + + self.attn_dropout = nn.Dropout(config.attn_pdrop) + self.resid_dropout = nn.Dropout(config.resid_pdrop) + + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads) + index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)]) + + # Prune conv1d layers + self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1) + self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0) + + # Update hyper params + self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads)) + self.num_heads = self.num_heads - len(heads) + self.pruned_heads = self.pruned_heads.union(heads) + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) + + if self.scale_attn_weights: + attn_weights = attn_weights / (float(value.size(-1)) ** 0.5) + + if not self.is_cross_attention: + # if only "normal" attention layer implements causal mask + query_length, key_length = query.size(-2), key.size(-2) + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].bool() + attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype)) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(*new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + tensor = tensor.permute(0, 2, 1, 3).contiguous() + new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,) + return tensor.view(new_shape) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + if encoder_hidden_states is not None: + if not hasattr(self, "q_attn"): + raise ValueError( + "If class is used as cross attention, the weights `q_attn` have to be defined. " + "Please make sure to instantiate class with `GPT2Attention(..., is_cross_attention=True)`." 
+ ) + + query = self.q_attn(hidden_states) + key, value = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2) + attention_mask = encoder_attention_mask + else: + query, key, value = self.c_attn(hidden_states).split(self.split_size, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if layer_past is not None: + past_key, past_value = layer_past + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.c_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs # a, present, (attentions) + + +class GPT2MLP(Module): + def __init__(self, intermediate_size, config): + super().__init__() + embed_dim = config.hidden_size + self.c_fc = Conv1D(intermediate_size, embed_dim) + self.c_proj = Conv1D(embed_dim, intermediate_size) + self.act = ACT2FN[config.activation_function] + self.dropout = nn.Dropout(config.resid_pdrop) + + def forward(self, hidden_states): + hidden_states = self.c_fc(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.c_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class GPT2Block(Module): + def __init__(self, config): + super().__init__() + hidden_size = config.hidden_size + inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size + + self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.attn = GPT2Attention(config) + self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + if config.add_cross_attention: + self.crossattention = GPT2Attention(config, is_cross_attention=True) + self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = GPT2MLP(inner_dim, config) + + def forward( + self, + hidden_states, + layer_past=None, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=False, + output_attentions=False, + ): + residual = hidden_states + hidden_states = self.ln_1(hidden_states) + attn_outputs = self.attn( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] # output_attn: a, present, (attentions) + outputs = attn_outputs[1:] + # residual connection + hidden_states = attn_output + residual + + if encoder_hidden_states is not None: + # add one self-attention block for cross-attention + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + "cross-attention layers by setting `config.add_cross_attention=True`" + ) + residual = hidden_states + hidden_states = self.ln_cross_attn(hidden_states) + cross_attn_outputs = self.crossattention( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + ) + attn_output = 
cross_attn_outputs[0] + # residual connection + hidden_states = residual + attn_output + outputs = outputs + cross_attn_outputs[2:] # add cross attentions if we output attention weights + + residual = hidden_states + hidden_states = self.ln_2(hidden_states) + feed_forward_hidden_states = self.mlp(hidden_states) + # residual connection + hidden_states = residual + feed_forward_hidden_states + + if use_cache: + outputs = (hidden_states,) + outputs + else: + outputs = (hidden_states,) + outputs[1:] + + return outputs # hidden_states, present, (attentions, cross_attentions) + + +class GPT2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = GPT2Config + base_model_prefix = "transformer" + is_parallelizable = True + supports_gradient_checkpointing = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GPT2Model): + module.gradient_checkpointing = value + + +@dataclass +class GPT2DoubleHeadsModelOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided): + Language modeling loss. + mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided): + Multiple choice classification loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + mc_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + Prediction scores of the multiple choice classification head (scores for each choice before SoftMax). + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): + Tuple of length :obj:`config.n_layers`, containing tuples of tensors of shape :obj:`(batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + :obj:`past_key_values` input) to speed up sequential decoding. + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. 
+ + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, + sequence_length, sequence_length)`. + + GPT2Attentions weights after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional["torch.FloatTensor"] = None + mc_loss: Optional["torch.FloatTensor"] = None + logits: "torch.FloatTensor" = None + mc_logits: "torch.FloatTensor" = None + past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None + hidden_states: Optional[Tuple["torch.FloatTensor"]] = None + attentions: Optional[Tuple["torch.FloatTensor"]] = None + + +GPT2_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`): + :obj:`input_ids_length` = ``sequence_length`` if :obj:`past_key_values` is ``None`` else + ``past_key_values[0][0].shape[-2]`` (``sequence_length`` of input past key value states). Indices of input + sequence tokens in the vocabulary. + + If :obj:`past_key_values` is used, only ``input_ids`` that do not have their past calculated should be + passed as ``input_ids``. + + Indices can be obtained using :class:`~transformers.GPT2Tokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + past_key_values (:obj:`Tuple[Tuple[torch.Tensor]]` of length :obj:`config.n_layers`): + Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see + :obj:`past_key_values` output below). Can be used to speed up sequential decoding. The ``input_ids`` which + have their past given to this model should not be passed as ``input_ids`` as they have already been + computed. + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, input_ids_length)`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. 
+ + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + + If :obj:`past_key_values` is used, optionally only the last :obj:`inputs_embeds` have to be input (see + :obj:`past_key_values`). + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" +PARALLELIZE_DOCSTRING = r""" + This is an experimental feature and is a subject to change at a moment's notice. + + Uses a device map to distribute attention modules of the model across several devices. If no device map is given, + it will evenly distribute blocks across all devices. + + Args: + device_map (:obj:`Dict[int, list]`, optional, defaults to None): + A dictionary that maps attention modules to devices. Note that the embedding module and LMHead are always + automatically mapped to the first device (for esoteric reasons). That means that the first device should + have fewer attention modules mapped to it than other devices. For reference, the gpt2 models have the + following number of attention modules: + + - gpt2: 12 + - gpt2-medium: 24 + - gpt2-large: 36 + - gpt2-xl: 48 + + Example:: + + # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules: + model = GPT2LMHeadModel.from_pretrained('gpt2-xl') + device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8], + + 1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21], + 2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34], + 3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]} + model.parallelize(device_map) +""" +DEPARALLELIZE_DOCSTRING = r""" + Moves the model to cpu from a model parallel state. 
+ + Example:: + + # On a 4 GPU machine with gpt2-large: + model = GPT2LMHeadModel.from_pretrained('gpt2-large') + device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7], + + 1: [8, 9, 10, 11, 12, 13, 14, 15], + 2: [16, 17, 18, 19, 20, 21, 22, 23], + 3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]} + model.parallelize(device_map) # Splits the model across several devices + model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache() +""" + + +@add_start_docstrings( + "The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, +) +class GPT2Model(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = ["attn.masked_bias"] + + def __init__(self, config): + super().__init__(config) + + self.embed_dim = config.hidden_size + + self.wte = nn.Embedding(config.vocab_size, self.embed_dim) + self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim) + + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([GPT2Block(config) for _ in range(config.num_hidden_layers)]) + self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + self.gradient_checkpointing = False + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + # Check validity of device_map + self.device_map = ( + get_device_map(len(self.h), range(torch.cuda.device_count())) if device_map is None else device_map + ) + assert_device_map(self.device_map, len(self.h)) + self.model_parallel = True + self.first_device = "cpu" if "cpu" in self.device_map.keys() else "cuda:" + str(min(self.device_map.keys())) + self.last_device = "cuda:" + str(max(self.device_map.keys())) + self.wte = self.wte.to(self.first_device) + self.wpe = self.wpe.to(self.first_device) + # Load onto devices + for k, v in self.device_map.items(): + for block in v: + cuda_device = "cuda:" + str(k) + self.h[block] = self.h[block].to(cuda_device) + # ln_f to last + self.ln_f = self.ln_f.to(self.last_device) + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.model_parallel = False + self.device_map = None + self.first_device = "cpu" + self.last_device = "cpu" + self.wte = self.wte.to("cpu") + self.wpe = self.wpe.to("cpu") + for index in range(len(self.h)): + self.h[index] = self.h[index].to("cpu") + self.ln_f = self.ln_f.to("cpu") + torch.cuda.empty_cache() + + def get_input_embeddings(self): + return self.wte + + def set_input_embeddings(self, new_embeddings): + self.wte = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + batch_size = input_ids.shape[0] + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size = inputs_embeds.shape[0] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + if position_ids is not None: + position_ids = position_ids.view(-1, input_shape[-1]) + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.h)) + else: + past_length = past_key_values[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1]) + + # GPT2Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + attention_mask = attention_mask.view(batch_size, -1) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
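+ # For example, a padding mask of [1, 1, 0] becomes [0.0, 0.0, -10000.0] below, so the
+ # padded position contributes (almost) nothing after the softmax over attention scores.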
+ attention_mask = attention_mask.to(dtype=self.dtype) # fp16 compatibility + attention_mask = (1.0 - attention_mask) * -10000.0 + + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.add_cross_attention and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # head_mask has shape n_layer x batch x n_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + + if inputs_embeds is None: + inputs_embeds = self.wte(input_ids) + position_embeds = self.wpe(position_ids) + hidden_states = inputs_embeds + position_embeds + + if token_type_ids is not None: + token_type_embeds = self.wte(token_type_ids) + hidden_states = hidden_states + token_type_embeds + + hidden_states = self.drop(hidden_states) + + output_shape = input_shape + (hidden_states.size(-1),) + + presents = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + all_hidden_states = () if output_hidden_states else None + for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)): + + # Model parallel + if self.model_parallel: + torch.cuda.set_device(hidden_states.device) + # Ensure layer_past is on same device as hidden_states (might not be correct) + if layer_past is not None: + layer_past = tuple(past_state.to(hidden_states.device) for past_state in layer_past) + # Ensure that attention_mask is always on the same device as hidden_states + if attention_mask is not None: + attention_mask = attention_mask.to(hidden_states.device) + if isinstance(head_mask, torch.Tensor): + head_mask = head_mask.to(hidden_states.device) + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." 
+ ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, use_cache, output_attentions) + + return custom_forward + + outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + None, + attention_mask, + head_mask[i], + encoder_hidden_states, + encoder_attention_mask, + ) + else: + outputs = block( + hidden_states, + layer_past=layer_past, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + if use_cache is True: + presents = presents + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (outputs[3 if use_cache else 2],) + + # Model Parallel: If it's the last layer for that device, put things on the next device + if self.model_parallel: + for k, v in self.device_map.items(): + if i == v[-1] and "cuda:" + str(k) != self.last_device: + hidden_states = hidden_states.to("cuda:" + str(k + 1)) + + hidden_states = self.ln_f(hidden_states) + + hidden_states = hidden_states.view(*output_shape) + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [hidden_states, presents, all_hidden_states, all_self_attentions, all_cross_attentions] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=presents, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). 
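A minimal greedy-generation sketch for this head, using the "gpt2" checkpoint listed in the archive map above:

from fastNLP.transformers.torch.models.gpt2 import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

input_ids = tokenizer("Hello, my dog is", return_tensors="pt")["input_ids"]
generated = model.generate(input_ids, max_length=20, do_sample=False)
print(tokenizer.decode(generated[0], skip_special_tokens=True))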
+ """, + GPT2_START_DOCSTRING, +) +class GPT2LMHeadModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to + ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + cross_attentions=transformer_outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple["torch.Tensor"]], beam_idx: "torch.Tensor") -> Tuple[Tuple["torch.Tensor"]]: + """ + This function is used to re-order the :obj:`past_key_values` cache if + :meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ +The GPT2 Model transformer with a language modeling and a multiple-choice classification head on top e.g. for +RocStories/SWAG tasks. The two heads are two linear layers. The language modeling head has its weights tied to the +input embeddings, the classification head takes as input the input of a specified classification token index in the +input sequence). 
+""", + GPT2_START_DOCSTRING, +) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"attn.masked_bias", r"attn.bias", r"lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + config.num_labels = 1 + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + self.multiple_choice_head = SequenceSummary(config) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings(PARALLELIZE_DOCSTRING) + def parallelize(self, device_map=None): + self.device_map = ( + get_device_map(len(self.transformer.h), range(torch.cuda.device_count())) + if device_map is None + else device_map + ) + assert_device_map(self.device_map, len(self.transformer.h)) + self.transformer.parallelize(self.device_map) + self.lm_head = self.lm_head.to(self.transformer.first_device) + self.multiple_choice_head = self.multiple_choice_head.to(self.transformer.first_device) + self.model_parallel = True + + @add_start_docstrings(DEPARALLELIZE_DOCSTRING) + def deparallelize(self): + self.transformer.deparallelize() + self.transformer = self.transformer.to("cpu") + self.lm_head = self.lm_head.to("cpu") + self.multiple_choice_head = self.multiple_choice_head.to("cpu") + self.model_parallel = False + torch.cuda.empty_cache() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past=None, **kwargs): + token_type_ids = kwargs.get("token_type_ids", None) + # only last token for inputs_ids if past is defined in kwargs + if past: + input_ids = input_ids[:, -1].unsqueeze(-1) + if token_type_ids is not None: + token_type_ids = token_type_ids[:, -1].unsqueeze(-1) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past: + position_ids = position_ids[:, -1].unsqueeze(-1) + else: + position_ids = None + + return { + "input_ids": input_ids, + "past_key_values": past, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + "token_type_ids": token_type_ids, + } + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GPT2DoubleHeadsModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + mc_token_ids=None, + labels=None, + mc_labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + **kwargs, + ): + r""" + mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input): + Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - + 1[``. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. 
you can set + ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size - 1]`` All labels set to + ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size - 1]`` + mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see + `input_ids` above) + + Return: + + Example:: + + >>> import torch + >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel + + >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2') + + >>> # Add a [CLS] to the vocabulary (we should train it also!) + >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'}) + + >>> embedding_layer = model.resize_token_embeddings(len(tokenizer)) # Update the model embeddings with the new vocabulary size + + >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"] + >>> encoded_choices = [tokenizer.encode(s) for s in choices] + >>> cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices] + + >>> input_ids = torch.tensor(encoded_choices).unsqueeze(0) # Batch size: 1, number of choices: 2 + >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 + + >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) + >>> lm_logits = outputs.logits + >>> mc_logits = outputs.mc_logits + + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + + # Set device for model parallelism + if self.model_parallel: + torch.cuda.set_device(self.transformer.first_device) + hidden_states = hidden_states.to(self.lm_head.weight.device) + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + mc_loss = None + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + mc_loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) + lm_loss = None + if labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) + + if not return_dict: + output = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_loss is not None: + output = (mc_loss,) + output + return ((lm_loss,) + output) if lm_loss is not None else output + + return GPT2DoubleHeadsModelOutput( + loss=lm_loss, + mc_loss=mc_loss, + logits=lm_logits, + mc_logits=mc_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + @staticmethod + def _reorder_cache(past: Tuple[Tuple["torch.Tensor"]], beam_idx: "torch.Tensor") -> Tuple[Tuple["torch.Tensor"]]: + """ + This function is used to re-order the :obj:`past_key_values` cache if + 
:meth:`~transformers.PreTrainedModel.beam_search` or :meth:`~transformers.PreTrainedModel.beam_sample` is + called. This is required to match :obj:`past_key_values` with the correct beam_idx at every generation step. + """ + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past + ) + + +@add_start_docstrings( + """ + The GPT2 Model transformer with a sequence classification head on top (linear layer). + + :class:`~transformers.GPT2ForSequenceClassification` uses the last token in order to do the classification, as + other causal models (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + :obj:`pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each + row. If no :obj:`pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot + guess the padding tokens when :obj:`inputs_embeds` are passed instead of :obj:`input_ids`, it does the same (take + the last value in each row of the batch). + """, + GPT2_START_DOCSTRING, +) +class GPT2ForSequenceClassification(GPT2PreTrainedModel): + _keys_to_ignore_on_load_missing = [r"h\.\d+\.attn\.masked_bias", r"lm_head\.weight"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = GPT2Model(config) + self.score = nn.Linear(config.n_embd, self.num_labels, bias=False) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="microsoft/DialogRPT-updown", + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + assert ( + self.config.pad_token_id is not None or batch_size == 1 + ), "Cannot handle batch sizes > 1 if no padding token is defined." 
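The last-token pooling described in the GPT2ForSequenceClassification docstring above can be illustrated in isolation. The snippet below is a minimal, self-contained sketch with toy tensors and an assumed `pad_token_id` of 0 (none of it is part of the vendored file); it mirrors the selection of the last non-padding position that the forward pass performs.

    import torch

    # Toy setup: batch of 3 sequences, seq_len 5, 4 classes; pad_token_id assumed to be 0 here.
    pad_token_id = 0
    input_ids = torch.tensor([
        [11, 12, 13, 0, 0],    # 3 real tokens
        [21, 22, 23, 24, 25],  # no padding
        [31, 0, 0, 0, 0],      # 1 real token
    ])
    logits = torch.randn(3, 5, 4)  # per-token classification scores

    # Index of the last non-padding token in each row -> tensor([2, 4, 0])
    sequence_lengths = torch.ne(input_ids, pad_token_id).sum(-1) - 1

    # Pool one logit vector per sequence at that position, shape (3, 4).
    pooled_logits = logits[range(input_ids.size(0)), sequence_lengths]
    print(sequence_lengths.tolist(), pooled_logits.shape)

When no `pad_token_id` is defined, the same indexing degenerates to taking the final position of every row, which is why the code below falls back to `sequence_lengths = -1`.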
+ if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1 + else: + sequence_lengths = -1 + logger.warning( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + f"unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[range(batch_size), sequence_lengths] + + loss = None + if labels is not None: + if self.num_labels == 1: + # We are doing regression + loss_fct = MSELoss() + loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) + else: + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + GPT2 Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + GPT2_START_DOCSTRING, +) +class GPT2ForTokenClassification(GPT2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = GPT2Model(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + # Model parallel + self.model_parallel = False + self.device_map = None + + @add_start_docstrings_to_model_forward(GPT2_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint="microsoft/DialogRPT-updown", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + past_key_values=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). 
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py b/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py new file mode 100644 index 00000000..94932da0 --- /dev/null +++ b/fastNLP/transformers/torch/models/gpt2/tokenization_gpt2.py @@ -0,0 +1,308 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for OpenAI GPT.""" + +import json +import os +from functools import lru_cache +from typing import TYPE_CHECKING, List, Optional, Tuple + +import regex as re + +from fastNLP.transformers.torch.tokenization_utils import AddedToken, PreTrainedTokenizer +# if TYPE_CHECKING: +# from transformers.pipelines.conversational import Conversation +from fastNLP.core.log import logger + +__all__ = [ + "GPT2Tokenizer", +] + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json", + }, + "merges_file": { + "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt", + "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt", + "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt", + "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt", + "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "gpt2": 1024, + "gpt2-medium": 1024, + "gpt2-large": 1024, + "gpt2-xl": 1024, + "distilgpt2": 1024, +} + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class GPT2Tokenizer(PreTrainedTokenizer): + """ + Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import GPT2Tokenizer + >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + >>> tokenizer("Hello world")['input_ids'] + [15496, 995] + >>> tokenizer(" Hello world")['input_ids'] + [18435, 995] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. 
note:: + + When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first + one). + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The beginning of sequence token. + eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): + The end of sequence token. + add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (GPT2 tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + **kwargs + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + super().__init__( + errors=errors, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + 
try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" 
+ ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if is_split_into_words or add_prefix_space: + text = " " + text + return (text, kwargs) + + # def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: + # input_ids = [] + # for is_user, text in conversation.iter_texts(): + # input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) + # if len(input_ids) > self.model_max_length: + # input_ids = input_ids[-self.model_max_length :] + # return input_ids diff --git a/fastNLP/transformers/torch/models/roberta/__init__.py b/fastNLP/transformers/torch/models/roberta/__init__.py new file mode 100644 index 00000000..582ea614 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/__init__.py @@ -0,0 +1,21 @@ +__all__ = [ + "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "RobertaConfig", + + "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "RobertaForCausalLM", + "RobertaForMaskedLM", + "RobertaForMultipleChoice", + "RobertaForQuestionAnswering", + "RobertaForSequenceClassification", + "RobertaForTokenClassification", + "RobertaModel", + "RobertaPreTrainedModel", + + "RobertaTokenizer", +] + +from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig +from .tokenization_roberta import RobertaTokenizer +from .modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, RobertaForCausalLM, RobertaForMaskedLM, RobertaForMultipleChoice, \ + RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, RobertaPreTrainedModel \ No newline at end of file diff --git a/fastNLP/transformers/torch/models/roberta/configuration_roberta.py b/fastNLP/transformers/torch/models/roberta/configuration_roberta.py new file mode 100644 index 00000000..9a514be1 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/configuration_roberta.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
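A rough usage sketch for the byte-level BPE tokenizer defined above, assuming the module path created by this patch and the usual `PreTrainedTokenizer` interface from the vendored base class; the vocab/merges paths are placeholders, and the sample outputs are only indicative:

    from fastNLP.transformers.torch.models.gpt2.tokenization_gpt2 import GPT2Tokenizer

    # Placeholder paths; point these at a downloaded gpt2 vocab.json / merges.txt.
    tokenizer = GPT2Tokenizer(vocab_file="path/to/vocab.json", merges_file="path/to/merges.txt")

    tokens = tokenizer.tokenize("Hello world")           # byte-level BPE tokens, e.g. ['Hello', 'Ġworld']
    ids = tokenizer.convert_tokens_to_ids(tokens)
    text = tokenizer.convert_tokens_to_string(tokens)    # round-trips back to "Hello world"
    print(tokens, ids, text)

    # With add_prefix_space=True the first word is treated like any other word,
    # i.e. it also receives the leading-space marker during tokenization.
    with_space = GPT2Tokenizer(
        vocab_file="path/to/vocab.json", merges_file="path/to/merges.txt", add_prefix_space=True
    )
    print(with_space.tokenize("Hello world"))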
+""" RoBERTa configuration """ +from ..bert.configuration_bert import BertConfig +from fastNLP.core.log import logger + +__all__ = [ + "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", + "RobertaConfig", +] + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", +} + + +class RobertaConfig(BertConfig): + r""" + This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a + :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified + arguments, defining the model architecture. + + + Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model + outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. + + The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the + same defaults. Please check the parent class for more information. + + Examples:: + + >>> from transformers import RobertaConfig, RobertaModel + + >>> # Initializing a RoBERTa configuration + >>> configuration = RobertaConfig() + + >>> # Initializing a model from the configuration + >>> model = RobertaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + """ + model_type = "roberta" + + def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): + """Constructs RobertaConfig.""" + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/fastNLP/transformers/torch/models/roberta/modeling_roberta.py b/fastNLP/transformers/torch/models/roberta/modeling_roberta.py new file mode 100644 index 00000000..4e914214 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/modeling_roberta.py @@ -0,0 +1,1584 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RoBERTa model. 
""" + +import math + +from packaging import version + +from fastNLP.transformers.torch.activations import ACT2FN, gelu +from fastNLP.transformers.torch.file_utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + replace_return_docstrings, +) +from fastNLP.transformers.torch.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from fastNLP.transformers.torch.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from .configuration_roberta import RobertaConfig +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + import torch.utils.checkpoint + from torch import nn + from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss, Module +else: + from fastNLP.core.utils.dummy_class import DummyClass as Module + +__all__ = [ + "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", + "RobertaForCausalLM", + "RobertaForMaskedLM", + "RobertaForMultipleChoice", + "RobertaForQuestionAnswering", + "RobertaForSequenceClassification", + "RobertaForTokenClassification", + "RobertaModel", + "RobertaPreTrainedModel", +] + +_CHECKPOINT_FOR_DOC = "roberta-base" +_CONFIG_FOR_DOC = "RobertaConfig" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + +ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "roberta-base", + "roberta-large", + "roberta-large-mnli", + "distilroberta-base", + "roberta-base-openai-detector", + "roberta-large-openai-detector", + # See all RoBERTa models at https://huggingface.co/models?filter=roberta +] + + +class RobertaEmbeddings(Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. 
+ """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + if version.parse(torch.__version__) > version.parse("1.6.0"): + self.register_buffer( + "token_type_ids", + torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device), + persistent=False, + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. 
+ + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->Roberta +class RobertaSelfAttention(Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. 
+ # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
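The additive-mask convention used in this attention implementation (and in the GPT-2 code earlier, where disallowed positions are set to a large negative value before the softmax) is easy to misread. Below is a small illustrative sketch of the core score computation with toy shapes; it is not part of the vendored file.

    import math
    import torch

    batch, heads, seq_len, head_dim = 2, 4, 5, 8
    query = torch.randn(batch, heads, seq_len, head_dim)
    key = torch.randn(batch, heads, seq_len, head_dim)
    value = torch.randn(batch, heads, seq_len, head_dim)

    # 1 = attend, 0 = padding; the second sequence has two padded positions.
    padding_mask = torch.tensor([[1, 1, 1, 1, 1],
                                 [1, 1, 1, 0, 0]], dtype=torch.float)
    # Broadcastable additive mask: 0 where attention is allowed, -10000 where it is not.
    additive_mask = (1.0 - padding_mask)[:, None, None, :] * -10000.0

    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(head_dim)
    scores = scores + additive_mask                      # masked positions become ~ -10000
    probs = torch.softmax(scores, dim=-1)                # ~0 probability on padded keys
    context = torch.matmul(probs, value)                 # (batch, heads, seq_len, head_dim)
    print(probs[1, 0, 0])                                # last two entries are ~0

Adding a large negative number rather than multiplying by zero keeps every row of the softmax well defined while driving the masked probabilities to (numerically) zero.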
+ attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput +class RobertaSelfOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta +class RobertaAttention(Module): + def __init__(self, config): + super().__init__() + self.self = RobertaSelfAttention(config) + self.output = RobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate +class RobertaIntermediate(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput +class RobertaOutput(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + 
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->Roberta +class RobertaLayer(Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = RobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + assert self.is_decoder, f"{self} should be used as a decoder model if cross attention is added" + self.crossattention = RobertaAttention(config) + self.intermediate = RobertaIntermediate(config) + self.output = RobertaOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + assert hasattr( + self, "crossattention" + ), f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->Roberta +class 
RobertaEncoder(Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([RobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class RobertaPooler(Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class RobertaPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. 
+ """ + + config_class = RobertaConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, RobertaEncoder): + module.gradient_checkpointing = value + + def update_keys_to_ignore(self, config, del_keys_to_ignore): + """Remove some keys from ignore list""" + if not config.tie_word_embeddings: + # must make a new list, or the class variable gets modified! + self._keys_to_ignore_on_save = [k for k in self._keys_to_ignore_on_save if k not in del_keys_to_ignore] + self._keys_to_ignore_on_load_missing = [ + k for k in self._keys_to_ignore_on_load_missing if k not in del_keys_to_ignore + ] + + +ROBERTA_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic + methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, + pruning heads etc.) + + This model is also a PyTorch `torch.nn.Module `__ + subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to + general usage and behavior. + + Parameters: + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +ROBERTA_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.RobertaTokenizer`. See + :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, + 1]``: + + - 0 corresponds to a `sentence A` token, + - 1 corresponds to a `sentence B` token. + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. 
Selected in the range ``[0, + config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + ROBERTA_START_DOCSTRING, +) +class RobertaModel(RobertaPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the :obj:`is_decoder` argument of the configuration + set to :obj:`True`. To be used in a Seq2Seq model, the model needs to initialized with both :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + + .. _`Attention is all you need`: https://arxiv.org/abs/1706.03762 + + """ + + _keys_to_ignore_on_load_missing = [r"position_ids"] + + # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = RobertaEmbeddings(config) + self.encoder = RobertaEncoder(config) + + self.pooler = RobertaPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. 
heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bert.modeling_bert.BertModel.forward + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
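+
+        A minimal encoder-only usage sketch (illustrative, not part of the upstream docstring; it assumes the
+        ``roberta-base`` checkpoint is reachable and that this package re-exports the classes as upstream does)::
+
+            >>> from fastNLP.transformers.torch.models.roberta import RobertaModel, RobertaTokenizer
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> model = RobertaModel.from_pretrained("roberta-base")
+            >>> inputs = tokenizer("Hello world", return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> hidden = outputs.last_hidden_state    # (batch_size, sequence_length, hidden_size)
+            >>> pooled = outputs.pooler_output        # (batch_size, hidden_size); None if add_pooling_layer=False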
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
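+        # ``get_extended_attention_mask`` (from the ported ``modeling_utils``' ``ModuleUtilsMixin``) broadcasts the
+        # 2D/3D mask to ``[batch_size, 1, 1, to_seq_length]`` (or ``[batch_size, 1, from_seq_length, to_seq_length]``
+        # when a causal decoder mask is built) and maps ``1 -> 0.0`` and ``0 -> -10000.0`` so that it can simply be
+        # added to the raw attention scores before the softmax.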
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """RoBERTa Model with a `language modeling` head on top for CLM fine-tuning. 
""", ROBERTA_START_DOCSTRING +) +class RobertaForCausalLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `RobertaLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + self.init_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ + Returns: + + Example:: + + >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig + >>> import torch + + >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + >>> config = RobertaConfig.from_pretrained("roberta-base") + >>> config.is_decoder = True + >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss() + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past} + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + +@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING) +class RobertaForMaskedLM(RobertaPreTrainedModel): + _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"] + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `RobertaForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." 
+ ) + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.lm_head = RobertaLMHead(config) + + # The LM head weights require special treatment only when they are tied with the word embeddings + self.update_keys_to_ignore(config, ["lm_head.decoder.weight"]) + + self.init_weights() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaLMHead(Module): + """Roberta Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +@add_start_docstrings( + """ + RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on 
top of the + pooled output) e.g. for GLUE tasks. + """, + ROBERTA_START_DOCSTRING, +) +class RobertaForSequenceClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.classifier = RobertaClassificationHead(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. 
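+
+    The forward pass expects every tensor input in shape ``(batch_size, num_choices, sequence_length)``; each tensor
+    is flattened to ``(batch_size * num_choices, sequence_length)`` before the base model runs, and the pooled outputs
+    are projected to one score per choice. A usage sketch (illustrative, not part of the upstream docstring; it
+    assumes a reachable ``roberta-base`` checkpoint and that this module re-exports the classes as upstream does)::
+
+        >>> from fastNLP.transformers.torch.models.roberta import RobertaForMultipleChoice, RobertaTokenizer
+        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+        >>> model = RobertaForMultipleChoice.from_pretrained("roberta-base")
+        >>> prompt = "The capital of France is"
+        >>> choices = ["Paris.", "Rome."]
+        >>> enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
+        >>> inputs = {k: v.unsqueeze(0) for k, v in enc.items()}   # add the num_choices dimension
+        >>> logits = model(**inputs).logits                        # shape (batch_size=1, num_choices=2)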
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForMultipleChoice(RobertaPreTrainedModel): + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + + self.roberta = RobertaModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + token_type_ids=None, + attention_mask=None, + labels=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., + num_choices-1]`` where :obj:`num_choices` is the size of the second dimension of the input tensors. (See + :obj:`input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. 
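+
+    A classification logit is produced for every token position; when both ``labels`` and an ``attention_mask`` are
+    given, padding positions are mapped to the loss function's ``ignore_index`` so they do not contribute to the
+    cross-entropy loss. A minimal sketch of that masking step (illustrative, not part of the upstream docstring)::
+
+        >>> import torch
+        >>> logits = torch.randn(2, 6, 9)                      # (batch_size, seq_len, num_labels)
+        >>> labels = torch.randint(0, 9, (2, 6))
+        >>> mask = torch.tensor([[1, 1, 1, 0, 0, 0], [1, 1, 1, 1, 1, 1]])
+        >>> loss_fct = torch.nn.CrossEntropyLoss()
+        >>> active_labels = torch.where(mask.view(-1) == 1, labels.view(-1), torch.tensor(loss_fct.ignore_index))
+        >>> loss = loss_fct(logits.view(-1, 9), active_labels)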
+ """, + ROBERTA_START_DOCSTRING, +) +class RobertaForTokenClassification(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + labels=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels - + 1]``. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class RobertaClassificationHead(Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. 
to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ROBERTA_START_DOCSTRING, +) +class RobertaForQuestionAnswering(RobertaPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + tokenizer_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the + sequence are not taken into account for computing the loss. 
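+
+        Example (an illustrative sketch, not part of the upstream docstring; it assumes a reachable ``roberta-base``
+        checkpoint — a checkpoint that was not fine-tuned on QA will of course give meaningless spans)::
+
+            >>> from fastNLP.transformers.torch.models.roberta import RobertaForQuestionAnswering, RobertaTokenizer
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
+            >>> question, context = "Who developed RoBERTa?", "RoBERTa was developed by Facebook AI."
+            >>> inputs = tokenizer(question, context, return_tensors="pt")
+            >>> outputs = model(**inputs)
+            >>> start = outputs.start_logits.argmax().item()   # most likely start index
+            >>> end = outputs.end_logits.argmax().item()       # most likely end index
+            >>> answer = tokenizer.decode(inputs["input_ids"][0, start:end + 1])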
+ """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx diff --git a/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py b/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py new file mode 100644 index 00000000..c0c11e29 --- /dev/null +++ b/fastNLP/transformers/torch/models/roberta/tokenization_roberta.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for RoBERTa.""" + +from typing import List, Optional + +from fastNLP.transformers.torch.tokenization_utils import AddedToken +from ..gpt2.tokenization_gpt2 import GPT2Tokenizer +from fastNLP.core.log import logger + +__all__ = [ + "RobertaTokenizer", +] + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json", + }, + "merges_file": { + "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt", + "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt", + "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt", + "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt", + "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt", + "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "roberta-base": 512, + "roberta-large": 512, + "roberta-large-mnli": 512, + "distilroberta-base": 512, + "roberta-base-openai-detector": 512, + "roberta-large-openai-detector": 512, +} + + +class RobertaTokenizer(GPT2Tokenizer): + """ + Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + :: + + >>> from transformers import RobertaTokenizer + >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base") + >>> tokenizer("Hello world")['input_ids'] + [0, 31414, 232, 328, 2] + >>> tokenizer(" Hello world")['input_ids'] + [0, 20920, 232, 2] + + You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + .. note:: + + When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first + one). + + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + + Args: + vocab_file (:obj:`str`): + Path to the vocabulary file. + merges_file (:obj:`str`): + Path to the merges file. + errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode + `__ for more information. + bos_token (:obj:`str`, `optional`, defaults to :obj:`""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + .. 
note::
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to add an initial space to the input. This allows treating the leading word just like any
+            other word. (The RoBERTa tokenizer detects the beginning of words by the preceding space.)
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        **kwargs
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # Mask token behaves like a normal word, i.e.
include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A RoBERTa sequence has the following format: + + - single sequence: `` X `` + - pair of sequences: `` A B `` + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + + Returns: + :obj:`List[int]`: List of zeros. 
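+
+        Example (illustrative only, not part of the upstream docstring; the ids shown assume the standard
+        ``roberta-base`` vocabulary, where ``<s>`` is id 0 and ``</s>`` is id 2)::
+
+            >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+            >>> ids_a, ids_b = [31414], [232]                              # two hypothetical single-token sequences
+            >>> tokenizer.build_inputs_with_special_tokens(ids_a, ids_b)   # <s> A </s></s> B </s>
+            [0, 31414, 2, 2, 232, 2]
+            >>> tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
+            [0, 0, 0, 0, 0, 0]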
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) diff --git a/fastNLP/transformers/torch/tokenization_utils.py b/fastNLP/transformers/torch/tokenization_utils.py new file mode 100644 index 00000000..f0f57e39 --- /dev/null +++ b/fastNLP/transformers/torch/tokenization_utils.py @@ -0,0 +1,915 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see + tokenization_utils_fast.py +""" +import bisect +import itertools +import re +import unicodedata +from collections import OrderedDict +from typing import Any, Dict, List, Optional, Tuple, Union, overload + +from .file_utils import PaddingStrategy, TensorType, add_end_docstrings +from .tokenization_utils_base import ( + ENCODE_KWARGS_DOCSTRING, + ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, + INIT_TOKENIZER_DOCSTRING, + AddedToken, + BatchEncoding, + EncodedInput, + EncodedInputPair, + PreTokenizedInput, + PreTokenizedInputPair, + PreTrainedTokenizerBase, + TextInput, + TextInputPair, + TruncationStrategy, +) + +from fastNLP.core.log import logger + +# Slow tokenizers are saved in a vocabulary plus three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +class Trie: + """ + Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass + Loose reference https://en.wikipedia.org/wiki/Trie + """ + + def __init__(self): + self.data = {} + + def add(self, word: str): + """ + Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. + The special key `""` is used to represent termination. + + This function is idempotent, adding twice the same word will leave the trie unchanged + + Example:: + + >>> trie = Trie() + >>> trie.add("Hello 友達") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} + >>> trie.add("Hello") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} + """ + if not word: + # Prevent empty string + return + ref = self.data + for char in word: + ref[char] = char in ref and ref[char] or {} + ref = ref[char] + ref[""] = 1 + + def split(self, text: str) -> List[str]: + """ + Will look for the words added to the trie within `text`. Output is the original string splitted along the + boundaries of the words found. 
+ + This trie will match the longest possible word first ! + + Example:: + + >>> trie = Trie() + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS] This is a extra_id_100"] + >>> trie.add("[CLS]") + >>> trie.add("extra_id_1") + >>> trie.add("extra_id_100") + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS]", " This is a ", "extra_id_100"] + """ + # indexes are counted left of the chars index. + # "hello", index 0, is left of h, index 1 is between h and e. + # index 5 is right of the "o". + + # States are going to capture every possible start (indexes as above) + # as keys, and have as values, a pointer to the position in the trie + # where we're at. This is a partial match for now. + # This enables to keep track of multiple matches while we're iterating + # the string + # If the trie contains, "blowing", and "lower" and we encounter the + # string "blower", we need to split into ["b", "lower"]. + # This is where we need to keep track of multiple possible starts. + states = OrderedDict() + + # This will contain every indices where we need + # to cut. + # We force to cut at offset 0 and len(text) (added later) + offsets = [0] + + # This is used by the lookahead which needs to skip over + # some text where the full match exceeded the place in the initial + # for loop + skip = None + # Main loop, Giving this algorithm O(n) complexity + for current, current_char in enumerate(text): + if skip and current < skip: + # Prevents the lookahead for matching twice + # like extra_id_100 and id_100 + continue + + # This will track every state + # that stop matching, we need to stop tracking them. + # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then + # fail on "b", we need to remove 0 from the valid states. + to_remove = set() + # Whenever we found a match, we need to drop everything + # this is a greedy algorithm, it will match on the first found token + reset = False + + # In this case, we already have partial matches (But unfinished) + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + + # Lookahead to match longest first + # Important in case of extra_id_1 vs extra_id_100 + lookahead_index = current + end = current + next_char = text[lookahead_index] if lookahead_index < len(text) else None + while next_char in trie_pointer: + trie_pointer = trie_pointer[next_char] + lookahead_index += 1 + if "" in trie_pointer: + end = lookahead_index + skip = lookahead_index + + if lookahead_index == len(text): + # End of string + break + next_char = text[lookahead_index] + # End lookahead + + # Storing and resetting + offsets.append(start) + offsets.append(end) + reset = True + elif current_char in trie_pointer: + # The current character being looked at has a match within the trie + # update the pointer (it will be stored back into states later). + trie_pointer = trie_pointer[current_char] + + # Storing back the new pointer into the states. + # Partial matches got longer by one. + states[start] = trie_pointer + else: + # The new character has not match in the trie, we need + # to stop keeping track of this partial match. + # We can't do it directly within the loop because of how + # python iteration works + to_remove.add(start) + + # Either clearing the full start (we found a real match) + # Or clearing only the partial matches that didn't work. 
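+            # (`reset` is only True when the greedy match above already recorded a cut, in which case every
+            # in-flight partial match is discarded; otherwise only the partial matches that died on the
+            # current character are dropped.)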
+ if reset: + states = {} + else: + for start in to_remove: + del states[start] + + # If this character is a starting character within the trie + # start keeping track of this partial match. + if current_char in self.data: + states[current] = self.data[current_char] + + # We have a cut at the end with states. + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + end = len(text) + offsets.append(start) + offsets.append(end) + # Longest cut is always the one with lower start so the first + # item so we need to break. + break + + # We have all the offsets now, we just need to do the actual splitting. + # We need to eventually add the first part of the string and the eventual + # last part. + offsets.append(len(text)) + tokens = [] + start = 0 + for end in offsets: + if start == end: + # This might happen if there's a match at index 0 + # we're also preventing zero-width cuts in case of two + # consecutive matches + continue + tokens.append(text[start:end]) + start = end + + return tokens + + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str): + """ + Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted. + """ + insertion_idx = bisect.bisect_left(token_list, new_token) + # Checks if new_token is already in the ordered token_list + if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token: + # new_token is in token_list, don't add + return + else: + token_list.insert(insertion_idx, new_token) + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizer(PreTrainedTokenizerBase): + """ + Base class for all slow tokenizers. 
+ + Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`. + + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading + pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Added tokens - We store this for both slow and fast tokenizers + # until the serialization of Fast tokenizers is updated + self.added_tokens_encoder: Dict[str, int] = {} + self.added_tokens_decoder: Dict[int, str] = {} + self.unique_no_split_tokens: List[str] = [] + self.tokens_trie = Trie() + + self._decode_use_source_tokenizer = False + + @property + def is_fast(self) -> bool: + return False + + @property + def vocab_size(self) -> int: + """ + :obj:`int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError + + def get_added_vocab(self) -> Dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. + + Returns: + :obj:`Dict[str, int]`: The added tokens. + """ + return self.added_tokens_encoder + + def __len__(self): + """ + Size of the full vocabulary with the added tokens. + """ + return self.vocab_size + len(self.added_tokens_encoder) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + Args: + new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`): + Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by + checking if the tokenizer assign the index of the ``unk_token`` to them). + special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the tokens should be added as special tokens. + + Returns: + :obj:`int`: The number of tokens actually added to the vocabulary. + + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 
+ model.resize_token_embeddings(len(tokenizer)) + """ + new_tokens = [str(tok) for tok in new_tokens] + + tokens_to_add = [] + for token in new_tokens: + if not isinstance(token, str): + raise TypeError(f"Token {token} is not a string but a {type(token)}.") + if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: + token = token.lower() + if ( + token != self.unk_token + and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) + and token not in tokens_to_add + ): + tokens_to_add.append(token) + if self.verbose: + logger.info(f"Adding {token} to the vocabulary") + + added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) + added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} + self.added_tokens_encoder.update(added_tok_encoder) + self.added_tokens_decoder.update(added_tok_decoder) + + # Make sure we don't split on any special tokens (even they were already in the vocab before e.g. for Albert) + if special_tokens: + if len(new_tokens) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) + else: + # Or on the newly added tokens + if len(tokens_to_add) == 1: + _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) + else: + self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) + self._create_trie(self.unique_no_split_tokens) + + return len(tokens_to_add) + + def _create_trie(self, unique_no_split_tokens): + trie = Trie() + for token in unique_no_split_tokens: + if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens: + trie.add(token.lower()) + else: + trie.add(token) + self.tokens_trie = trie + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + .. note:: + This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not + put this inside your training loop. + + Args: + pair (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. + + Returns: + :obj:`int`: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def tokenize(self, text: TextInput, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, using the tokenizer. + + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). Takes care of added tokens. + + Args: + text (:obj:`str`): + The sequence to be encoded. + **kwargs (additional keyword arguments): + Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method. + + Returns: + :obj:`List[str]`: The list of tokens. + """ + # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors + all_special_tokens_extended = dict( + (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) + ) + + text, kwargs = self.prepare_for_tokenization(text, **kwargs) + + if kwargs: + logger.warning(f"Keyword arguments {kwargs} not recognized.") + + # TODO: should this be in the base class? 
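+ # Lowercase the raw text while leaving the no-split / special tokens untouched,
+ # e.g. for an uncased BERT tokenizer "[CLS] Hello" becomes "[CLS] hello".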
+ if hasattr(self, "do_lower_case") and self.do_lower_case: + # convert non-special tokens to lowercase + escaped_special_toks = [ + re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) + ] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) + + no_split_token = set(self.unique_no_split_tokens) + tokens = self.tokens_trie.split(text) + # ["This is something", "", " else"] + for i, token in enumerate(tokens): + if token in no_split_token: + tok_extended = all_special_tokens_extended.get(token, None) + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + if isinstance(tok_extended, AddedToken): + if tok_extended.rstrip and right: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + tokens[i + 1] = right.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and left: + tokens[i - 1] = left.rstrip() # Opposite here + else: + # We strip left and right by default + if right: + tokens[i + 1] = right.lstrip() + if left: + tokens[i - 1] = left.rstrip() + # ["This is something", "", "else"] + tokenized_text = [] + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ + Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: + """ + Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + vocabulary. + + Args: + tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s). + + Returns: + :obj:`int` or :obj:`List[int]`: The token id or list of token ids. 
+ """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + if is_split_into_words: + raise ValueError( + f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`." + ) + else: + raise ValueError( + f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers." + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." 
+ ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if not isinstance(ids_or_pair_ids, (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + else: + ids, pair_ids = ids_or_pair_ids + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + batch_outputs = self._batch_prepare_for_model( + input_ids, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. 
It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + + batch_outputs = {} + for first_ids, second_ids in batch_ids_pairs: + outputs = self.prepare_for_model( + first_ids, + second_ids, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + def prepare_for_tokenization( + self, text: str, is_split_into_words: bool = False, **kwargs + ) -> Tuple[str, Dict[str, Any]]: + """ + Performs any necessary transformations before tokenization. + + This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the + :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used. + + Args: + text (:obj:`str`): + The text to prepare. + is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + kwargs: + Keyword arguments to use for the tokenization. + + Returns: + :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. + """ + return (text, kwargs) + + def get_special_tokens_mask( + self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids of the first sequence. + token_ids_1 (:obj:`List[int]`, `optional`): + List of ids of the second sequence. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." 
+ ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + @overload + def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: + ... + + @overload + def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: + ... + + def convert_ids_to_tokens( + self, ids: Union[int, List[int]], skip_special_tokens: bool = False + ) -> Union[str, List[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + + Args: + ids (:obj:`int` or :obj:`List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + + Returns: + :obj:`str` or :obj:`List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index: int) -> str: + raise NotImplementedError + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return " ".join(tokens) + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + spaces_between_special_tokens: bool = True, + **kwargs + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + if spaces_between_special_tokens: + text = " ".join(sub_texts) + else: + text = "".join(sub_texts) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text diff --git a/fastNLP/transformers/torch/tokenization_utils_base.py b/fastNLP/transformers/torch/tokenization_utils_base.py new file mode 100644 index 00000000..aebf4bb6 --- /dev/null +++ b/fastNLP/transformers/torch/tokenization_utils_base.py @@ -0,0 +1,3351 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user +fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary +of output with special method for the Fast tokenizers) +""" + +import copy +import json +import os +import re +from collections import OrderedDict, UserDict +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union + +import numpy as np +from packaging import version + +import requests + +from . import __version__ +from .file_utils import ( + ExplicitEnum, + PaddingStrategy, + TensorType, + _is_numpy, + _is_torch, + _is_torch_device, + add_end_docstrings, + cached_path, + is_offline_mode, + is_remote_url, + is_tokenizers_available, + to_py_obj, +) + +from fastNLP.envs.imports import _NEED_IMPORT_TORCH +from fastNLP.core.log import logger + +if _NEED_IMPORT_TORCH: + import torch + +if is_tokenizers_available(): + from tokenizers import AddedToken + from tokenizers import Encoding as EncodingFast +else: + + @dataclass(frozen=True, eq=True) + class AddedToken: + """ + AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the + way it should behave. + """ + + content: str = field(default_factory=str) + single_word: bool = False + lstrip: bool = False + rstrip: bool = False + normalized: bool = True + + def __getstate__(self): + return self.__dict__ + + @dataclass + class EncodingFast: + """This is dummy class because without the `tokenizers` library we don't have these objects anyway""" + + pass + +VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input +LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + + +# Slow tokenizers used to be saved in three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +FULL_TOKENIZER_FILE = "tokenizer.json" + + +class TruncationStrategy(ExplicitEnum): + """ + Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for + tab-completion in an IDE. + """ + + ONLY_FIRST = "only_first" + ONLY_SECOND = "only_second" + LONGEST_FIRST = "longest_first" + DO_NOT_TRUNCATE = "do_not_truncate" + + +class CharSpan(NamedTuple): + """ + Character span in the original string. + + Args: + start (:obj:`int`): Index of the first character in the original string. + end (:obj:`int`): Index of the character following the last character in the original string. 
+ """ + + start: int + end: int + + +class TokenSpan(NamedTuple): + """ + Token span in an encoded string (list of tokens). + + Args: + start (:obj:`int`): Index of the first token in the span. + end (:obj:`int`): Index of the token following the last token in the span. + """ + + start: int + end: int + + +class BatchEncoding(UserDict): + """ + Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens, + attention_masks, etc). + + This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes + utility methods to map from word/character space to token space. + + Args: + data (:obj:`dict`): + Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids', + 'attention_mask', etc.). + encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`): + If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character + space to token space the :obj:`tokenizers.Encoding` instance or list of instance (for batches) hold this + information. + tensor_type (:obj:`Union[None, str, TensorType]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above). + n_sequences (:obj:`Optional[int]`, `optional`): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__( + self, + data: Optional[Dict[str, Any]] = None, + encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, + tensor_type: Union[None, str, TensorType] = None, + prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, + ): + super().__init__(data) + + if isinstance(encoding, EncodingFast): + encoding = [encoding] + + self._encodings = encoding + + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + + @property + def n_sequences(self) -> Optional[int]: + """ + :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single + sentence) or :obj:`2` (a pair of sentences) + """ + return self._n_sequences + + @property + def is_fast(self) -> bool: + """ + :obj:`bool`: Indicate whether this :class:`~transformers.BatchEncoding` was generated from the result of a + :class:`~transformers.PreTrainedTokenizerFast` or not. + """ + return self._encodings is not None + + def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: + """ + If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask', + etc.). + + If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`. 
+ """ + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + else: + raise KeyError( + "Indexing with integers (to access backend Encoding for a given batch index) " + "is not available when using Python based tokenizers" + ) + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data, "encodings": self._encodings} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + if "encodings" in state: + self._encodings = state["encodings"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + # After this point: + # Extended properties and methods only available for fast (Rust-based) tokenizers + # provided by HuggingFace tokenizers library. + + @property + def encodings(self) -> Optional[List[EncodingFast]]: + """ + :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns + :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer. + """ + return self._encodings + + def tokens(self, batch_index: int = 0) -> List[str]: + """ + Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to + integer indices) at a given batch index (only works for the output of a fast tokenizer). + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[str]`: The list of tokens at that index. + """ + if not self._encodings: + raise ValueError("tokens() is not available when using Python-based tokenizers") + return self._encodings[batch_index].tokens + + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - :obj:`None` for special tokens added around or between sequences, + - :obj:`0` for tokens corresponding to words in the first sequence, + - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens + added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their + corresponding sequence. + """ + if not self._encodings: + raise ValueError("sequence_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].sequence_ids + + def words(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). 
+ """ + if not self._encodings: + raise ValueError("words() is not available when using Python-based tokenizers") + logger.warn( + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch. + + Returns: + :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by + the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding + word (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError("word_ids() is not available when using Python-based tokenizers") + return self._encodings[batch_index].word_ids + + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. In the general use case, this method returns + :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair + + Can be called as: + + - ``self.token_to_sequence(token_index)`` if batch size is 1 + - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. + + Can be called as: + + - ``self.token_to_word(token_index)`` if batch size is 1 + - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. 
+ token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the + sequence. + + Returns: + :obj:`int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_word() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_word(token_index) + + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: + """ + Get the encoded token span corresponding to a word in a sequence of the batch. + + Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with: + + - **start** -- Index of the first token. + - **end** -- Index of the token following the last token. + + Can be called as: + + - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1 + - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal + to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the word in the sequence. + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence. + Returns :obj:`None` if no tokens correspond to the word. + """ + + if not self._encodings: + raise ValueError("word_to_tokens() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if word_index < 0: + word_index = self._seq_len + word_index + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) + return TokenSpan(*span) if span is not None else None + + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> CharSpan: + """ + Get the character span corresponding to an encoded token in a sequence of the batch. + + Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with: + + - **start** -- Index of the first character in the original string associated to the token. + - **end** -- Index of the character following the last character in the original string associated to the + token. 
+ + Can be called as: + + - ``self.token_to_chars(token_index)`` if batch size is 1 + - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_token_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in + the sequence. + + Returns: + :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string. + """ + + if not self._encodings: + raise ValueError("token_to_chars() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + return CharSpan(*(self._encodings[batch_index].token_to_chars(token_index))) + + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: + """ + Get the index of the token in the encoded output comprising a character in the original string for a sequence + of the batch. + + Can be called as: + + - ``self.char_to_token(char_index)`` if batch size is 1 + - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + :obj:`int`: Index of the token. + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_token(char_index, sequence_index) + + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: + """ + Get the character span in the original string corresponding to given word in a sequence of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + + Can be called as: + + - ``self.word_to_chars(word_index)`` if batch size is 1 + - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1 + + Args: + batch_or_word_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + word_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the + sequence. 
+ sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string. + CharSpan are NamedTuple with: + + - start: index of the first character associated to the token in the original string + - end: index of the character following the last character associated to the token in the original + string + """ + + if not self._encodings: + raise ValueError("word_to_chars() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) + + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: + """ + Get the word in the original string corresponding to a character in the original string of a sequence of the + batch. + + Can be called as: + + - ``self.char_to_word(char_index)`` if batch size is 1 + - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (:obj:`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the character in the original string. + char_index (:obj:`int`, `optional`): + If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the + original string. + sequence_index (:obj:`int`, `optional`, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + :obj:`int` or :obj:`List[int]`: Index or indices of the associated encoded token(s). + """ + + if not self._encodings: + raise ValueError("char_to_word() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_word(char_index, sequence_index) + + def convert_to_tensors( + self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False + ): + """ + Convert the inner content to tensors. + + Args: + tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + The type of tensors to use. If :obj:`str`, should be one of the values of the enum + :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done. + prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`): + Whether or not to add the batch dimension during the conversion. 
+ """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.PYTORCH: + as_tensor = torch.tensor + is_tensor = torch.is_tensor + else: + as_tensor = np.asarray + is_tensor = _is_numpy + # (mfuntowicz: This code is unreachable) + # else: + # raise ImportError( + # f"Unable to convert output to tensors format {tensor_type}" + # ) + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + if not is_tensor(value): + tensor = as_tensor(value) + + # Removing this for now in favor of controlling the shape with `prepend_batch_axis` + # # at-least2d + # if tensor.ndim > 2: + # tensor = tensor.squeeze(0) + # elif tensor.ndim < 2: + # tensor = tensor[None, :] + + self[key] = tensor + except: # noqa E722 + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding " + "with 'padding=True' 'truncation=True' to have batched tensors with the same length." + ) + + return self + + # @torch_required + def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": + """ + Send all values to device by calling :obj:`v.to(device)` (PyTorch only). + + Args: + device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on. + + Returns: + :class:`~transformers.BatchEncoding`: The same instance after modification. + """ + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or _is_torch_device(device) or isinstance(device, int): + self.data = {k: v.to(device=device) for k, v in self.data.items()} + else: + logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") + return self + + +class SpecialTokensMixin: + """ + A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to + handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be + used to directly access these special tokens in a model-independent manner and allow to set and update the special + tokens. + + Args: + bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the beginning of a sentence. + eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the end of a sentence. + unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing an out-of-vocabulary token. + sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token separating two different sentences in the same input (used by BERT for instance). + pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. 
+ cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the class of the input (used by BERT for instance). + mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). + additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A tuple or a list of additional special tokens. + """ + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, verbose=True, **kwargs): + self._bos_token = None + self._eos_token = None + self._unk_token = None + self._sep_token = None + self._pad_token = None + self._cls_token = None + self._mask_token = None + self._pad_token_type_id = 0 + self._additional_special_tokens = [] + self.verbose = verbose + + # We directly set the hidden value to allow initialization with special tokens + # which are not yet in the vocabulary. Necessary for serialization/de-serialization + # TODO clean this up at some point (probably by switching to fast tokenizers) + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all( + isinstance(t, (str, AddedToken)) for t in value + ), "One of the tokens is not a string or an AddedToken" + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"special token {key} has to be either str or AddedToken but got: {type(value)}") + + def sanitize_special_tokens(self) -> int: + """ + Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`, + :obj:`tokenizer.cls_token`, etc.) are in the vocabulary. + + Add the missing ones to the vocabulary if needed. + + Return: + :obj:`int`: The number of tokens added in the vocabulary during the operation. + """ + return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) + + def add_special_tokens(self, special_tokens_dict: Dict[str, Union[str, AddedToken]]) -> int: + """ + Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If + special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the + current vocabulary). + + .. Note:: + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of + the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method. + + Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways: + + - Special tokens are carefully handled by the tokenizer (they are never split). + - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This + makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (for instance + :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj`'[CLS]'` and XLM's one + is also registered to be :obj:`''`). 
+ + Args: + special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`): + Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, + ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, + ``additional_special_tokens``]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer + assign the index of the ``unk_token`` to them). + + Returns: + :obj:`int`: Number of tokens added to the vocabulary. + + Examples:: + + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + + special_tokens_dict = {'cls_token': ''} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print('We have added', num_added_toks, 'tokens') + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + + assert tokenizer.cls_token == '' + """ + if not special_tokens_dict: + return 0 + + added_tokens = 0 + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" + + if self.verbose: + logger.info(f"Assigning {value} to the {key} key of the tokenizer") + setattr(self, key, value) + + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all( + isinstance(t, (str, AddedToken)) for t in value + ), f"Tokens {value} for key {key} should all be str or AddedToken instances" + added_tokens += self.add_tokens(value, special_tokens=True) + else: + assert isinstance( + value, (str, AddedToken) + ), f"Token {value} for key {key} should be a str or an AddedToken instance" + added_tokens += self.add_tokens([value], special_tokens=True) + + return added_tokens + + def add_tokens( + self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False + ) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. + + .. Note:: + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of + the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method. + + Args: + new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`): + Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a + string token to let you personalize its behavior: whether this token should only match against a single + word, whether this token should strip all potential whitespaces on the left side, whether this token + should strip all potential whitespaces on the right side, etc. + special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Can be used to specify if the token is a special token. This mostly change the normalization behavior + (special tokens like CLS or [MASK] are usually not lower-cased for instance). + + See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library. + + Returns: + :obj:`int`: Number of tokens added to the vocabulary. 
+ + Examples:: + + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + + num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) + print('We have added', num_added_toks, 'tokens') + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + """ + if not new_tokens: + return 0 + + if not isinstance(new_tokens, (list, tuple)): + new_tokens = [new_tokens] + + return self._add_tokens(new_tokens, special_tokens=special_tokens) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + raise NotImplementedError + + @property + def bos_token(self) -> str: + """ + :obj:`str`: Beginning of sentence token. Log an error if used while not having been set. + """ + if self._bos_token is None and self.verbose: + logger.error("Using bos_token, but it is not set yet.") + return None + return str(self._bos_token) + + @property + def eos_token(self) -> str: + """ + :obj:`str`: End of sentence token. Log an error if used while not having been set. + """ + if self._eos_token is None and self.verbose: + logger.error("Using eos_token, but it is not set yet.") + return None + return str(self._eos_token) + + @property + def unk_token(self) -> str: + """ + :obj:`str`: Unknown token. Log an error if used while not having been set. + """ + if self._unk_token is None and self.verbose: + logger.error("Using unk_token, but it is not set yet.") + return None + return str(self._unk_token) + + @property + def sep_token(self) -> str: + """ + :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while + not having been set. + """ + if self._sep_token is None and self.verbose: + logger.error("Using sep_token, but it is not set yet.") + return None + return str(self._sep_token) + + @property + def pad_token(self) -> str: + """ + :obj:`str`: Padding token. Log an error if used while not having been set. + """ + if self._pad_token is None and self.verbose: + logger.error("Using pad_token, but it is not set yet.") + return None + return str(self._pad_token) + + @property + def cls_token(self) -> str: + """ + :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the + full depth of the model. Log an error if used while not having been set. + """ + if self._cls_token is None and self.verbose: + logger.error("Using cls_token, but it is not set yet.") + return None + return str(self._cls_token) + + @property + def mask_token(self) -> str: + """ + :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while + not having been set. + """ + if self._mask_token is None and self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @property + def additional_special_tokens(self) -> List[str]: + """ + :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having + been set. 
+ """ + if self._additional_special_tokens is None and self.verbose: + logger.error("Using additional_special_tokens, but it is not set yet.") + return None + return [str(tok) for tok in self._additional_special_tokens] + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token + has not been set. + """ + if self._bos_token is None: + return None + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has + not been set. + """ + if self._eos_token is None: + return None + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been + set. + """ + if self._unk_token is None: + return None + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input + sequence. Returns :obj:`None` if the token has not been set. + """ + if self._sep_token is None: + return None + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been + set. + """ + if self._pad_token is None: + return None + return self.convert_tokens_to_ids(self.pad_token) + + @property + def pad_token_type_id(self) -> int: + """ + :obj:`int`: Id of the padding token type in the vocabulary. + """ + return self._pad_token_type_id + + @property + def cls_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input + sequence leveraging self-attention along the full depth of the model. + + Returns :obj:`None` if the token has not been set. + """ + if self._cls_token is None: + return None + return self.convert_tokens_to_ids(self.cls_token) + + @property + def mask_token_id(self) -> Optional[int]: + """ + :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language + modeling. Returns :obj:`None` if the token has not been set. + """ + if self._mask_token is None: + return None + return self.convert_tokens_to_ids(self.mask_token) + + @property + def additional_special_tokens_ids(self) -> List[int]: + """ + :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not + having been set. 
+ """ + return self.convert_tokens_to_ids(self.additional_special_tokens) + + @bos_token_id.setter + def bos_token_id(self, value): + self._bos_token = self.convert_tokens_to_ids(value) + + @eos_token_id.setter + def eos_token_id(self, value): + self._eos_token = self.convert_tokens_to_ids(value) + + @unk_token_id.setter + def unk_token_id(self, value): + self._unk_token = self.convert_tokens_to_ids(value) + + @sep_token_id.setter + def sep_token_id(self, value): + self._sep_token = self.convert_tokens_to_ids(value) + + @pad_token_id.setter + def pad_token_id(self, value): + self._pad_token = self.convert_tokens_to_ids(value) + + @cls_token_id.setter + def cls_token_id(self, value): + self._cls_token = self.convert_tokens_to_ids(value) + + @mask_token_id.setter + def mask_token_id(self, value): + self._mask_token = self.convert_tokens_to_ids(value) + + @additional_special_tokens_ids.setter + def additional_special_tokens_ids(self, values): + self._additional_special_tokens = [self.convert_tokens_to_ids(value) for value in values] + + @property + def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: + """ + :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`, + :obj:`unk_token`, etc.) to their values (:obj:`''`, :obj:`''`, etc.). + + Convert potential tokens of :obj:`tokenizers.AddedToken` type to string. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = ( + type(attr_value)(str(attr_value_sub) for attr_value_sub in attr_value) + if isinstance(attr_value, (list, tuple)) + else str(attr_value) + ) + return set_attr + + @property + def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]: + """ + :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary + mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values + (:obj:`''`, :obj:`''`, etc.). + + Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely + how special tokens are tokenized. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self) -> List[str]: + """ + :obj:`List[str]`: All the special tokens (:obj:`''`, :obj:`''`, etc.) mapped to class attributes. + + Convert tokens of :obj:`tokenizers.AddedToken` type to string. + """ + all_toks = [str(s) for s in self.all_special_tokens_extended] + return all_toks + + @property + def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: + """ + :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`''`, :obj:`''`, etc.) + mapped to class attributes. + + Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely + how special tokens are tokenized. + """ + all_toks = [] + set_attr = self.special_tokens_map_extended + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(OrderedDict.fromkeys(all_toks)) + return all_toks + + @property + def all_special_ids(self) -> List[int]: + """ + :obj:`List[int]`: List the ids of the special tokens(:obj:`''`, :obj:`''`, etc.) mapped to class + attributes. 
+ """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + +ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to encode the sequences with the special tokens relative to their model. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + max_length (:obj:`int`, `optional`): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum + length is required by one of the truncation/padding parameters. If the model has no specific maximum + input length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (:obj:`int`, `optional`, defaults to 0): + If set to a number along with :obj:`max_length`, the overflowing tokens returned when + :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. 
+ pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. +""" + +ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + return_token_type_ids (:obj:`bool`, `optional`): + Whether to return token type IDs. If left to the default, will return the token type IDs according to + the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is + raised instead of returning overflowing tokens. + return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return :obj:`(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from + :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise + :obj:`NotImplementedError`. + return_length (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to return the lengths of the encoded inputs. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. + **kwargs: passed to the :obj:`self.tokenize()` method + + Return: + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + `What are input IDs? <../glossary.html#input-ids>`__ + + - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True` + or if `"token_type_ids"` is in :obj:`self.model_input_names`). + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`). + + `What are attention masks? <../glossary.html#attention-mask>`__ + + - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and + :obj:`return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and + :obj:`return_overflowing_tokens=True`). 
+ - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when :obj:`return_length=True`) +""" + +INIT_TOKENIZER_DOCSTRING = r""" + Class attributes (overridden by derived classes) + + - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of + each vocabulary file required by the model, and as associated values, the filename for saving the associated + file (string). + - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the + high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the + low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the + :obj:`url` to the associated pretrained vocabulary file. + - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the + :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence + inputs of this model, or :obj:`None` if the model has no maximum input size. + - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the + :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments + to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the + tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained` + method. + - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model. + - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding + applied. Should be :obj:`'right'` or :obj:`'left'`. + + Args: + model_max_length (:obj:`int`, `optional`): + The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is + loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this + will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no + value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`). + padding_side: (:obj:`str`, `optional`): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + model_input_names (:obj:`List[string]`, `optional`): + The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or + :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name. + bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and + ``self.bos_token_id``. + eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and + ``self.eos_token_id``. + unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and + ``self.unk_token_id``. 
+ sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token separating two different sentences in the same input (used by BERT for instance). Will be + associated to ``self.sep_token`` and ``self.sep_token_id``. + pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and + ``self.pad_token_id``. + cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing the class of the input (used by BERT for instance). Will be associated to + ``self.cls_token`` and ``self.cls_token_id``. + mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``. + additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`): + A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the + tokenization process. Will be associated to ``self.additional_special_tokens`` and + ``self.additional_special_tokens_ids``. +""" + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizerBase(SpecialTokensMixin): + """ + Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`. + + Handles shared (mostly boiler plate) methods for those two classes. + """ + + vocab_files_names: Dict[str, str] = {} + pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} + pretrained_init_configuration: Dict[str, Dict[str, Any]] = {} + max_model_input_sizes: Dict[str, Optional[int]] = {} + + # first name has to correspond to main model input name + # to make sure `tokenizer.pad(...)` works correctly + model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"] + padding_side: str = "right" + slow_tokenizer_class = None + + def __init__(self, **kwargs): + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + self.init_kwargs = copy.deepcopy(kwargs) + self.name_or_path = kwargs.pop("name_or_path", "") + + # For backward compatibility we fallback to set model_max_length from max_len if provided + model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) + self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER + + # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + assert self.padding_side in [ + "right", + "left", + ], f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + self.deprecation_warnings = ( + {} + ) # Use to store when we have already noticed a deprecation warning (avoid overlogging). + + super().__init__(**kwargs) + + @property + def max_len_single_sentence(self) -> int: + """ + :obj:`int`: The maximum length of a sentence that can be fed to the model. 
+ """ + return self.model_max_length - self.num_special_tokens_to_add(pair=False) + + @property + def max_len_sentences_pair(self) -> int: + """ + :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model. + """ + return self.model_max_length - self.num_special_tokens_to_add(pair=True) + + @max_len_single_sentence.setter + def max_len_single_sentence(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_single_sentence'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: + if not self.deprecation_warnings.get("max_len_single_sentence", False): + logger.warning( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + self.deprecation_warnings["max_len_single_sentence"] = True + else: + raise ValueError( + "Setting 'max_len_single_sentence' is now deprecated. " "This value is automatically set up." + ) + + @max_len_sentences_pair.setter + def max_len_sentences_pair(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_sentences_pair'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: + if not self.deprecation_warnings.get("max_len_sentences_pair", False): + logger.warning( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + self.deprecation_warnings["max_len_sentences_pair"] = True + else: + raise ValueError( + "Setting 'max_len_sentences_pair' is now deprecated. " "This value is automatically set up." + ) + + def __repr__(self) -> str: + return ( + f"{'PreTrainedTokenizerFast' if self.is_fast else 'PreTrainedTokenizer'}(name_or_path='{self.name_or_path}', " + f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, is_fast={self.is_fast}, " + f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})" + ) + + def get_vocab(self) -> Dict[str, int]: + """ + Returns the vocabulary as a dictionary of token to index. + + :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when + :obj:`token` is in the vocab. + + Returns: + :obj:`Dict[str, int]`: The vocabulary. + """ + raise NotImplementedError() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): + r""" + Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from + a predefined tokenizer. + + Args: + pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + Can be either: + + - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co. + Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a + user or organization name, like ``dbmdz/bert-base-german-cased``. + - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved + using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained` + method, e.g., ``./my_model_directory/``. + - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary + file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g., + ``./my_model_directory/vocab.txt``. 
+ cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): + Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the + standard cache should not be used. + force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to force the (re-)download of the vocabulary files and override the cached versions if they + exist. + resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to delete incompletely received files. Attempt to resume the download if such a file + exists. + proxies (:obj:`Dict[str, str]`, `optional`): + A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + use_auth_token (:obj:`str` or `bool`, `optional`): + The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token + generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). + local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to only rely on local files and not to attempt to download any files. + revision (:obj:`str`, `optional`, defaults to :obj:`"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any + identifier allowed by git. + subfolder (:obj:`str`, `optional`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + inputs (additional positional arguments, `optional`): + Will be passed along to the Tokenizer ``__init__`` method. + kwargs (additional keyword arguments, `optional`): + Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like + ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, + ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details. + + .. note:: + + Passing :obj:`use_auth_token=True` is required when you want to use a private model. + + Examples:: + + # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer + # Download vocabulary from huggingface.co and cache. + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from huggingface.co (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>') + # You should be sure '<unk>' is in the vocabulary when doing that.
+ # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead) + assert tokenizer.unk_token == '<unk>' + + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", False) + proxies = kwargs.pop("proxies", None) + local_files_only = kwargs.pop("local_files_only", False) + use_auth_token = kwargs.pop("use_auth_token", None) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + vocab_files = {} + init_configuration = {} + + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1: + raise ValueError( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " + "supported for this tokenizer. Use a model identifier or the path to a directory instead." + ) + logger.warning( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " + "won't be possible anymore in v5. Use a model identifier or the path to a directory instead." + ) + file_id = list(cls.vocab_files_names.keys())[0] + vocab_files[file_id] = pretrained_model_name_or_path + else: + raise RuntimeError("At this point pretrained_model_name_or_path is either a directory or a model identifier name, " + "which is not supported in fastNLP now.") + + # Get files from url, cache, or disk depending on the case + resolved_vocab_files = {} + unresolved_files = [] + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + try: + resolved_vocab_files[file_id] = cached_path( + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + use_auth_token=use_auth_token, + user_agent=user_agent, + ) + + except FileNotFoundError as error: + if local_files_only: + unresolved_files.append(file_id) + else: + raise error + + except requests.exceptions.HTTPError as err: + if "404 Client Error" in str(err): + logger.debug(err) + resolved_vocab_files[file_id] = None + else: + raise err + + if len(unresolved_files) > 0: + logger.info( + f"Can't load following files from cache: {unresolved_files} and cannot check if these " + "files are necessary for the tokenizer to operate." + ) + + if all(full_file_name is None for full_file_name in resolved_vocab_files.values()): + msg = ( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'. 
Make sure that:\n\n" + f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" + f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing relevant tokenizer files\n\n" + ) + + if revision is not None: + msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" + + raise EnvironmentError(msg) + + for file_id, file_path in vocab_files.items(): + if file_id not in resolved_vocab_files: + continue + + if file_path == resolved_vocab_files[file_id]: + logger.info(f"loading file {file_path}") + else: + logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") + + return cls._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + use_auth_token=use_auth_token, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + use_auth_token=None, + **kwargs + ): + # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json + # file or if `from_slow` is set to True. + from_slow = kwargs.get("from_slow", False) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: + slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( + copy.deepcopy(resolved_vocab_files), + pretrained_model_name_or_path, + copy.deepcopy(init_configuration), + *init_inputs, + **(copy.deepcopy(kwargs)), + ) + else: + slow_tokenizer = None + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. + config_tokenizer_class = init_kwargs.get("tokenizer_class") + init_kwargs.pop("tokenizer_class", None) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + config_tokenizer_class = None + init_kwargs = init_configuration + + if config_tokenizer_class is None: + from .models.auto.configuration_auto import AutoConfig # tests_ignore + + # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. + try: + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, use_auth_token=use_auth_token) + config_tokenizer_class = config.tokenizer_class + except (OSError, ValueError, KeyError): + # skip if an error occurred. + config = None + if config_tokenizer_class is None: + # Third attempt. If we have not yet found the original type of the tokenizer, + # we are loading we see if we can infer it from the type of the configuration file + from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES # tests_ignore + + if hasattr(config, "model_type"): + model_type = config.model_type + else: + # Fallback: use pattern matching on the string. 
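+ # The first key of TOKENIZER_MAPPING_NAMES found as a substring of the given name or path wins, + # e.g. a local directory called './my-bert-checkpoint' (an illustrative path) would select the BERT tokenizer class.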
+ model_type = None + for pattern in TOKENIZER_MAPPING_NAMES.keys(): + if pattern in str(pretrained_model_name_or_path): + model_type = pattern + break + + if model_type is not None: + config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( + model_type, (None, None) + ) + if config_tokenizer_class is None: + config_tokenizer_class = config_tokenizer_class_fast + + if config_tokenizer_class is not None: + if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""): + logger.warning( + "The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. " + "It may result in unexpected tokenization. \n" + f"The tokenizer class you load from this checkpoint is '{config_tokenizer_class}'. \n" + f"The class this function is called from is '{cls.__name__}'." + ) + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Convert AddedTokens serialized as dict to class instances + def convert_added_tokens(obj: Union[AddedToken, Any]): + if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": + obj.pop("__type") + return AddedToken(**obj) + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v) for k, v in obj.items()} + return obj + + init_kwargs = convert_added_tokens(init_kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + model_max_length = cls.max_model_input_sizes[pretrained_model_name_or_path] + if model_max_length is not None and isinstance(model_max_length, (int, float)): + init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + + if slow_tokenizer is not None: + init_kwargs["__slow_tokenizer"] = slow_tokenizer + + init_kwargs["name_or_path"] = pretrained_model_name_or_path + + # Instantiate tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." 
+ ) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + # Removed: Now done at the base class level + # tokenizer.init_inputs = init_inputs + # tokenizer.init_kwargs = init_kwargs + + # If there is a complementary special token map, load it + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + + continue + + if isinstance(value, dict): + value = AddedToken(**value) + elif isinstance(value, list): + value = [AddedToken(**token) if isinstance(token, dict) else token for token in value] + setattr(tokenizer, key, value) + + # Add supplementary tokens. + special_tokens = tokenizer.all_special_tokens + if added_tokens_file is not None: + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + + # Sort added tokens by index + added_tok_encoder_sorted = list(sorted(added_tok_encoder.items(), key=lambda x: x[1])) + + for token, index in added_tok_encoder_sorted: + if has_tokenizer_file and index != len(tokenizer) and tokenizer.convert_tokens_to_ids(token) != index: + # Tokenizer fast: added token needs to either be in the vocabulary with the proper index or the + # index is the current length of the tokenizer (not in vocabulary) + raise ValueError( + f"Wrong index found for {token}: should be {tokenizer.convert_tokens_to_ids(token)} but found " + f"{index}." + ) + elif not has_tokenizer_file and index != len(tokenizer): + # Tokenizer slow: added token cannot already be in the vocabulary so its index needs to be the + # current length of the tokenizer. + raise ValueError( + f"Non-consecutive added token '{token}' found. " + f"Should have index {len(tokenizer)} but has index {index} in saved vocabulary." + ) + + # Safe to call on a tokenizer fast even if token already there. + tokenizer.add_tokens(token, special_tokens=bool(token in special_tokens)) + + # Check all our special tokens are registered as "no split" token (we don't cut them) and are in the vocab + added_tokens = tokenizer.sanitize_special_tokens() + if added_tokens: + logger.warning( + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained." + ) + + return tokenizer + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + **kwargs, + ) -> Tuple[str]: + """ + Save the full tokenizer state. + + + This method make sure the full tokenizer can then be re-loaded using the + :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method.. + + .. Warning:: + This won't save modifications you may have applied to the tokenizer after the instantiation (for instance, + modifying :obj:`tokenizer.do_lower_case` after creation). + + Args: + save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved. + legacy_format (:obj:`bool`, `optional`): + Only applicable for a fast tokenizer. 
If unset (default), will save the tokenizer in the unified JSON + format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate + added_tokens files. + + If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible + with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to + be loaded in the corresponding "slow" tokenizer. + + If :obj:`True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exits, a + value error is raised. + filename_prefix: (:obj:`str`, `optional`): + A prefix to add to the names of the files saved by the tokenizer. + push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to push your model to the Hugging Face model hub after saving it. + + .. warning:: + + Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with + :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are + pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory + instead. + + Returns: + A tuple of :obj:`str`: The files saved. + """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + special_tokens_map_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE + ) + tokenizer_config_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE + ) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + # Sanitize AddedTokens + def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True): + if isinstance(obj, AddedToken): + out = obj.__getstate__() + if add_type_field: + out["__type"] = "AddedToken" + return out + elif isinstance(obj, (list, tuple)): + return list(convert_added_tokens(o, add_type_field=add_type_field) for o in obj) + elif isinstance(obj, dict): + return {k: convert_added_tokens(v, add_type_field=add_type_field) for k, v in obj.items()} + return obj + + # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization + tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True) + + # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained + tokenizer_class = self.__class__.__name__ + # Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast` + if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast": + tokenizer_class = tokenizer_class[:-4] + tokenizer_config["tokenizer_class"] = tokenizer_class + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + logger.info(f"tokenizer config file saved in {tokenizer_config_file}") + + # Sanitize AddedTokens in special_tokens_map + write_dict = convert_added_tokens(self.special_tokens_map_extended, add_type_field=False) + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + f.write(json.dumps(write_dict, ensure_ascii=False)) + logger.info(f"Special tokens file saved in {special_tokens_map_file}") + + file_names = 
(tokenizer_config_file, special_tokens_map_file) + + save_files = self._save_pretrained( + save_directory=save_directory, + file_names=file_names, + legacy_format=legacy_format, + filename_prefix=filename_prefix, + ) + + return save_files + + def _save_pretrained( + self, + save_directory: Union[str, os.PathLike], + file_names: Tuple[str], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """ + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. + + Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the + specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained` + """ + if legacy_format is False: + raise ValueError( + "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format." + ) + + save_directory = str(save_directory) + + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + added_vocab = self.get_added_vocab() + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, ensure_ascii=False) + f.write(out_str) + logger.info(f"added tokens file saved in {added_tokens_file}") + + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + + return file_names + vocab_files + (added_tokens_file,) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary + added tokens). + + This method won't save the configuration and special token mappings of the tokenizer. Use + :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer. + + Args: + save_directory (:obj:`str`): + The directory in which to save the vocabulary. + filename_prefix (:obj:`str`, `optional`): + An optional prefix to add to the named of the saved files. + + Returns: + :obj:`Tuple(str)`: Paths to the files saved. + """ + raise NotImplementedError + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + """ + Converts a string in a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`. + + Args: + text (:obj:`str`): + The sequence to be encoded. + pair (:obj:`str`, `optional`): + A second sequence to be encoded with the first. + add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to add the special tokens associated with the corresponding model. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific encode method. See details in + :meth:`~transformers.PreTrainedTokenizerBase.__call__` + + Returns: + :obj:`List[str]`: The list of tokens. + """ + raise NotImplementedError + + @add_end_docstrings( + ENCODE_KWARGS_DOCSTRING, + """ + **kwargs: Passed along to the `.tokenize()` method. + """, + """ + Returns: + :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the + text. 
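+ + A minimal usage sketch (an illustrative example, assuming a BERT vocabulary saved locally under ``./test/saved_model/`` as in the ``from_pretrained`` examples above):: + + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + ids = tokenizer.encode('Hello world', add_special_tokens=True) + # ids is a plain List[int]; pass return_tensors='pt' to get a torch.Tensor instead + tokens = tokenizer.convert_ids_to_tokens(ids)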
+ """, + ) + def encode( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> List[int]: + """ + Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` + method). + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the ``tokenize`` method) or a list of integers (tokenized string ids using the + ``convert_tokens_to_ids`` method). + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + raise NotImplementedError + + def _get_padding_truncation_strategies( + self, padding=False, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs + ): + """ + Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy + and pad_to_max_length) and behaviors. + """ + old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") + old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) + + # Backward compatibility for previous behavior, maybe we should deprecate it: + # If you only set max_length, it activates truncation for max_length + if max_length is not None and padding is False and truncation is False: + if verbose: + if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False): + logger.warning( + "Truncation was not explicitly activated but `max_length` is provided a specific value, " + "please use `truncation=True` to explicitly truncate examples to max length. " + "Defaulting to 'longest_first' truncation strategy. " + "If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy " + "more precisely by providing a specific strategy to `truncation`." + ) + self.deprecation_warnings["Truncation-not-explicitly-activated"] = True + truncation = "longest_first" + + # Get padding strategy + if padding is False and old_pad_to_max_length: + if verbose: + logger.warn( + "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " + "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " + "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " + "maximal input size of the model (e.g. 
512 for Bert).", + FutureWarning, + ) + if max_length is None: + padding_strategy = PaddingStrategy.LONGEST + else: + padding_strategy = PaddingStrategy.MAX_LENGTH + elif padding is not False: + if padding is True: + if verbose: + if max_length is not None and (truncation is False or truncation == "do_not_truncate"): + logger.warn( + "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " + "To pad to max length, use `padding='max_length'`." + ) + if old_pad_to_max_length is not False: + logger.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.") + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is False and old_truncation_strategy != "do_not_truncate": + if verbose: + logger.warn( + "The `truncation_strategy` argument is deprecated and will be removed in a future version, " + "use `truncation=True` to truncate examples to a max length. You can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the " + "maximal input size of the model (e.g. 512 for Bert). " + " If you have pairs of inputs, you can give a specific truncation strategy selected among " + "`truncation='only_first'` (will only truncate the first sentence in the pairs) " + "`truncation='only_second'` (will only truncate the second sentence in the pairs) " + "or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence in the pairs).", + FutureWarning, + ) + truncation_strategy = TruncationStrategy(old_truncation_strategy) + elif truncation is not False: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + elif isinstance(truncation, TruncationStrategy): + truncation_strategy = truncation + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False): + logger.warning( + "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no padding." + ) + self.deprecation_warnings["Asking-to-pad-to-max_length"] = True + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False): + logger.warning( + "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. " + "Default to no truncation." 
+ ) + self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." + ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + f"Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." + ) + + return padding_strategy, truncation_strategy, max_length, kwargs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences. + + Args: + text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + """ + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... 
list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if not _is_valid_text_input(text): + raise ValueError( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None and not _is_valid_text_input(text_pair): + raise ValueError( + "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if is_split_into_words: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) + + if is_batched: + if isinstance(text_pair, str): + raise TypeError( + "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`." + ) + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`: {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + """ + Tokenize and prepare for the model a sequence or a pair of sequences. + + .. 
warning:: + This method is deprecated, ``__call__`` should be used instead. + + Args: + text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids`` + method). + text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the ``tokenize`` method) or a list of integers (tokenized string ids using the + ``convert_tokens_to_ids`` method). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + 
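A minimal usage sketch of the dispatch described above, assuming the ported `BertTokenizer` (hypothetical import path) keeps the upstream interface:

    from fastNLP.transformers.torch.models.bert import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    # __call__ resolves the padding/truncation strategies, then dispatches to
    # encode_plus (single example) or batch_encode_plus (list of examples).
    pair = tokenizer("A short premise.", "Its hypothesis.",
                     padding="max_length", truncation=True, max_length=16)
    batch = tokenizer(["first text", "a longer second text"], padding=True)
    # len(pair["input_ids"]) == 16; batch["input_ids"] holds two id lists padded to equal length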
""" + Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. + + .. warning:: + This method is deprecated, ``__call__`` should be used instead. + + Args: + batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`): + Batch of sequences or pair of sequences to be encoded. This can be a list of + string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see + details in ``encode_plus``). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs + ) -> BatchEncoding: + raise NotImplementedError + + def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``, + ``self.pad_token_id`` and ``self.pad_token_type_id``) + + .. 
note:: + + If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with ``return_tensors``. In the + case of PyTorch tensors, you will lose the specific device of your tensors however. + + Args: + encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str, + List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str, + List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as + well as in a PyTorch Dataloader collate function. + + Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), + see the note above for the return type. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). + max_length (:obj:`int`, `optional`): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (:obj:`int`, `optional`): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + >= 7.5 (Volta). + return_attention_mask (:obj:`bool`, `optional`): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. + + `What are attention masks? <../glossary.html#attention-mask>`__ + return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to print more information and warnings. 
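A minimal sketch of the behaviour documented above, assuming `tokenizer` is an instantiated subclass with a pad token and PyTorch available:

    features = [{"input_ids": [101, 7592, 102]},
                {"input_ids": [101, 7592, 2088, 999, 102]}]
    batch = tokenizer.pad(features, padding=True, return_tensors="pt")
    # batch["input_ids"] is a 2 x 5 tensor; batch["attention_mask"] is 0 on the padded positions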
+ """ + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + # The model's main input name, usually `input_ids`, has be passed for padding + if self.model_input_names[0] not in encoded_inputs: + raise ValueError( + "You should supply an encoding or a list of encodings to this method " + f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + ) + + required_input = encoded_inputs[self.model_input_names[0]] + + if not required_input: + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + index = 0 + while len(required_input[index]) == 0: + index += 1 + if index < len(required_input): + first_element = required_input[index][0] + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. + if not isinstance(first_element, (int, list, tuple)): + if _is_torch(first_element): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + f"Should be one of a python, numpy, pytorch or tensorflow object." + ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + # Convert padding_strategy in PaddingStrategy + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + required_input = encoded_inputs[self.model_input_names[0]] + if required_input and not isinstance(required_input[0], (list, tuple)): + encoded_inputs = self._pad( + encoded_inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all( + len(v) == batch_size for v in encoded_inputs.values() + ), "Some items in the output dictionary have a different batch size than others." 
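The list-of-dicts conversion above is what enables dynamic, per-batch padding; a hedged sketch, assuming PyTorch and an instantiated `tokenizer`:

    from torch.utils.data import DataLoader

    texts = ["a short sentence", "a noticeably longer second sentence for this batch"]
    encodings = [tokenizer(t, truncation=True) for t in texts]   # no padding yet
    loader = DataLoader(
        encodings,
        batch_size=2,
        collate_fn=lambda samples: tokenizer.pad(samples, padding=True, return_tensors="pt"),
    )
    padded = next(iter(loader))   # tensors padded only to the longest sequence of this batch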
+ + if padding_strategy == PaddingStrategy.LONGEST: + max_length = max(len(inputs) for inputs in required_input) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = dict((k, v[i]) for k, v in encoded_inputs.items()) + outputs = self._pad( + inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create the token type IDs corresponding to the sequences passed. `What are token type IDs? + <../glossary.html#token-type-ids>`__ + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + token_ids_0 (:obj:`List[int]`): The first tokenized sequence. + token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence. + + Returns: + :obj:`List[int]`: The token type ids. + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + This implementation does not add special tokens and this method should be overridden in a subclass. + + Args: + token_ids_0 (:obj:`List[int]`): The first tokenized sequence. + token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence. + + Returns: + :obj:`List[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = False, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for `pair_ids` + different than `None` and `truncation_strategy = longest_first` or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. + + Args: + ids (:obj:`List[int]`): + Tokenized input ids of the first sequence. 
Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + pair_ids (:obj:`List[int]`, `optional`): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + 
encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (:obj:`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + pair_ids (:obj:`List[int]`, `optional`): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize`` + and ``convert_tokens_to_ids`` methods. + num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`): + The strategy to follow for truncation. Can be: + + * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + stride (:obj:`int`, `optional`, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the + list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing_tokens if + a pair of sequences (or a batch of pairs) is provided. + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( + truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None + ): + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + else: + error_msg = ( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. 
" + ) + if truncation_strategy == TruncationStrategy.ONLY_FIRST: + error_msg = ( + error_msg + "Please select another truncation strategy than " + f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." + ) + logger.error(error_msg) + elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: + logger.warning( + f"Be aware, overflowing tokens are not returned for the setting you have chosen," + f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " + f"truncation strategy. So the returned list will always be empty even if some " + f"tokens have been removed." + ) + for _ in range(num_tokens_to_remove): + if pair_ids is None or len(ids) > len(pair_ids): + ids = ids[:-1] + else: + pair_ids = pair_ids[:-1] + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input" + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + f"for instance 'longest_first' or 'only_first'." + ) + + return (ids, pair_ids, overflowing_tokens) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + >= 7.5 (Volta). 
+ return_attention_mask: (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + if needs_to_be_padded: + difference = max_length - len(required_input) + if self.padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif self.padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input) + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) + elif return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + return encoded_inputs + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a sequence of tokens in a single string. The most simple way to do it is ``" ".join(tokens)`` but we + often want to remove sub-word tokenization artifacts at the same time. + + Args: + tokens (:obj:`List[str]`): The token to join in a string. + + Returns: + :obj:`str`: The joined tokens. + """ + raise NotImplementedError + + def batch_decode( + self, + sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> List[str]: + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the ``__call__`` method. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific decode method. + + Returns: + :obj:`List[str]`: The list of decoded sentences. 
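A round trip through encode/decode, for illustration (the exact ids and special tokens depend on the concrete tokenizer):

    ids = tokenizer("hello world")["input_ids"]
    tokenizer.decode(ids)                                        # e.g. "[CLS] hello world [SEP]"
    tokenizer.decode(ids, skip_special_tokens=True)              # "hello world"
    tokenizer.batch_decode([ids, ids], skip_special_tokens=True) # ["hello world", "hello world"]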
+ """ + return [ + self.decode( + seq, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + for seq in sequences + ] + + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. + + Args: + token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the ``__call__`` method. + skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to clean up the tokenization spaces. + kwargs (additional keyword arguments, `optional`): + Will be passed to the underlying model specific decode method. + + Returns: + :obj:`str`: The decoded sentence. + """ + # Convert inputs to python lists + token_ids = to_py_obj(token_ids) + + return self._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: bool = True, + **kwargs + ) -> str: + raise NotImplementedError + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0 (:obj:`List[int]`): + List of ids of the first sequence. + token_ids_1 (:obj:`List[int]`, `optional`): + List of ids of the second sequence. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + assert already_has_special_tokens and token_ids_1 is None, ( + "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " + "Please use a slow (full python) tokenizer to activate this argument." + "Or set `return_special_tokens_mask=True` when calling the encoding method " + "to get the special tokens mask in any tokenizer. " + ) + + all_special_ids = self.all_special_ids # cache the property + + special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] + + return special_tokens_mask + + @staticmethod + def clean_up_tokenization(out_string: str) -> str: + """ + Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. + + Args: + out_string (:obj:`str`): The text to clean up. + + Returns: + :obj:`str`: The cleaned-up string. 
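Tracing the replacements below on a sample string, for illustration:

    tokenizer.clean_up_tokenization("Do n't worry , it 's fine .")
    # -> "Don't worry, it's fine."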
+ """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): + """ + Depending on the input and internal state we might trigger a warning about a sequence that is too long for its + corresponding model + + Args: + ids (:obj:`List[str]`): The ids produced by the tokenization + max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set) + verbose (:obj:`bool`): Whether or not to print more information and warnings. + + """ + if max_length is None and len(ids) > self.model_max_length and verbose: + if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False): + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " + "will result in indexing errors" + ) + self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + yield + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: str = None, + truncation: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare model inputs for translation. For best performance, translate one sentence at a time. + + Arguments: + src_texts (:obj:`List[str]`): + List of documents to summarize or source language texts. + tgt_texts (:obj:`list`, `optional`): + List of summaries or target language texts. + max_length (:obj:`int`, `optional`): + Controls the maximum length for encoder inputs (documents to summarize or source language texts) If + left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + max_target_length (:obj:`int`, `optional`): + Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set + to :obj:`None`, this will use the max_length value. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a + single sequence if provided). + * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of + different lengths). 
+ return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`): + If set, will return tensors instead of list of python integers. Acceptable values are: + + * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects. + * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects. + * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects. + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument + :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not + provided. This will truncate token by token, removing a token from the longest sequence in the pair + if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to + the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with + sequence lengths greater than the model maximum admissible input size). + **kwargs: + Additional keyword arguments passed along to :obj:`self.__call__`. + + Return: + :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields: + + - **input_ids** -- List of token ids to be fed to the encoder. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. + - **labels** -- List of token ids for tgt_texts. + + The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed. + Otherwise, input_ids, attention_mask will be the only keys. + """ + # docstyle-ignore + formatted_warning = """ +`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular +`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare +your targets. + +Here is a short example: + +model_inputs = tokenizer(src_texts, ...) +with tokenizer.as_target_tokenizer(): + labels = tokenizer(tgt_texts, ...) +model_inputs["labels"] = labels["input_ids"] + +See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice. +For a more complete example, see the implementation of `prepare_seq2seq_batch`. +""" + logger.warn(formatted_warning, FutureWarning) + # mBART-specific kwargs that should be ignored by other models. 
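A hypothetical call matching the Returns section above, assuming a seq2seq tokenizer such as the ported BART one:

    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["A very long document to compress."],
        tgt_texts=["A short summary."],
        return_tensors="pt",
    )
    sorted(batch.keys())   # ['attention_mask', 'input_ids', 'labels']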
+ kwargs.pop("src_lang", None) + kwargs.pop("tgt_lang", None) + if max_length is None: + max_length = self.model_max_length + model_inputs = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + with self.as_target_tokenizer(): + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) + model_inputs["labels"] = labels["input_ids"] + return model_inputs diff --git a/fastNLP/transformers/torch/utils/__init__.py b/fastNLP/transformers/torch/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/fastNLP/transformers/torch/utils/model_parallel_utils.py b/fastNLP/transformers/torch/utils/model_parallel_utils.py new file mode 100644 index 00000000..3a145df9 --- /dev/null +++ b/fastNLP/transformers/torch/utils/model_parallel_utils.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import ceil + + +def assert_device_map(device_map, num_blocks): + blocks = list(range(0, num_blocks)) + + device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] + + # Duplicate check + duplicate_blocks = [] + for i in device_map_blocks: + if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: + duplicate_blocks.append(i) + # Missing blocks + missing_blocks = [i for i in blocks if i not in device_map_blocks] + extra_blocks = [i for i in device_map_blocks if i not in blocks] + + assert len(duplicate_blocks) == 0, ( + "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device. These " + "attention blocks were specified more than once: " + str(duplicate_blocks) + ) + assert len(missing_blocks) == 0, ( + "There are attention blocks for this model that are not specified in the device_map. Add these attention " + "blocks to a device on the device_map: " + str(missing_blocks) + ) + assert ( + len(extra_blocks) == 0 + ), "The device_map contains more attention blocks than this model has. Remove these from the device_map:" + str( + extra_blocks + ) + + +def get_device_map(n_layers, devices): + """Returns a dictionary of layers distributed evenly across all devices.""" + layers = list(range(n_layers)) + n_blocks = int(ceil(n_layers / len(devices))) + layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)) + + return dict(zip(devices, layers_list)) diff --git a/fastNLP/transformers/torch/utils/versions.py b/fastNLP/transformers/torch/utils/versions.py new file mode 100644 index 00000000..cb2fbdb9 --- /dev/null +++ b/fastNLP/transformers/torch/utils/versions.py @@ -0,0 +1,120 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
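For the model-parallel helpers above, a couple of traced values, for illustration:

    get_device_map(n_layers=8, devices=[0, 1])   # {0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}
    get_device_map(n_layers=7, devices=[0, 1])   # {0: [0, 1, 2, 3], 1: [4, 5, 6]}
    # assert_device_map then verifies that every block appears exactly once
    assert_device_map({0: [0, 1, 2, 3], 1: [4, 5, 6, 7]}, num_blocks=8)   # passes silently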
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Utilities for working with package versions +""" + +import operator +import re +import sys +from typing import Optional + +from packaging import version + + +# The package importlib_metadata is in a different place, depending on the python version. +if sys.version_info < (3, 8): + import importlib_metadata +else: + import importlib.metadata as importlib_metadata + + +ops = { + "<": operator.lt, + "<=": operator.le, + "==": operator.eq, + "!=": operator.ne, + ">=": operator.ge, + ">": operator.gt, +} + + +def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): + if got_ver is None: + raise ValueError("got_ver is None") + if want_ver is None: + raise ValueError("want_ver is None") + if not ops[op](version.parse(got_ver), version.parse(want_ver)): + raise ImportError( + f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" + ) + + +def require_version(requirement: str, hint: Optional[str] = None) -> None: + """ + Perform a runtime check of the dependency versions, using the exact same syntax used by pip. + + The installed module version comes from the `site-packages` dir via `importlib_metadata`. + + Args: + requirement (:obj:`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" + hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met + + Example:: + + require_version("pandas>1.1.2") + require_version("numpy>1.18.5", "this is important to have for whatever reason") + + """ + + hint = f"\n{hint}" if hint is not None else "" + + # non-versioned check + if re.match(r"^[\w_\-\d]+$", requirement): + pkg, op, want_ver = requirement, None, None + else: + match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) + if not match: + raise ValueError( + f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" + ) + pkg, want_full = match[0] + want_range = want_full.split(",") # there could be multiple requirements + wanted = {} + for w in want_range: + match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) + if not match: + raise ValueError( + f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" + ) + op, want_ver = match[0] + wanted[op] = want_ver + if op not in ops: + raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") + + # special case + if pkg == "python": + got_ver = ".".join([str(x) for x in sys.version_info[:3]]) + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + return + + # check if any version is installed + try: + got_ver = importlib_metadata.version(pkg) + except importlib_metadata.PackageNotFoundError: + raise importlib_metadata.PackageNotFoundError( + f"The '{requirement}' distribution was not found and is required by this application. 
{hint}" + ) + + # check that the right version is installed if version number or a range was provided + if want_ver is not None: + for op, want_ver in wanted.items(): + _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) + + +def require_version_core(requirement): + """require_version wrapper which emits a core-specific hint on failure""" + hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master" + return require_version(requirement, hint) From df0651baaecb899b02a7205c4db840a88eca3eac Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 09:00:29 +0000 Subject: [PATCH 2/9] =?UTF-8?q?=E5=88=A0=E9=99=A4=20transformers=20?= =?UTF-8?q?=E4=B8=AD=E7=9A=84=E4=BE=9D=E8=B5=96=E5=8C=85=E5=AD=97=E5=85=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/transformers/torch/deepspeed.py | 4 +- .../torch/dependency_versions_check.py | 20 ----- .../torch/dependency_versions_table.py | 76 ------------------- 3 files changed, 2 insertions(+), 98 deletions(-) delete mode 100644 fastNLP/transformers/torch/dependency_versions_check.py delete mode 100644 fastNLP/transformers/torch/dependency_versions_table.py diff --git a/fastNLP/transformers/torch/deepspeed.py b/fastNLP/transformers/torch/deepspeed.py index fc3fcc7c..e60a7ce8 100644 --- a/fastNLP/transformers/torch/deepspeed.py +++ b/fastNLP/transformers/torch/deepspeed.py @@ -22,7 +22,7 @@ import weakref from copy import deepcopy from functools import partialmethod -from .dependency_versions_check import dep_version_check +from .utils.versions import require_version from fastNLP.envs.imports import _NEED_IMPORT_TORCH from fastNLP.core.log import logger @@ -55,7 +55,7 @@ class HfDeepSpeedConfig: # set global weakref object set_hf_deepspeed_config(self) - dep_version_check("deepspeed") + require_version("deepspeed>=0.5.3") if isinstance(config_file_or_dict, dict): # Don't modify user's data should they want to reuse it (e.g. in tests), because once we diff --git a/fastNLP/transformers/torch/dependency_versions_check.py b/fastNLP/transformers/torch/dependency_versions_check.py deleted file mode 100644 index 30e8f448..00000000 --- a/fastNLP/transformers/torch/dependency_versions_check.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys - -from .dependency_versions_table import deps -from .utils.versions import require_version - -def dep_version_check(pkg, hint=None): - require_version(deps[pkg], hint) diff --git a/fastNLP/transformers/torch/dependency_versions_table.py b/fastNLP/transformers/torch/dependency_versions_table.py deleted file mode 100644 index ef396637..00000000 --- a/fastNLP/transformers/torch/dependency_versions_table.py +++ /dev/null @@ -1,76 +0,0 @@ -# THIS FILE HAS BEEN AUTOGENERATED. To update: -# 1. modify the `_deps` dict in setup.py -# 2. 
run `make deps_table_update`` -deps = { - "Pillow": "Pillow", - "black": "black==21.4b0", - "codecarbon": "codecarbon==1.2.0", - "cookiecutter": "cookiecutter==1.7.2", - "dataclasses": "dataclasses", - "datasets": "datasets", - "deepspeed": "deepspeed>=0.5.3", - "docutils": "docutils==0.16.0", - "fairscale": "fairscale>0.3", - "faiss-cpu": "faiss-cpu", - "fastapi": "fastapi", - "filelock": "filelock", - "flake8": "flake8>=3.8.3", - "flax": "flax>=0.3.4", - "fugashi": "fugashi>=1.0", - "GitPython": "GitPython<3.1.19", - "huggingface-hub": "huggingface-hub>=0.0.17", - "importlib_metadata": "importlib_metadata", - "ipadic": "ipadic>=1.0.0,<2.0", - "isort": "isort>=5.5.4", - "jax": "jax>=0.2.8", - "jaxlib": "jaxlib>=0.1.65", - "jieba": "jieba", - "keras2onnx": "keras2onnx", - "nltk": "nltk", - "numpy": "numpy>=1.17", - "onnxconverter-common": "onnxconverter-common", - "onnxruntime-tools": "onnxruntime-tools>=1.4.2", - "onnxruntime": "onnxruntime>=1.4.0", - "optuna": "optuna", - "optax": "optax>=0.0.8", - "packaging": "packaging>=20.0", - "parameterized": "parameterized", - "protobuf": "protobuf", - "psutil": "psutil", - "pyyaml": "pyyaml>=5.1", - "pydantic": "pydantic", - "pytest": "pytest", - "pytest-timeout": "pytest-timeout", - "pytest-xdist": "pytest-xdist", - "python": "python>=3.6.0", - "ray[tune]": "ray[tune]", - "recommonmark": "recommonmark", - "regex": "regex!=2019.12.17", - "requests": "requests", - "rouge-score": "rouge-score", - "sacrebleu": "sacrebleu>=1.4.12,<2.0.0", - "sacremoses": "sacremoses", - "sagemaker": "sagemaker>=2.31.0", - "scikit-learn": "scikit-learn", - "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", - "sigopt": "sigopt", - "soundfile": "soundfile", - "sphinx-copybutton": "sphinx-copybutton", - "sphinx-markdown-tables": "sphinx-markdown-tables", - "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", - "sphinx": "sphinx==3.2.1", - "sphinxext-opengraph": "sphinxext-opengraph==0.4.1", - "sphinx-intl": "sphinx-intl", - "starlette": "starlette", - "tensorflow-cpu": "tensorflow-cpu>=2.3", - "tensorflow": "tensorflow>=2.3", - "timeout-decorator": "timeout-decorator", - "timm": "timm", - "tokenizers": "tokenizers>=0.10.1,<0.11", - "torch": "torch>=1.0", - "torchaudio": "torchaudio", - "tqdm": "tqdm>=4.27", - "unidic": "unidic>=1.0.2", - "unidic_lite": "unidic_lite>=1.0.7", - "uvicorn": "uvicorn", -} From b3e0ebd7fc56b119ce4116c41fd7660071165940 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 30 Apr 2022 17:10:03 +0800 Subject: [PATCH 3/9] =?UTF-8?q?=E4=BC=98=E5=8C=96=E4=BA=86Collator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callbacks/callback.py | 3 +- .../callbacks/load_best_model_callback.py | 33 ++++----- fastNLP/core/collators/new_collator.py | 34 +++++---- fastNLP/core/collators/padders/utils.py | 2 + fastNLP/core/collators/utils.py | 51 ++++++++------ fastNLP/core/dataloaders/fdataloader.py | 7 -- .../core/dataloaders/torch_dataloader/fdl.py | 2 +- .../collators/padders/test_numpy_padder.py | 2 +- tests/core/collators/test_new_collator.py | 70 ++++++++++++++++++- tests/core/collators/test_utils.py | 16 ++--- 10 files changed, 148 insertions(+), 72 deletions(-) delete mode 100644 fastNLP/core/dataloaders/fdataloader.py diff --git a/fastNLP/core/callbacks/callback.py b/fastNLP/core/callbacks/callback.py index 982df7da..7f0c290d 100644 --- a/fastNLP/core/callbacks/callback.py +++ b/fastNLP/core/callbacks/callback.py @@ -126,7 +126,8 @@ class Callback: :param trainer: `fastNLP.Trainer` :param batch: batch 
的数据,已经经过 input_mapping (如果有) 以及 移动到指定设备 。 - :param list[int] indices: 当前的 batch 是 dataset 中的哪些数据 + :param list[int] indices: 当前的 batch 是 dataset 中的哪些数据。仅在 DataLoader 支持得到当前 batch index 的时候有值, + 其它时候为 None 。 """ pass diff --git a/fastNLP/core/callbacks/load_best_model_callback.py b/fastNLP/core/callbacks/load_best_model_callback.py index 5addd2e2..32534d2a 100644 --- a/fastNLP/core/callbacks/load_best_model_callback.py +++ b/fastNLP/core/callbacks/load_best_model_callback.py @@ -94,20 +94,21 @@ class LoadBestModelCallback(HasMonitorCallback): else: self.buffer.seek(0) trainer.load_model(folder=self.buffer, only_state_dict=self.only_state_dict) - - self._delete_after_after(trainer) - - def _delete_after_after(self, trainer): - trainer.driver.barrier() if self.delete_after_after: - if self.real_save_folder: - logger.info(f"Deleting {self.real_save_folder}...") - shutil.rmtree(self.real_save_folder, ignore_errors=True) - try: - # 如果是 emtpy 的,就会被删除掉 - os.rmdir(self.save_folder) - except: - pass - elif hasattr(self, 'buffer'): - self.buffer.close() - del self.buffer \ No newline at end of file + trainer.driver.barrier() + self._delete_folder() + trainer.driver.barrier() + + def _delete_folder(self): + if self.real_save_folder: + logger.info(f"Deleting {self.real_save_folder}...") + shutil.rmtree(self.real_save_folder, ignore_errors=True) + try: + # 如果是 emtpy 的,就会被删除掉 + os.rmdir(self.save_folder) + logger.debug(f"Since {self.save_folder} is an empty folder, it has been removed.") + except: + pass + elif hasattr(self, 'buffer'): + self.buffer.close() + del self.buffer \ No newline at end of file diff --git a/fastNLP/core/collators/new_collator.py b/fastNLP/core/collators/new_collator.py index 869a60a7..9123a293 100644 --- a/fastNLP/core/collators/new_collator.py +++ b/fastNLP/core/collators/new_collator.py @@ -6,7 +6,7 @@ from .padders.get_padder import get_padder import re from .utils import unpack_batch_mapping, unpack_batch_nested_mapping, pack_batch_nested_mapping, unpack_batch_sequence, \ - pack_batch_sequence, NESTED_DICT_SEPARATOR + pack_batch_sequence sequence_idx_str = re.compile(r'^_\d+$') # 形如_0, _1 SUPPORTED_BACKENDS = ['torch', 'jittor', 'paddle', 'numpy', 'raw', None] @@ -16,10 +16,11 @@ class Collator: def __init__(self, backend='torch'): """ 用于 pad 数据的对象。会自动将所有能够 pad (由 fastNLP 根据数据判定能否 pad )的数据都进行 pad 操作,默认 pad 的值为 0。 - 可使用 set_pad() 函数调整。如果有些 field 不想输出,可以使用 set_ignore() 函数进行设置。 + 可使用 set_pad() 函数调整。如果有些 field 不想输出,可以使用 set_ignore() 函数进行设置。Collator 在第一次进行 pad 的 + 时候自动根据设置以及数据情况,为每个 field 获取一个 padder ,在之后的每次调用中,都将使用对应的 Padder 给对应的 field 。 - :param backend: 对于可以 pad 的 field,使用哪种 tensor,支持 ['torch','jittor','paddle','numpy','raw',None], - 若为 None ,则不进行 padding 。 + :param backend: 对于可以 pad 的 field,使用哪种 tensor,支持 ['torch','jittor','paddle','numpy','raw',None]。 + 若为 None ,则不进行 padding 。该参数对本身就不能进行 pad 的数据没用影响,不能 pad 的数据返回一定是 list 。 """ self.unpack_batch_func = None self.pack_batch_func = None @@ -54,22 +55,25 @@ class Collator: else: self.batch_data_type = 's' logger.debug(f"Since batch[0] has type:{type(batch[0])}, so the batch_data_type " - f"is {self.batch_data_type}") + f"is `{self.batch_data_type}`.") if self.batch_data_type == 's': - self.unpack_batch_func = lambda x:{'_single': x} # 不需要做任何调整 - self.pack_batch_func = lambda x:x['_single'] + self.unpack_batch_func = lambda batch, ignore_fields: {'_single': batch} # 不需要做任何调整 + self.pack_batch_func = lambda x: x['_single'] elif self.batch_data_type == 'l': self.unpack_batch_func = unpack_batch_sequence self.pack_batch_func = 
pack_batch_sequence elif self.batch_data_type == 'd': - if any([isinstance(v, Mapping) for v in batch[0].values()]): # 可能存在 nested 的dict。{'a': {'b': xx}}->{'a@@b': value} + if any([isinstance(v, Mapping) for v in batch[0].values()]): # 可能存在 nested 的dict。{'a': {'b': xx}}->{('a', 'b'): value} self.unpack_batch_func = unpack_batch_nested_mapping self.pack_batch_func = pack_batch_nested_mapping else: self.unpack_batch_func = unpack_batch_mapping self.pack_batch_func = lambda x:x - unpack_batch:Dict = self.unpack_batch_func(batch) # 将各自 field 组成 batch 形式。 + if self.unpack_batch_func is unpack_batch_nested_mapping: # 比较特殊,需要防止继续往下延伸 + unpack_batch: Dict = self.unpack_batch_func(batch, self.ignore_fields, set(self.input_fields.keys())) + else: + unpack_batch:Dict = self.unpack_batch_func(batch, self.ignore_fields) # 将各自 field 组成 batch 形式。 pad_batch = {} if len(self.padders)==0: # 第一次运行,准备 padder @@ -96,13 +100,13 @@ class Collator: return self.pack_batch_func(pad_batch) # 根据情况恢复成与输入一致的类型 - def set_pad(self, field_name:str, pad_val:Union[int, float, None]=0, dtype=None, backend=None, + def set_pad(self, field_name:Union[str, tuple], pad_val:Union[int, float, None]=0, dtype=None, backend=None, pad_fn:Callable=None) -> "Collator": """ 如果需要对某个 field 的内容进行特殊的调整,请使用这个函数。 :param field_name: 需要调整的 field 的名称。如果 Dataset 的 __getitem__ 方法返回的是 dict 类型的,则可以直接使用对应的 - field 的 key 来表示,如果是 nested 的 dict,可以使用 @@ 来连接不同层次的 key,例如 {'a': {'b': 1}} 中的使用 a@@b; + field 的 key 来表示,如果是 nested 的 dict,可以使用元组表示多层次的 key,例如 {'a': {'b': 1}} 中的使用 ('a', 'b'); 如果 __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。如果该 field 在数据中没 有找到,则报错;如果 __getitem__ 返回的是就是整体内容,请使用 "_single" 。 :param pad_val: 这个 field 的默认 pad 值。如果设置为 None,则表示该 field 不需要 pad , fastNLP 默认只会对可以 pad 的 @@ -126,11 +130,11 @@ class Collator: f"index, but other field is set as dict mode." elif self.batch_data_type == 'l': assert sequence_idx_str.match(field_name) is not None, f"Other field is set as list mode. But the new " \ - f"field name is {field_name}" + f"field name is {field_name}." 
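# A minimal usage sketch of the tuple-key convention documented in set_pad() /
# set_ignore() above; `dict_batch` is a hypothetical list of nested-dict samples,
# e.g. [{'str': 'x', 'nested_dict': {'a': 1, 'b': [1, 2]}}, ...].
collator = Collator(backend='raw')
collator.set_pad(('nested_dict', 'b'), pad_val=-1)   # address the nested key {'nested_dict': {'b': ...}}
collator.set_ignore('str', ('nested_dict', 'a'))     # drop these fields from the padded output
padded_batch = collator(dict_batch)
# For Sequence-style samples use '_0', '_1', ...; for whole-sample data use '_single'.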
if field_name == '_single': self.batch_data_type = 's' - elif sequence_idx_str.match(field_name): + elif isinstance(field_name, str) and sequence_idx_str.match(field_name): self.batch_data_type = 'l' else: self.batch_data_type = 'd' @@ -165,8 +169,8 @@ class Collator: collator.set_ignore('field1', 'field2') :param field_names: 需要忽略的 field 的名称。如果 Dataset 的 __getitem__ 方法返回的是 dict 类型的,则可以直接使用对应的 - field 的 key 来表示,如果是 nested 的 dict,可以使用 @@ 来连接不同层次的 key,例如 {'a': {'b': 1}} 中的使用 a@@b; - 如果 __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。 + field 的 key 来表示,如果是 nested 的 dict,可以使用元组来表示,例如 {'a': {'b': 1}} 中的使用 ('a', 'b'); 如果 + __getitem__ 返回的是 Sequence 类型的,则可以使用 '_0', '_1' 表示序列中第 0 或 1 个元素。 :return: 返回 Collator 自身 """ for field_name in field_names: diff --git a/fastNLP/core/collators/padders/utils.py b/fastNLP/core/collators/padders/utils.py index f6240219..d2d3a8e0 100644 --- a/fastNLP/core/collators/padders/utils.py +++ b/fastNLP/core/collators/padders/utils.py @@ -149,6 +149,7 @@ def is_number(dtype): if dtype in (float, int, complex, bool) and not is_numpy_generic_class(dtype) \ and not is_numpy_number_dtype(dtype): return True + return False except: return False @@ -161,6 +162,7 @@ if __name__ == '__main__': # print(type(b[0])) # print(b) # import torch + print(is_number(type('a'))) print(is_number_or_numpy_number(type(3))) # True print(is_number_or_numpy_number(type(3.1))) # True print(is_number_or_numpy_number(type('3'))) # False diff --git a/fastNLP/core/collators/utils.py b/fastNLP/core/collators/utils.py index 9a397c66..1a82aa23 100644 --- a/fastNLP/core/collators/utils.py +++ b/fastNLP/core/collators/utils.py @@ -2,54 +2,58 @@ from collections import defaultdict from functools import reduce from typing import Sequence, Mapping, Dict -NESTED_DICT_SEPARATOR = '@@' - -def unpack_batch_mapping(batch:Sequence[Mapping])->Dict: +def unpack_batch_mapping(batch:Sequence[Mapping], ignore_fields:set)->Dict: """ 将 Sequence[Mapping] 转为 Dict 。例如 [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] -> {'a': [[1, 2], [3]], 'b': [1, 2]} :param batch: + :param ignore_fields: :return: """ dict_batch = defaultdict(list) for sample in batch: for key, value in sample.items(): + if key in ignore_fields: + continue dict_batch[key].append(value) return dict_batch -def unpack_batch_nested_mapping(batch:Sequence[Mapping], _parent='')->Dict: +def unpack_batch_nested_mapping(batch:Sequence[Mapping], ignore_fields:set, stop_deep_fields:set)->Dict: """ 将 nested 的 dict 中的内容展开到一个 flat dict 中 :param batch: - :param _parent: 内部使用 + :param ignore_fields: 需要忽略的 field 。 + :param stop_deep_fields: 不需要继续往下衍射的 :return: """ dict_batch = defaultdict(list) - if _parent != '': - _parent += NESTED_DICT_SEPARATOR for sample in batch: for key, value in sample.items(): - if isinstance(value, Mapping): - _dict_batch = _unpack_batch_nested_mapping(value, _parent=_parent + key) + if key in ignore_fields: + continue + if isinstance(value, Mapping) and key not in stop_deep_fields: + _dict_batch = _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent=(key,)) for key, value in _dict_batch.items(): dict_batch[key].append(value) else: - dict_batch[_parent + key].append(value) + dict_batch[key].append(value) return dict_batch -def _unpack_batch_nested_mapping(value, _parent)->Dict: +def _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent)->Dict: _dict = {} - _parent += NESTED_DICT_SEPARATOR for k, v in value.items(): - if isinstance(v, Mapping): - __dict = _unpack_batch_nested_mapping(v, 
_parent=_parent + k) + _k = _parent + (k,) + if _k in ignore_fields: + continue + if isinstance(v, Mapping) and _k not in stop_deep_fields: + __dict = _unpack_batch_nested_mapping(v, ignore_fields, stop_deep_fields, _parent=_k) _dict.update(__dict) else: - _dict[_parent + k] = v + _dict[_k] = v return _dict @@ -63,10 +67,11 @@ def pack_batch_nested_mapping(batch:Mapping) -> Dict: dicts = [] for key, value in batch.items(): - keys = key.split(NESTED_DICT_SEPARATOR) - d = {keys[-1]: value} - for key in keys[:-1:][::-1]: - d = {key: d} + if not isinstance(key, tuple): + key = [key] + d = {key[-1]: value} + for k in key[:-1:][::-1]: + d = {k: d} dicts.append(d) return reduce(_merge_dict, dicts) @@ -85,17 +90,21 @@ def _merge_dict(a, b, path=None): return a -def unpack_batch_sequence(batch:Sequence[Sequence])->Dict: +def unpack_batch_sequence(batch:Sequence[Sequence], ignore_fields)->Dict: """ 将 Sequence[Sequence] 转为 Mapping 。例如 [[[1, 2], 2], [[3], 2]] -> {'_0': [[1, 2], [3]], '_1': [1, 2]} :param batch: + :param ignore_fields: 需要忽略的field :return: """ dict_batch = defaultdict(list) for sample in batch: for i, content in enumerate(sample): - dict_batch[f'_{i}'].append(content) + field_name = f'_{i}' + if field_name in ignore_fields: + continue + dict_batch[field_name].append(content) return dict_batch diff --git a/fastNLP/core/dataloaders/fdataloader.py b/fastNLP/core/dataloaders/fdataloader.py deleted file mode 100644 index 742f3909..00000000 --- a/fastNLP/core/dataloaders/fdataloader.py +++ /dev/null @@ -1,7 +0,0 @@ -__all__ = [ - 'FDataLoader' -] - - -class FDataLoader: - pass diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index cf8e2c31..02721aaf 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -17,7 +17,7 @@ if _NEED_IMPORT_TORCH: from torch.utils.data import DataLoader, Sampler from torch.utils.data._utils.collate import default_collate else: - from ..fdataloader import FDataLoader as DataLoader + from fastNLP.core.utils.dummy_class import DummyClass as DataLoader class _FDataSet: diff --git a/tests/core/collators/padders/test_numpy_padder.py b/tests/core/collators/padders/test_numpy_padder.py index 42665857..6cc9d668 100644 --- a/tests/core/collators/padders/test_numpy_padder.py +++ b/tests/core/collators/padders/test_numpy_padder.py @@ -10,7 +10,7 @@ class TestNumpyNumberPadder: def test_run(self): padder = NumpyNumberPadder(ele_dtype=int, dtype=int, pad_val=-1) a = [1, 2, 3] - assert isinstance(a, np.ndarray) + assert isinstance(padder(a), np.ndarray) assert (padder(a) == np.array(a)).sum() == 3 diff --git a/tests/core/collators/test_new_collator.py b/tests/core/collators/test_new_collator.py index 5fc82c91..7c27b3a9 100644 --- a/tests/core/collators/test_new_collator.py +++ b/tests/core/collators/test_new_collator.py @@ -158,7 +158,7 @@ class TestCollator: # 测试 ignore collator = Collator(backend='raw') - collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'a')) raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} findDictDiff(raw_pad_batch, collator(dict_batch)) @@ -171,7 +171,7 @@ class TestCollator: # 测试设置 pad 值 collator = 
Collator(backend='raw') collator.set_pad('nest_lst_int', pad_val=100) - collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict','a')) raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 100], [100, 100]], [[1, 100], [1, 2]]], 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} findDictDiff(raw_pad_batch, collator(dict_batch)) @@ -217,6 +217,72 @@ class TestCollator: collator.set_pad('_single') findListDiff(list_batch, collator(list_batch)) + def test_nest_ignore(self): + dict_batch = [{ + 'str': '1', + 'lst_str': ['1'], + 'int': 1, + 'lst_int': [1], + 'nest_lst_int': [[1]], + 'float': 1.1, + 'lst_float': [1.1], + 'bool': True, + 'numpy': np.ones(1), + 'dict': {'1': '1'}, + 'set': {'1'}, + 'nested_dict': {'int': 1, 'lst_int':[1, 2], 'c': {'int': 1}} + }, + { + 'str': '2', + 'lst_str': ['2', '2'], + 'int': 2, + 'lst_int': [1, 2], + 'nest_lst_int': [[1], [1, 2]], + 'float': 2.1, + 'lst_float': [2.1], + 'bool': False, + 'numpy': np.zeros(1), + 'dict': {'1': '2'}, + 'set': {'2'}, + 'nested_dict': {'int': 1, 'lst_int': [1, 2], 'c': {'int': 1}} + } + ] + # 测试 ignore + collator = Collator(backend='raw') + collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'int')) + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': {'int':[1, 1]}}} + findDictDiff(raw_pad_batch, collator(dict_batch)) + + collator = Collator(backend='raw') + collator.set_pad(('nested_dict', 'c'), pad_val=None) + collator.set_ignore('str', 'int', 'lst_int') + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': [{'int':1}, {'int':1}]}} + pad_batch = collator(dict_batch) + findDictDiff(raw_pad_batch, pad_batch) + + collator = Collator(backend='raw') + collator.set_pad(('nested_dict', 'c'), pad_val=1) + with pytest.raises(BaseException): + collator(dict_batch) + + collator = Collator(backend='raw') + collator.set_ignore('str', 'int', 'lst_int') + collator.set_pad(('nested_dict', 'c'), pad_fn=lambda x: [d['int'] for d in x]) + pad_batch = collator(dict_batch) + raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], + 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], + 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, + 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], + 'c': [1, 1]}} + findDictDiff(raw_pad_batch, pad_batch) diff --git a/tests/core/collators/test_utils.py b/tests/core/collators/test_utils.py index d56dacc6..74c54a36 100644 --- a/tests/core/collators/test_utils.py +++ b/tests/core/collators/test_utils.py @@ -4,25 +4,25 @@ from fastNLP.core.collators.utils import * def test_unpack_batch_mapping(): batch = [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] - assert unpack_batch_mapping(batch)=={'a': [[1, 2], [3]], 'b': [1, 2]} + assert 
unpack_batch_mapping(batch, {})=={'a': [[1, 2], [3]], 'b': [1, 2]} def test_unpack_batch_nested_mapping(): batch = [{'a': [1, 2], 'b': 1, 'c': {'c': 1}}, {'a': [3], 'b': 2, 'c': {'c': 2}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c': [1, 2]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c','c'): [1, 2]} batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1}}}, {'a': [3], 'b': 2, 'c': {'c': {'c': 2}}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2]} batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1, 'd':[1, 1]}, 'd': [1]}}, {'a': [3], 'b': 2, 'c': {'c': {'c': 2, 'd': [2, 2]}, 'd': [2, 2]}}] - assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], - 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} + assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], + ('c','c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} def test_pack_batch_nested_mapping(): - batch = {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], - 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} + batch = {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], + ('c', 'c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} new_batch = pack_batch_nested_mapping(batch) assert new_batch == {'a': [[1, 2], [3]], 'b': [1, 2], 'c': {'c':{'c': [1, 2], 'd': [[1, 1], [2, 2]]}, 'd':[[1], [2, 2]]}} @@ -30,7 +30,7 @@ def test_pack_batch_nested_mapping(): def test_unpack_batch_sequence(): batch = [[1, 2, 3], [2, 4, 6]] - new_batch = unpack_batch_sequence(batch) + new_batch = unpack_batch_sequence(batch, {}) assert new_batch == {'_0': [1, 2], '_1': [2, 4], '_2': [3, 6]} From a6103f634253458b909f3e1d8113f94e2f34921c Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 10:51:55 +0000 Subject: [PATCH 4/9] =?UTF-8?q?=E9=87=8D=E5=91=BD=E5=90=8D=E4=B8=8D?= =?UTF-8?q?=E9=9C=80=E8=A6=81pytest=E7=9A=84=E6=B5=8B=E8=AF=95=E6=96=87?= =?UTF-8?q?=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../controllers/{test_trainer_fleet.py => _test_trainer_fleet.py} | 0 ...st_trainer_fleet_outside.py => _test_trainer_fleet_outside.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/core/controllers/{test_trainer_fleet.py => _test_trainer_fleet.py} (100%) rename tests/core/controllers/{test_trainer_fleet_outside.py => _test_trainer_fleet_outside.py} (100%) diff --git a/tests/core/controllers/test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py similarity index 100% rename from tests/core/controllers/test_trainer_fleet.py rename to tests/core/controllers/_test_trainer_fleet.py diff --git a/tests/core/controllers/test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py similarity index 100% rename from tests/core/controllers/test_trainer_fleet_outside.py rename to tests/core/controllers/_test_trainer_fleet_outside.py From b3c9819fb84c93b674af71bee60f50aed3179fab Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 12:55:57 +0000 Subject: [PATCH 5/9] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20=5F=5Finit=5F=5F.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
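A quick round-trip illustration of the tuple-key convention exercised by the collator utils tests above; the values are made up and the import path assumes the patched `fastNLP/core/collators/utils.py`:

```python
from fastNLP.core.collators.utils import unpack_batch_nested_mapping, pack_batch_nested_mapping

batch = [{'a': 1, 'c': {'c': 1, 'd': 2}},
         {'a': 2, 'c': {'c': 3, 'd': 4}}]

# Default: descend into every Mapping; nested keys become tuples such as ('c', 'c').
flat = unpack_batch_nested_mapping(batch, set(), set())
assert flat == {'a': [1, 2], ('c', 'c'): [1, 3], ('c', 'd'): [2, 4]}

# pack_batch_nested_mapping restores the original nesting after padding.
assert pack_batch_nested_mapping(flat) == {'a': [1, 2], 'c': {'c': [1, 3], 'd': [2, 4]}}

# ignore_fields drops a field; stop_deep_fields keeps a sub-dict intact, which is how
# the Collator hands whole sub-dicts to fields configured with set_pad(..., pad_fn=...).
flat = unpack_batch_nested_mapping(batch, ignore_fields={'a'}, stop_deep_fields={'c'})
assert flat == {'c': [{'c': 1, 'd': 2}, {'c': 3, 'd': 4}]}
```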
tests/core/dataloaders/jittor_dataloader/__init__.py | 0 tests/core/dataloaders/paddle_dataloader/__init__.py | 0 tests/core/dataloaders/torch_dataloader/__init__.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/core/dataloaders/jittor_dataloader/__init__.py create mode 100644 tests/core/dataloaders/paddle_dataloader/__init__.py create mode 100644 tests/core/dataloaders/torch_dataloader/__init__.py diff --git a/tests/core/dataloaders/jittor_dataloader/__init__.py b/tests/core/dataloaders/jittor_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/paddle_dataloader/__init__.py b/tests/core/dataloaders/paddle_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/core/dataloaders/torch_dataloader/__init__.py b/tests/core/dataloaders/torch_dataloader/__init__.py new file mode 100644 index 00000000..e69de29b From cf2ef2ecd79a43f9ecf4054f067231fc421e0dd9 Mon Sep 17 00:00:00 2001 From: x54-729 <17307130121@fudan.edu.cn> Date: Sat, 30 Apr 2022 13:04:55 +0000 Subject: [PATCH 6/9] =?UTF-8?q?=E8=B0=83=E6=95=B4=E6=B5=8B=E8=AF=95?= =?UTF-8?q?=E4=BE=8B=E7=9A=84backend=E8=AE=BE=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../paddle_driver/initialize_paddle_driver.py | 2 +- .../torch_driver/initialize_torch_driver.py | 4 +-- fastNLP/core/metrics/utils.py | 5 ++- .../test_load_best_model_callback_torch.py | 4 +-- tests/core/controllers/_test_trainer_fleet.py | 1 - .../_test_trainer_fleet_outside.py | 1 - tests/core/controllers/test_trainer_paddle.py | 4 +-- .../drivers/paddle_driver/test_dist_utils.py | 1 - .../core/drivers/paddle_driver/test_fleet.py | 2 -- .../test_initialize_paddle_driver.py | 3 -- .../paddle_driver/test_single_device.py | 3 -- .../core/drivers/paddle_driver/test_utils.py | 2 -- tests/core/drivers/torch_driver/test.py | 31 +++++++++++++++++++ tests/core/drivers/torch_driver/test_ddp.py | 2 -- .../test_initialize_torch_driver.py | 3 -- .../torch_driver/test_single_device.py | 2 -- tests/core/drivers/torch_driver/test_utils.py | 2 -- .../core/samplers/test_unrepeated_sampler.py | 18 +++++------ 18 files changed, 48 insertions(+), 42 deletions(-) create mode 100644 tests/core/drivers/torch_driver/test.py diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index 9a9d4198..c0489e6e 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -14,7 +14,7 @@ if _NEED_IMPORT_PADDLE: import paddle def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]], - model: paddle.nn.Layer, **kwargs) -> PaddleDriver: + model: "paddle.nn.Layer", **kwargs) -> PaddleDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 1、如果检测到当前进程为用户通过 `python -m paddle.distributed.launch xxx.py` 方式拉起的,则将 diff --git a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py index 5ee946c4..7cef7316 100644 --- a/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py +++ b/fastNLP/core/drivers/torch_driver/initialize_torch_driver.py @@ -11,8 +11,8 @@ from fastNLP.core.log import logger from fastNLP.envs import FASTNLP_BACKEND_LAUNCH -def initialize_torch_driver(driver: str, device: Optional[Union[str, torch.device, int, List[int]]], - model: torch.nn.Module, 
**kwargs) -> TorchDriver: +def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.device", int, List[int]]], + model: "torch.nn.Module", **kwargs) -> TorchDriver: r""" 用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; 注意如果输入的 `device` 如果和 `driver` 对应不上就直接报错; diff --git a/fastNLP/core/metrics/utils.py b/fastNLP/core/metrics/utils.py index ce6f618b..6d3fd74a 100644 --- a/fastNLP/core/metrics/utils.py +++ b/fastNLP/core/metrics/utils.py @@ -11,9 +11,8 @@ _IS_ALLENNLP_AVAILABLE = _module_available('allennlp') if _IS_ALLENNLP_AVAILABLE: from allennlp.training.metrics import Metric as allennlp_Metric -if _NEED_IMPORT_TORCH and _IS_TORCHMETRICS_AVAILABLE: - if _IS_TORCHMETRICS_AVAILABLE: - from torchmetrics import Metric as torchmetrics_Metric +if _IS_TORCHMETRICS_AVAILABLE: + from torchmetrics import Metric as torchmetrics_Metric if _NEED_IMPORT_PADDLE: from paddle.metric import Metric as paddle_Metric diff --git a/tests/core/callbacks/test_load_best_model_callback_torch.py b/tests/core/callbacks/test_load_best_model_callback_torch.py index 0bc63bd5..b042ae0f 100644 --- a/tests/core/callbacks/test_load_best_model_callback_torch.py +++ b/tests/core/callbacks/test_load_best_model_callback_torch.py @@ -16,7 +16,7 @@ from fastNLP.core.controllers.trainer import Trainer from fastNLP.core.metrics.accuracy import Accuracy from fastNLP.core.callbacks.load_best_model_callback import LoadBestModelCallback from fastNLP.core import Evaluator -from fastNLP.core.utils.utils import safe_rm +from fastNLP.core import rank_zero_rm from fastNLP.core.drivers.torch_driver import TorchSingleDriver from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 from tests.helpers.datasets.torch_data import TorchArgMaxDataset @@ -112,7 +112,7 @@ def test_load_best_model_callback( results = evaluator.run() assert np.allclose(callbacks[0].monitor_value, results['acc#acc#dl1']) if save_folder: - safe_rm(save_folder) + rank_zero_rm(save_folder) if dist.is_initialized(): dist.destroy_process_group() diff --git a/tests/core/controllers/_test_trainer_fleet.py b/tests/core/controllers/_test_trainer_fleet.py index 46201c67..f438b6de 100644 --- a/tests/core/controllers/_test_trainer_fleet.py +++ b/tests/core/controllers/_test_trainer_fleet.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/_test_trainer_fleet_outside.py b/tests/core/controllers/_test_trainer_fleet_outside.py index a48434fa..e8c9a244 100644 --- a/tests/core/controllers/_test_trainer_fleet_outside.py +++ b/tests/core/controllers/_test_trainer_fleet_outside.py @@ -4,7 +4,6 @@ python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py """ import os -os.environ["FASTNLP_BACKEND"] = "paddle" import sys sys.path.append("../../../") diff --git a/tests/core/controllers/test_trainer_paddle.py b/tests/core/controllers/test_trainer_paddle.py index 8a3ab2ce..aaf20105 100644 --- a/tests/core/controllers/test_trainer_paddle.py +++ b/tests/core/controllers/test_trainer_paddle.py @@ -1,6 +1,4 @@ import pytest -import os -os.environ["FASTNLP_BACKEND"] = "paddle" from dataclasses import dataclass from fastNLP.core.controllers.trainer import Trainer @@ -25,7 +23,7 @@ class TrainPaddleConfig: shuffle: bool = True evaluate_every = 2 -@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1)]) 
+@pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])]) # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])]) @pytest.mark.parametrize("callbacks", [[RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), RichCallback(5)]]) diff --git a/tests/core/drivers/paddle_driver/test_dist_utils.py b/tests/core/drivers/paddle_driver/test_dist_utils.py index 9b81c38d..bd43378e 100644 --- a/tests/core/drivers/paddle_driver/test_dist_utils.py +++ b/tests/core/drivers/paddle_driver/test_dist_utils.py @@ -3,7 +3,6 @@ import sys import signal import pytest import traceback -os.environ["FASTNLP_BACKEND"] = "paddle" import numpy as np diff --git a/tests/core/drivers/paddle_driver/test_fleet.py b/tests/core/drivers/paddle_driver/test_fleet.py index 34c80888..6190dd8c 100644 --- a/tests/core/drivers/paddle_driver/test_fleet.py +++ b/tests/core/drivers/paddle_driver/test_fleet.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.fleet import PaddleFleetDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py index df96d746..c8b5bfff 100644 --- a/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py +++ b/tests/core/drivers/paddle_driver/test_initialize_paddle_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" - from fastNLP.core.drivers import PaddleSingleDriver, PaddleFleetDriver from fastNLP.core.drivers.paddle_driver.initialize_paddle_driver import initialize_paddle_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/paddle_driver/test_single_device.py b/tests/core/drivers/paddle_driver/test_single_device.py index 2aa4e0e6..ec40e9f3 100644 --- a/tests/core/drivers/paddle_driver/test_single_device.py +++ b/tests/core/drivers/paddle_driver/test_single_device.py @@ -1,6 +1,3 @@ -import os -from re import S -os.environ["FASTNLP_BACKEND"] = "paddle" import pytest from pathlib import Path diff --git a/tests/core/drivers/paddle_driver/test_utils.py b/tests/core/drivers/paddle_driver/test_utils.py index 690d0fb8..69be8055 100644 --- a/tests/core/drivers/paddle_driver/test_utils.py +++ b/tests/core/drivers/paddle_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "paddle" from fastNLP.core.drivers.paddle_driver.utils import ( get_device_from_visible, diff --git a/tests/core/drivers/torch_driver/test.py b/tests/core/drivers/torch_driver/test.py new file mode 100644 index 00000000..3a1a280d --- /dev/null +++ b/tests/core/drivers/torch_driver/test.py @@ -0,0 +1,31 @@ +import sys +sys.path.append("../../../../") +from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver +from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 + +import torch + +device = [0, 1] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() +print("-----------first--------------") + +device = [0, 2] +torch_model = TorchNormalModel_Classification_1(10, 10) +torch_opt = torch.optim.Adam(params=torch_model.parameters(), 
lr=0.01) +device = [torch.device(i) for i in device] +driver = TorchDDPDriver( + model=torch_model, + parallel_device=device, + fp16=False +) +driver.set_optimizers(torch_opt) +driver.setup() \ No newline at end of file diff --git a/tests/core/drivers/torch_driver/test_ddp.py b/tests/core/drivers/torch_driver/test_ddp.py index 0e91fe77..87787fbc 100644 --- a/tests/core/drivers/torch_driver/test_ddp.py +++ b/tests/core/drivers/torch_driver/test_ddp.py @@ -1,8 +1,6 @@ import pytest -import os from pathlib import Path -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver from fastNLP.core.samplers import ( RandomSampler, diff --git a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py index 6c47e30e..3e612964 100644 --- a/tests/core/drivers/torch_driver/test_initialize_torch_driver.py +++ b/tests/core/drivers/torch_driver/test_initialize_torch_driver.py @@ -1,8 +1,5 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" - from fastNLP.core.drivers import TorchSingleDriver, TorchDDPDriver from fastNLP.core.drivers.torch_driver.initialize_torch_driver import initialize_torch_driver from fastNLP.envs import get_gpu_count diff --git a/tests/core/drivers/torch_driver/test_single_device.py b/tests/core/drivers/torch_driver/test_single_device.py index b8a8def9..f46f69c0 100644 --- a/tests/core/drivers/torch_driver/test_single_device.py +++ b/tests/core/drivers/torch_driver/test_single_device.py @@ -1,5 +1,3 @@ -import os -os.environ["FASTNLP_BACKEND"] = "torch" import pytest from pathlib import Path diff --git a/tests/core/drivers/torch_driver/test_utils.py b/tests/core/drivers/torch_driver/test_utils.py index 8f0172e0..4df767b5 100644 --- a/tests/core/drivers/torch_driver/test_utils.py +++ b/tests/core/drivers/torch_driver/test_utils.py @@ -1,6 +1,4 @@ -import os import pytest -os.environ["FASTNLP_BACKEND"] = "torch" from fastNLP.core.drivers.torch_driver.utils import ( replace_batch_sampler, diff --git a/tests/core/samplers/test_unrepeated_sampler.py b/tests/core/samplers/test_unrepeated_sampler.py index 4a271f41..39d4e34f 100644 --- a/tests/core/samplers/test_unrepeated_sampler.py +++ b/tests/core/samplers/test_unrepeated_sampler.py @@ -28,12 +28,12 @@ class TestUnrepeatedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) @pytest.mark.parametrize('shuffle', [False, True]) - def test_multi(self, num_replica, num_of_data, shuffle): + def test_multi(self, num_replicas, num_of_data, shuffle): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedRandomSampler(dataset=data, shuffle=shuffle) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) indexes = list(chain(*samplers)) @@ -52,12 +52,12 @@ class TestUnrepeatedSortedSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSortedSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) 
samplers.append(sampler) # 保证顺序是没乱的 @@ -83,12 +83,12 @@ class TestUnrepeatedSequentialSampler: @pytest.mark.parametrize('num_replicas', [2, 3]) @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) - def test_multi(self, num_replica, num_of_data): + def test_multi(self, num_replicas, num_of_data): data = DatasetWithVaryLength(num_of_data=num_of_data) samplers = [] - for i in range(num_replica): + for i in range(num_replicas): sampler = UnrepeatedSequentialSampler(dataset=data, length=data.data) - sampler.set_distributed(num_replica, rank=i) + sampler.set_distributed(num_replicas, rank=i) samplers.append(sampler) # 保证顺序是没乱的 From 35f05932687ddf93229d5d26987e9030b744acd9 Mon Sep 17 00:00:00 2001 From: YWMditto Date: Sat, 30 Apr 2022 21:39:20 +0800 Subject: [PATCH 7/9] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86=E4=B8=80?= =?UTF-8?q?=E4=BA=9B=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6=E7=9A=84=E5=90=8D?= =?UTF-8?q?=E7=A7=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../{test_logger.py => test_logger_torch.py} | 0 .../test_reproducible_batch_sampler.py | 294 +++++++++--------- 2 files changed, 147 insertions(+), 147 deletions(-) rename tests/core/log/{test_logger.py => test_logger_torch.py} (100%) diff --git a/tests/core/log/test_logger.py b/tests/core/log/test_logger_torch.py similarity index 100% rename from tests/core/log/test_logger.py rename to tests/core/log/test_logger_torch.py diff --git a/tests/core/samplers/test_reproducible_batch_sampler.py b/tests/core/samplers/test_reproducible_batch_sampler.py index 3514c331..6cf4b7d4 100644 --- a/tests/core/samplers/test_reproducible_batch_sampler.py +++ b/tests/core/samplers/test_reproducible_batch_sampler.py @@ -9,153 +9,153 @@ from fastNLP.core.samplers import RandomBatchSampler, BucketedBatchSampler from fastNLP.core.drivers.torch_driver.utils import replace_batch_sampler from tests.helpers.datasets.torch_data import TorchNormalDataset - -class TestReproducibleBatchSampler: - # TODO 拆分测试,在这里只测试一个东西 - def test_torch_dataloader_1(self): - import torch - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - next(iter_dataloader) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, - "sampler_type": "RandomBatchSampler"} - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 改变 batch_size; - after_batch_size = 3 - dataloader = DataLoader(dataset, batch_size=after_batch_size) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - real_res = [] - supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) - forward_steps = 2 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - real_res.append(next(iter_dataloader)) - - for i in range(forward_steps): - assert all(real_res[i] == supposed_res[i]) - - # 断点重训的第二轮是否是一个完整的 dataloader; - # 先把断点重训所在的那一个 epoch 跑完; - begin_idx = 27 - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - # 开始新的一轮; - begin_idx = 0 - iter_dataloader = iter(dataloader) - while True: - try: - data = next(iter_dataloader) - _batch_size = len(data) - assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) - begin_idx += _batch_size - except StopIteration: - break - - def test_torch_dataloader_2(self): - # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; - from torch.utils.data import DataLoader - # no shuffle - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 将一轮的所有数据保存下来,看是否恢复的是正确的; - all_supposed_data = [] - forward_steps = 3 - iter_dataloader = iter(dataloader) - for _ in range(forward_steps): - all_supposed_data.extend(next(iter_dataloader).tolist()) - - # 1. 保存状态 - _get_re_batchsampler = dataloader.batch_sampler - assert isinstance(_get_re_batchsampler, RandomBatchSampler) - state = _get_re_batchsampler.state_dict() - - # 2. 
断点重训,重新生成一个 dataloader; - # 不改变 batch_size; - dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) - re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) - re_batchsampler.load_state_dict(state) - dataloader = replace_batch_sampler(dataloader, re_batchsampler) - - # 先把这一轮的数据过完; - pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] - while True: - try: - all_supposed_data.extend(next(iter_dataloader).tolist()) - except StopIteration: - break - assert all_supposed_data == list(pre_index_list) - - # 重新开启新的一轮; - for _ in range(3): - iter_dataloader = iter(dataloader) - res = [] - while True: - try: - res.append(next(iter_dataloader)) - except StopIteration: - break - - def test_3(self): - import torch - from torch.utils.data import DataLoader - before_batch_size = 7 - dataset = TorchNormalDataset(num_of_data=100) - # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; - dataloader = DataLoader(dataset, batch_size=before_batch_size) - - for idx, data in enumerate(dataloader): - if idx > 3: - break - - iterator = iter(dataloader) - for each in iterator: - pass +# +# class TestReproducibleBatchSampler: +# # TODO 拆分测试,在这里只测试一个东西 +# def test_torch_dataloader_1(self): +# import torch +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# next(iter_dataloader) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, +# "sampler_type": "RandomBatchSampler"} +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 改变 batch_size; +# after_batch_size = 3 +# dataloader = DataLoader(dataset, batch_size=after_batch_size) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# real_res = [] +# supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) +# forward_steps = 2 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# real_res.append(next(iter_dataloader)) +# +# for i in range(forward_steps): +# assert all(real_res[i] == supposed_res[i]) +# +# # 断点重训的第二轮是否是一个完整的 dataloader; +# # 先把断点重训所在的那一个 epoch 跑完; +# begin_idx = 27 +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# # 开始新的一轮; +# begin_idx = 0 +# iter_dataloader = iter(dataloader) +# while True: +# try: +# data = next(iter_dataloader) +# _batch_size = len(data) +# assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) +# begin_idx += _batch_size +# except StopIteration: +# break +# +# def test_torch_dataloader_2(self): +# # 测试新的一轮的 index list 是重新生成的,而不是沿用上一轮的; +# from torch.utils.data import DataLoader +# # no shuffle +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 将一轮的所有数据保存下来,看是否恢复的是正确的; +# all_supposed_data = [] +# forward_steps = 3 +# iter_dataloader = iter(dataloader) +# for _ in range(forward_steps): +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# +# # 1. 保存状态 +# _get_re_batchsampler = dataloader.batch_sampler +# assert isinstance(_get_re_batchsampler, RandomBatchSampler) +# state = _get_re_batchsampler.state_dict() +# +# # 2. 
断点重训,重新生成一个 dataloader; +# # 不改变 batch_size; +# dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) +# re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) +# re_batchsampler.load_state_dict(state) +# dataloader = replace_batch_sampler(dataloader, re_batchsampler) +# +# # 先把这一轮的数据过完; +# pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] +# while True: +# try: +# all_supposed_data.extend(next(iter_dataloader).tolist()) +# except StopIteration: +# break +# assert all_supposed_data == list(pre_index_list) +# +# # 重新开启新的一轮; +# for _ in range(3): +# iter_dataloader = iter(dataloader) +# res = [] +# while True: +# try: +# res.append(next(iter_dataloader)) +# except StopIteration: +# break +# +# def test_3(self): +# import torch +# from torch.utils.data import DataLoader +# before_batch_size = 7 +# dataset = TorchNormalDataset(num_of_data=100) +# # 开启 shuffle,来检验断点重训后的第二轮的 index list 是不是重新生成的; +# dataloader = DataLoader(dataset, batch_size=before_batch_size) +# +# for idx, data in enumerate(dataloader): +# if idx > 3: +# break +# +# iterator = iter(dataloader) +# for each in iterator: +# pass class DatasetWithVaryLength: From 6da627d4ceb2103046b374ca9c4d76d8e627469e Mon Sep 17 00:00:00 2001 From: lxr-tech <1838593642@qq.com> Date: Sat, 30 Apr 2022 23:29:12 +0800 Subject: [PATCH 8/9] modify-fastnlp_tutorial_0-lxr-220430 --- fastNLP/core/metrics/accuracy.py | 4 +- .../metrics/classify_f1_pre_rec_metric.py | 4 +- tutorials/fastnlp_tutorial_0.ipynb | 701 +++++++----------- .../figures/T0-fig-trainer-and-evaluator.png | Bin 0 -> 104863 bytes 4 files changed, 255 insertions(+), 454 deletions(-) create mode 100644 tutorials/figures/T0-fig-trainer-and-evaluator.png diff --git a/fastNLP/core/metrics/accuracy.py b/fastNLP/core/metrics/accuracy.py index d9ccb332..0869d8c8 100644 --- a/fastNLP/core/metrics/accuracy.py +++ b/fastNLP/core/metrics/accuracy.py @@ -28,7 +28,7 @@ class Accuracy(Metric): def get_metric(self) -> dict: r""" - get_metric 函数将根据 evaluate 函数累计的评价指标统计量来计算最终的评价结果. + get_metric 函数将根据 update 函数累计的评价指标统计量来计算最终的评价结果. :return dict evaluate_result: {"acc": float} """ @@ -37,7 +37,7 @@ class Accuracy(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py index 2c71602d..8de007ce 100644 --- a/fastNLP/core/metrics/classify_f1_pre_rec_metric.py +++ b/fastNLP/core/metrics/classify_f1_pre_rec_metric.py @@ -56,7 +56,7 @@ class ClassifyFPreRecMetric(Metric): def get_metric(self) -> dict: r""" - get_metric函数将根据evaluate函数累计的评价指标统计量来计算最终的评价结果. + get_metric函数将根据update函数累计的评价指标统计量来计算最终的评价结果. 
:return dict evaluate_result: {"acc": float} """ @@ -117,7 +117,7 @@ class ClassifyFPreRecMetric(Metric): def update(self, pred, target, seq_len=None): r""" - evaluate函数将针对一个批次的预测结果做评价指标的累计 + update 函数将针对一个批次的预测结果做评价指标的累计 :param torch.Tensor pred: 预测的tensor, tensor的形状可以是torch.Size([B,]), torch.Size([B, n_classes]), torch.Size([B, max_len]), 或者torch.Size([B, max_len, n_classes]) diff --git a/tutorials/fastnlp_tutorial_0.ipynb b/tutorials/fastnlp_tutorial_0.ipynb index 01913ac0..28fcfddf 100644 --- a/tutorials/fastnlp_tutorial_0.ipynb +++ b/tutorials/fastnlp_tutorial_0.ipynb @@ -15,15 +15,15 @@ "\n", "    1.3   trainer 内部初始化 evaluater\n", "\n", - "  2   使用 trainer 训练模型\n", + "  2   使用 fastNLP 0.8 搭建 argmax 模型\n", "\n", - "    2.1   argmax 模型实例\n", + "    2.1   trainer_step 和 evaluator_step\n", "\n", - "    2.2   trainer 的参数匹配\n", + "    2.2   trainer 和 evaluator 的参数匹配\n", "\n", - "    2.3   trainer 的实际使用 \n", + "    2.3   一个实际案例:argmax 模型\n", "\n", - "  3   使用 evaluator 评测模型\n", + "  3   使用 fastNLP 0.8 训练 argmax 模型\n", " \n", "    3.1   trainer 外部初始化的 evaluator\n", "\n", @@ -50,21 +50,21 @@ "\n", "```python\n", "trainer = Trainer(\n", - " model=model,\n", - " train_dataloader=train_dataloader,\n", - " optimizers=optimizer,\n", + " model=model, # 模型基于 torch.nn.Module\n", + " train_dataloader=train_dataloader, # 加载模块基于 torch.utils.data.DataLoader \n", + " optimizers=optimizer, # 优化模块基于 torch.optim.*\n", "\t...\n", - "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdriver=\"torch\", # 使用 pytorch 模块进行训练 \n", + "\tdevice='cuda', # 使用 GPU:0 显卡执行训练\n", "\t...\n", ")\n", "...\n", "evaluator = Evaluator(\n", - " model=model,\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} \n", + " model=model, # 模型基于 torch.nn.Module\n", + " dataloaders=evaluate_dataloader, # 加载模块基于 torch.utils.data.DataLoader\n", + " metrics={'acc': Accuracy()}, # 测评方法使用 fastNLP.core.metrics.Accuracy \n", " ...\n", - " driver=trainer.driver,\n", + " driver=trainer.driver, # 保持同 trainer 的 driver 一致\n", "\tdevice=None,\n", " ...\n", ")\n", @@ -88,7 +88,7 @@ "\n", "注:在同一脚本中,`Trainer`和`Evaluator`使用的`driver`应当保持一致\n", "\n", - "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误。" + "  一个不能违背的原则在于:**不要将多卡的`driver`前使用单卡的`driver`**(???),这样使用可能会带来很多意想不到的错误" ] }, { @@ -109,10 +109,10 @@ " optimizers=optimizer,\n", "\t...\n", "\tdriver=\"torch\",\n", - "\tdevice=0,\n", + "\tdevice='cuda',\n", "\t...\n", - " evaluate_dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()},\n", + " evaluate_dataloaders=evaluate_dataloader, # 传入参数 evaluator_dataloaders\n", + " metrics={'acc': Accuracy()}, # 传入参数 metrics\n", "\t...\n", ")\n", "```" @@ -123,7 +123,7 @@ "id": "0c9c7dda", "metadata": {}, "source": [ - "## 2. 使用 trainer 训练模型" + "## 2. 
argmax 模型的搭建实例" ] }, { @@ -131,71 +131,41 @@ "id": "524ac200", "metadata": {}, "source": [ - "### 2.1 argmax 模型实例\n", + "### 2.1 trainer_step 和 evaluator_step\n", "\n", - "本节将通过训练`argmax`模型,简单介绍如何`Trainer`模块的使用方式\n", + "在`fastNLP 0.8`中,使用`pytorch.nn.Module`搭建需要训练的模型,在搭建模型过程中,除了\n", "\n", - "  使用`pytorch`定义`argmax`模型,输入一组固定维度的向量,输出其中数值最大的数的索引\n", - "\n", - "  除了添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5314482b", - "metadata": { - "pycharm": { - "is_executing": true - } - }, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "\n", - "class ArgMaxModel(nn.Module):\n", - " def __init__(self, num_labels, feature_dimension):\n", - " super(ArgMaxModel, self).__init__()\n", - " self.num_labels = num_labels\n", - "\n", - " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", - " self.ac1 = nn.ReLU()\n", - " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", - " self.ac2 = nn.ReLU()\n", - " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", - " self.loss_fn = nn.CrossEntropyLoss()\n", + "  添加`pytorch`要求的`forward`方法外,还需要添加 **`train_step`** 和 **`evaluate_step`** 这两个方法\n", + "***\n", + "```python\n", + "class Model(torch.nn.Module):\n", + " def __init__(self):\n", + " super(Model, self).__init__()\n", + " self.loss_fn = torch.nn.CrossEntropyLoss()\n", + " pass\n", "\n", " def forward(self, x):\n", - " x = self.ac1(self.linear1(x))\n", - " x = self.ac2(self.linear2(x))\n", - " x = self.output(x)\n", - " return x\n", + " pass\n", "\n", " def train_step(self, x, y):\n", - " x = self(x)\n", - " return {\"loss\": self.loss_fn(x, y)}\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", "\n", " def evaluate_step(self, x, y):\n", - " x = self(x)\n", - " x = torch.max(x, dim=-1)[1]\n", - " return {\"pred\": x, \"target\": y}" - ] - }, - { - "cell_type": "markdown", - "id": "ca897322", - "metadata": {}, - "source": [ + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}\n", + "```\n", + "***\n", "在`fastNLP 0.8`中,**函数`train_step`是`Trainer`中参数`train_fn`的默认值**\n", "\n", - "  由于,在`Trainer`训练时,**`Trainer`通过参数`_train_fn_`对应的模型方法获得当前数据批次的损失值**\n", + "  由于,在`Trainer`训练时,**`Trainer`通过参数`train_fn`对应的模型方法获得当前数据批次的损失值**\n", "\n", "  因此,在`Trainer`训练时,`Trainer`首先会寻找模型是否定义了`train_step`这一方法\n", "\n", "    如果没有找到,那么`Trainer`会默认使用模型的`forward`函数来进行训练的前向传播过程\n", "\n", - "注:在`fastNLP 0.8`中,`Trainer`要求模型通过`train_step`来返回一个字典,将损失值作为`loss`的键值\n", + "注:在`fastNLP 0.8`中,**`Trainer`要求模型通过`train_step`来返回一个字典**,**满足如`{\"loss\": loss}`的形式**\n", "\n", "  此外,这里也可以通过传入`Trainer`的参数`output_mapping`来实现高度化的定制,具体请见这一note(???)\n", "\n", @@ -205,7 +175,11 @@ "\n", "  从用户角度,模型通过`evaluate_step`方法来返回一个字典,内容与传入`Evaluator`的`metrics`一致\n", "\n", - "" + "  从模块角度,该字典的键值和`metric`中的`update`函数的签名一致,这样的机制在传参时被称为“**参数匹配**”\n", + "\n", + "***\n", + "\n", + "![fastNLP 0.8 中,Trainer 和 Evaluator 的关系图](./figures/T0-fig-trainer-and-evaluator.png)" ] }, { @@ -213,13 +187,52 @@ "id": "fb3272eb", "metadata": {}, "source": [ - "### 2.2 trainer 的参数匹配\n", + "### 2.2 trainer 和 evaluator 的参数匹配\n", + "\n", + "在`fastNLP 0.8`中,参数匹配涉及到两个方面,分别是在\n", + "\n", + "  一方面,**在模型的前向传播中**,**`dataloader`向`train_step`或`evaluate_step`函数传递`batch`**\n", + "\n", + "  另方面,**在模型的评测过程中**,**`evaluate_dataloader`向`metric`的`update`函数传递`batch`**\n", "\n", - "`fastNLP 
0.8`中的参数匹配涉及到两个方面,一是在模型训练或者评测的前向传播过程中,如果从`dataloader`中出来一个`batch`的数据是一个字典,那么我们会查看模型的`train_step`和`evaluate_step`方法的参数签名,然后对于每一个参数,我们会根据其名字从 batch 这一字典中选择出对应的数据传入进去。例如在接下来的定义`Dataset`的部分,注意`ArgMaxDatset`的`__getitem__`方法,您可以通过在`Trainer`和`Evaluator`中设置参数 `model_wo_auto_param_call`来关闭这一行为。当您关闭了这一行为后,我们会将`batch`直接传给您的`train_step`、`evaluate_step`或者 `forward`函数。\n", + "对于前者,在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`False`时\n", "\n", - "二是在传入`Trainer`或者`Evaluator metrics`后,我们会在需要评测的时间点主动调用`metrics`来对`evaluate_dataloaders`进行评测,这一功能主要就是通过对`metrics`的`update`方法和一个`batch`的数据进行参数评测实现的。首先需要明确的是一个 metric 的计算通常分为 `update` 和 `get_metric`两步,其中`update`表示更新一个`batch`的评测数据,`get_metric` 表示根据已经得到的评测数据计算出最终的评测值,例如对于 `Accuracy`来说,其在`update`的时候会更新一个`batch`计算正确的数量 right_num 和计算错误的数量 total_num,最终在 `get_metric` 时返回评测值`right_num / total_num`。\n", + "    **`fastNLP 0.8`要求`dataloader`生成的每个`batch`**,**满足如`{\"x\": x, \"y\": y}`的形式**\n", + "\n", + "  同时,`fastNLP 0.8`会查看模型的`train_step`和`evaluate_step`方法的参数签名,并为对应参数传入对应数值\n", + "\n", + "    **字典形式的定义**,**对应在`Dataset`定义的`__getitem__`方法中**,例如下方的`ArgMaxDatset`\n", + "\n", + "  而在`Trainer`和`Evaluator`中的参数`model_wo_auto_param_call`被设置为`True`时\n", + "\n", + "    `fastNLP 0.8`会将`batch`直接传给模型的`train_step`、`evaluate_step`或`forward`函数\n", + "***\n", + "```python\n", + "class Dataset(torch.utils.data.Dataset):\n", + " def __init__(self, x, y):\n", + " self.x = x\n", + " self.y = y\n", + "\n", + " def __len__(self):\n", + " return len(self.x)\n", + "\n", + " def __getitem__(self, item):\n", + " return {\"x\": self.x[item], \"y\": self.y[item]}\n", + "```\n", + "***\n", + "对于后者,首先要明确,在`Trainer`和`Evaluator`中,`metrics`的计算分为`update`和`get_metric`两步\n", "\n", - "因为`fastNLP 0.8`的`metrics`是自动计算的(只需要传给`Trainer`或者`Evaluator`),因此其一定依赖于参数匹配。对于从`evaluate_dataloader`中生成的一个`batch`的数据,我们会查看传给 `Trainer`(最终是传给`Evaluator`)和`Evaluator`的每一个`metric`,然后查看其`update`函数的函数签名,然后根据每一个参数的名字从`batch`字典中选择出对应的数据传入进去。" + "    **`update`函数**,**针对一个`batch`的预测结果**,计算其累计的评价指标\n", + "\n", + "    **`get_metric`函数**,**统计`update`函数累计的评价指标**,来计算最终的评价结果\n", + "\n", + "  例如对于`Accuracy`来说,`update`函数会更新一个`batch`的正例数量`right_num`和负例数量`total_num`\n", + "\n", + "    而`get_metric`函数则会返回所有`batch`的评测值`right_num / total_num`\n", + "\n", + "  在此基础上,**`fastNLP 0.8`要求`evaluate_dataloader`生成的每个`batch`传递给对应的`metric`**\n", + "\n", + "    **以`{\"pred\": y_pred, \"target\": y_true}`的形式**,对应其`update`函数的函数签名" ] }, { @@ -227,9 +240,65 @@ "id": "f62b7bb1", "metadata": {}, "source": [ - "### 2.3 trainer的实际使用\n", + "### 2.3 一个实际案例:argmax 模型\n", "\n", - "接下来我们创建用于训练的 dataset,其接受三个参数:数据维度、数据量和随机数种子,生成指定数量的维度为 `feature_dimension` 向量,而每一个向量的标签就是该向量中最大值的索引。" + "下文将通过训练`argmax`模型,简单介绍如何`Trainer`模块的使用方式\n", + "\n", + "  首先,使用`pytorch.nn.Module`定义`argmax`模型,目标是输入一组固定维度的向量,输出其中数值最大的数的索引" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "5314482b", + "metadata": { + "pycharm": { + "is_executing": false + } + }, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "\n", + "class ArgMaxModel(nn.Module):\n", + " def __init__(self, num_labels, feature_dimension):\n", + " super(ArgMaxModel, self).__init__()\n", + " self.num_labels = num_labels\n", + "\n", + " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", + " self.ac1 = nn.ReLU()\n", + " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", + " self.ac2 = nn.ReLU()\n", + " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", + " self.loss_fn = nn.CrossEntropyLoss()\n", + "\n", + " def forward(self, x):\n", 
+ " pred = self.ac1(self.linear1(x))\n", + " pred = self.ac2(self.linear2(pred))\n", + " pred = self.output(pred)\n", + " return pred\n", + "\n", + " def train_step(self, x, y):\n", + " pred = self(x)\n", + " return {\"loss\": self.loss_fn(pred, y)}\n", + "\n", + " def evaluate_step(self, x, y):\n", + " pred = self(x)\n", + " pred = torch.max(pred, dim=-1)[1]\n", + " return {\"pred\": pred, \"target\": y}" + ] + }, + { + "cell_type": "markdown", + "id": "71f3fa6b", + "metadata": {}, + "source": [ + "  接着,使用`torch.utils.data.Dataset`定义`ArgMaxDataset`数据集\n", + "\n", + "    数据集包含三个参数:维度`feature_dimension`、数据量`data_num`和随机种子`seed`\n", + "\n", + "    数据及初始化是,自动生成指定维度的向量,并为每个向量标注出其中最大值的索引作为预测标签" ] }, { @@ -245,7 +314,7 @@ "source": [ "from torch.utils.data import Dataset\n", "\n", - "class ArgMaxDatset(Dataset):\n", + "class ArgMaxDataset(Dataset):\n", " def __init__(self, feature_dimension, data_num=1000, seed=0):\n", " self.num_labels = feature_dimension\n", " self.feature_dimension = feature_dimension\n", @@ -269,7 +338,9 @@ "id": "2cb96332", "metadata": {}, "source": [ - "现在准备好数据和模型。" + "  然后,根据`ArgMaxModel`类初始化模型实例,保持输入维度`feature_dimension`和输出标签数量`num_labels`一致\n", + "\n", + "    再根据`ArgMaxDataset`类初始化两个数据集实例,分别用来模型测试和模型评测,数据量各1000笔" ] }, { @@ -283,16 +354,10 @@ }, "outputs": [], "source": [ - "from torch.utils.data import DataLoader\n", - "\n", - "train_dataset = ArgMaxDatset(feature_dimension=10, data_num=1000)\n", - "evaluate_dataset = ArgMaxDatset(feature_dimension=10, data_num=100)\n", - "\n", - "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", - "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)\n", + "model = ArgMaxModel(num_labels=10, feature_dimension=10)\n", "\n", - "# num_labels 设置为 10,与 feature_dimension 保持一致,因为我们是预测十个位置中哪一个的概率最大。\n", - "model = ArgMaxModel(num_labels=10, feature_dimension=10)" + "train_dataset = ArgMaxDataset(feature_dimension=10, data_num=1000)\n", + "evaluate_dataset = ArgMaxDataset(feature_dimension=10, data_num=100)" ] }, { @@ -300,12 +365,33 @@ "id": "4e7d25ee", "metadata": {}, "source": [ - "将优化器也定义好。" + "  此外,使用`torch.utils.data.DataLoader`初始化两个数据加载模块,批量大小同为8,分别用于训练和测评" ] }, { "cell_type": "code", "execution_count": 4, + "id": "363b5b09", + "metadata": {}, + "outputs": [], + "source": [ + "from torch.utils.data import DataLoader\n", + "\n", + "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", + "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)" + ] + }, + { + "cell_type": "markdown", + "id": "c8d4443f", + "metadata": {}, + "source": [ + "  最后,使用`torch.optim.SGD`初始化一个优化模块,基于随机梯度下降法" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "id": "dc28a2d9", "metadata": { "pycharm": { @@ -321,15 +407,33 @@ }, { "cell_type": "markdown", - "id": "4f1fba81", + "id": "eb8ca6cf", + "metadata": {}, + "source": [ + "## 3. 使用 fastNLP 0.8 训练 argmax 模型\n", + "\n", + "### 3.1 trainer 外部初始化的 evaluator" + ] + }, + { + "cell_type": "markdown", + "id": "55145553", "metadata": {}, "source": [ - "现在万事俱备,开始使用 Trainer 进行训练!" 
+ "通过从`fastNLP`库中导入`Trainer`类,初始化`trainer`实例,对模型进行训练\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`train_dataloader`、优化模块`optimizer`\n", + "\n", + "  通过`progress_bar`设定进度条格式,默认为`\"auto\"`,此外还有`\"rich\"`、`\"raw\"`和`None`\n", + "\n", + "    但对于`\"auto\"`和`\"rich\"`格式,训练结束后进度条会不显示(???)\n", + "\n", + "  通过`n_epochs`设定优化迭代轮数,默认为20;全部`Trainer`的全部变量与函数可以通过`dir(trainer)`查询" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "b51b7a2d", "metadata": { "pycharm": { @@ -349,167 +453,20 @@ }, "metadata": {}, "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_check_callback_called_legality',\n", - " '_check_train_batch_loop_legality',\n", - " '_custom_callbacks',\n", - " '_driver',\n", - " '_evaluate_dataloaders',\n", - " '_fetch_matched_fn_callbacks',\n", - " '_set_num_eval_batch_per_dl',\n", - " '_train_batch_loop',\n", - " '_train_dataloader',\n", - " '_train_step',\n", - " '_train_step_signature_fn',\n", - " 'accumulation_steps',\n", - " 'add_callback_fn',\n", - " 'backward',\n", - " 'batch_idx_in_epoch',\n", - " 'batch_step_fn',\n", - " 'callback_manager',\n", - " 'check_batch_step_fn',\n", - " 'cur_epoch_idx',\n", - " 'data_device',\n", - " 'dataloader',\n", - " 'device',\n", - " 'driver',\n", - " 'driver_name',\n", - " 'epoch_validate',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_dataloaders',\n", - " 'evaluate_every',\n", - " 'evaluate_fn',\n", - " 'evaluator',\n", - " 'extract_loss_from_outputs',\n", - " 'fp16',\n", - " 'get_no_sync_context',\n", - " 'global_forward_batches',\n", - " 'has_checked_train_batch_loop',\n", - " 'input_mapping',\n", - " 'kwargs',\n", - " 'larger_better',\n", - " 'load',\n", - " 'load_model',\n", - " 'marker',\n", - " 'metrics',\n", - " 'model',\n", - " 'model_device',\n", - " 'monitor',\n", - " 'move_data_to_device',\n", - " 'n_epochs',\n", - " 'num_batches_per_epoch',\n", - " 'on',\n", - " 'on_after_backward',\n", - " 'on_after_optimizers_step',\n", - " 'on_after_trainer_initialized',\n", - " 'on_after_zero_grad',\n", - " 'on_before_backward',\n", - " 'on_before_optimizers_step',\n", - " 'on_before_zero_grad',\n", - " 'on_exception',\n", - " 'on_fetch_data_begin',\n", - " 'on_fetch_data_end',\n", - " 'on_load_checkpoint',\n", - " 'on_load_model',\n", - " 'on_sanity_check_begin',\n", - " 'on_sanity_check_end',\n", - " 'on_save_checkpoint',\n", - " 'on_save_model',\n", - " 'on_train_batch_begin',\n", - " 'on_train_batch_end',\n", - " 'on_train_begin',\n", - " 'on_train_end',\n", - " 'on_train_epoch_begin',\n", - " 'on_train_epoch_end',\n", - " 'on_validate_begin',\n", - " 'on_validate_end',\n", - " 'optimizers',\n", - " 'output_mapping',\n", - " 'run',\n", - " 'save',\n", - " 'save_model',\n", - " 'set_grad_to_none',\n", - " 'state',\n", - " 'step',\n", - " 'step_validate',\n", - " 'total_batches',\n", - " 'train_batch_loop',\n", - " 'train_dataloader',\n", - " 'train_fn',\n", - " 'train_step',\n", - " 'trainer_state',\n", - " 
'zero_grad']" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ "from fastNLP import Trainer\n", "\n", - "# 定义一个 Trainer\n", "trainer = Trainer(\n", " model=model,\n", - " driver=\"torch\", # 使用 pytorch 进行训练\n", - " device=0, # 使用 GPU:0\n", + " driver=\"torch\",\n", + " device='cuda',\n", " train_dataloader=train_dataloader,\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch\n", - " progress_bar=\"rich\"\n", - ")\n", - "dir(trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f8fe9c32", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FullArgSpec(args=['self', 'num_train_batch_per_epoch', 'num_eval_batch_per_dl', 'num_eval_sanity_batch', 'resume_from', 'resume_training', 'catch_KeyboardInterrupt'], varargs=None, varkw=None, defaults=(-1, -1, 2, None, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'num_train_batch_per_epoch': , 'num_eval_batch_per_dl': , 'num_eval_sanity_batch': , 'resume_from': , 'resume_training': })\n" - ] - } - ], - "source": [ - "import inspect \n", - "\n", - "print(inspect.getfullargspec(trainer.run))" + " n_epochs=10, # 设定迭代轮数 \n", + " progress_bar=\"auto\" # 设定进度条格式\n", + ")" ] }, { @@ -517,16 +474,20 @@ "id": "6e202d6e", "metadata": {}, "source": [ - "没有问题,那么开始真正的训练!" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_train_batch_per_epoch`决定每个`epoch`运行多少个`batch`后停止,默认全部\n", + "\n", + "  此外,可以通过`inspect.getfullargspec(trainer.run)`查询`run`函数的全部参数列表" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "id": "ba047ead", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -585,29 +546,27 @@ "trainer.run()" ] }, - { - "cell_type": "markdown", - "id": "eb8ca6cf", - "metadata": {}, - "source": [ - "## 3. 
使用 evaluator 评测模型" - ] - }, { "cell_type": "markdown", "id": "c16c5fa4", "metadata": {}, "source": [ - "模型训练好了我们开始使用 Evaluator 进行评测,查看效果怎么样吧。" + "通过从`fastNLP`库中导入`Evaluator`类,初始化`evaluator`实例,对模型进行评测\n", + "\n", + "  需要导入预先定义好的模型`model`、对应的数据加载模块`evaluate_dataloader`\n", + "\n", + "  需要注意的是评测方法`metrics`,设定为形如`{'acc': fastNLP.core.metrics.Accuracy()}`的字典\n", + "\n", + "  类似地,也可以通过`progress_bar`限定进度条格式,默认为`\"auto\"`" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "id": "1c6b6b36", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], @@ -617,100 +576,32 @@ "\n", "evaluator = Evaluator(\n", " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", + " driver=trainer.driver, # 需要使用 trainer 已经启动的 driver\n", " device=None,\n", " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", + " metrics={'acc': Accuracy()} # 需要严格使用此种形式的字典\n", ")" ] }, { - "cell_type": "code", - "execution_count": 11, - "id": "257061df", - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['__annotations__',\n", - " '__class__',\n", - " '__delattr__',\n", - " '__dict__',\n", - " '__dir__',\n", - " '__doc__',\n", - " '__eq__',\n", - " '__format__',\n", - " '__ge__',\n", - " '__getattribute__',\n", - " '__gt__',\n", - " '__hash__',\n", - " '__init__',\n", - " '__init_subclass__',\n", - " '__le__',\n", - " '__lt__',\n", - " '__module__',\n", - " '__ne__',\n", - " '__new__',\n", - " '__reduce__',\n", - " '__reduce_ex__',\n", - " '__repr__',\n", - " '__setattr__',\n", - " '__sizeof__',\n", - " '__str__',\n", - " '__subclasshook__',\n", - " '__weakref__',\n", - " '_dist_sampler',\n", - " '_evaluate_batch_loop',\n", - " '_evaluate_step',\n", - " '_evaluate_step_signature_fn',\n", - " '_metric_wrapper',\n", - " '_metrics',\n", - " 'dataloaders',\n", - " 'device',\n", - " 'driver',\n", - " 'evaluate_batch_loop',\n", - " 'evaluate_batch_step_fn',\n", - " 'evaluate_fn',\n", - " 'evaluate_step',\n", - " 'finally_progress_bar',\n", - " 'get_dataloader_metric',\n", - " 'input_mapping',\n", - " 'metrics',\n", - " 'metrics_wrapper',\n", - " 'model',\n", - " 'model_use_eval_mode',\n", - " 'move_data_to_device',\n", - " 'output_mapping',\n", - " 'progress_bar',\n", - " 'remove_progress_bar',\n", - " 'reset',\n", - " 'run',\n", - " 'separator',\n", - " 'start_progress_bar',\n", - " 'update',\n", - " 'update_progress_bar',\n", - " 'verbose']" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "cell_type": "markdown", + "id": "8157bb9b", + "metadata": {}, "source": [ - "dir(evaluator)" + "通过使用`Evaluator`类的`run`函数,进行训练\n", + "\n", + "  其中,可以通过参数`num_eval_batch_per_dl`决定每个`evaluate_dataloader`运行多少个`batch`停止,默认全部\n", + "\n", + "  最终,输出形如`{'acc#acc': acc}`的字典,中间的进度条会在运行结束后丢弃掉(???)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "id": "f7cb0165", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -750,11 +641,11 @@ { "data": { "text/html": [ - "
{'acc#acc': 0.3}\n",
+       "
{'acc#acc': 0.43}\n",
        "
\n" ], "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\n" + "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.43\u001b[0m\u001b[1m}\u001b[0m\n" ] }, "metadata": {}, @@ -763,10 +654,10 @@ { "data": { "text/plain": [ - "{'acc#acc': 0.3}" + "{'acc#acc': 0.43}" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -780,39 +671,37 @@ "id": "dd9f68fa", "metadata": {}, "source": [ - "## 4. 在 trainer 中加入 metric 来自动评测;" - ] - }, - { - "cell_type": "markdown", - "id": "ca97c9a4", - "metadata": {}, - "source": [ - "现在我们尝试在训练过程中进行评测。" + "### 3.2 trainer 内部初始化的 evaluator \n", + "\n", + "通过在初始化`trainer`实例时加入`evaluate_dataloaders`和`metrics`,可以实现在训练过程中进行评测\n", + "\n", + "  通过`progress_bar`同时设定训练和评估进度条格式,训练结束后进度条会不显示(???)\n", + "\n", + "  **通过`evaluate_every`设定评估频率**,可以为负数、正数或者函数:\n", + "\n", + "    **为负数时**,**表示每隔几个`epoch`评估一次**;**为正数时**,**则表示每隔几个`batch`评估一次**" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 10, "id": "183c7d19", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [], "source": [ - "# 重新定义一个 Trainer\n", - "\n", "trainer = Trainer(\n", " model=model,\n", - " driver=trainer.driver, # 因为我们是在同一脚本中,因此这里的 driver 同样需要重用;\n", + " driver=trainer.driver, # 因为是在同个脚本中,这里的 driver 同样需要重用\n", " train_dataloader=train_dataloader,\n", " evaluate_dataloaders=evaluate_dataloader,\n", " metrics={'acc': Accuracy()},\n", " optimizers=optimizer,\n", - " n_epochs=10, # 训练 40 个 epoch;\n", - " evaluate_every=-1, # 表示每一个 epoch 的结束会进行 evaluate;\n", + " n_epochs=10, \n", + " evaluate_every=-1, # 表示每个 epoch 的结束进行评估\n", ")" ] }, @@ -821,16 +710,18 @@ "id": "714cc404", "metadata": {}, "source": [ - "再次训练。" + "通过使用`Trainer`类的`run`函数,进行训练\n", + "\n", + "  还可以通过参数`num_eval_sanity_batch`决定每次训练前运行多少个`evaluate_batch`进行评测,默认为2" ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "id": "2e4daa2c", "metadata": { "pycharm": { - "is_executing": false + "is_executing": true } }, "outputs": [ @@ -884,96 +775,6 @@ "source": [ "trainer.run()" ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "eabda5eb", - "metadata": {}, - "outputs": [], - "source": [ - "evaluator = Evaluator(\n", - " model=model,\n", - " driver=trainer.driver, # 使用 trainer 已经启动的 driver;\n", - " dataloaders=evaluate_dataloader,\n", - " metrics={'acc': Accuracy()} # 注意这里一定得是一个字典;\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "a310d157", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
{'acc#acc': 0.5}\n",
-       "
\n" - ], - "text/plain": [ - "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.5\u001b[0m\u001b[1m}\u001b[0m\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/plain": [ - "{'acc#acc': 0.5}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "evaluator.run()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1ef78f0", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/tutorials/figures/T0-fig-trainer-and-evaluator.png b/tutorials/figures/T0-fig-trainer-and-evaluator.png new file mode 100644 index 0000000000000000000000000000000000000000..a98ab83b48b29a07ba450f077a95fc5fcf5a8659 GIT binary patch literal 104863 zcmd43Ra{nEvfZaD^K@VC z?>;GO%{Av7V~+Tbm|u{Ll<-T0*9Z_05HCeV1mqwfo?$^iKr_KX0Z$C0WicQi2q8oT zJ}S6C?l-@1P*8T;a_?yG(!oEh{Q}8DO4MMg=Ct-P+G)gOA&QfckJ)y+nw>-$>Me9- zRVG=^5ySQGS)-hn2 z;j~x+EpzAXZcm7_pUZH08&`;J?N<>jKj6jYdwVWFICcH}`K=2Q5{?%F3dIKkh7jV< ze|=s;>03->ApHB}pO?XHtRJ3+^V*?)&qfi5kMdvVupw+&qH>FIJIePxM1=uf_Ll&E`JG@=$pX4Ld{JCPNJFv0@{ z@z|r?exb?r^kO|329uvK7iL|P#A+u2saNOs@Lgbd!Gm8S0Wh2bMgXygsPB4rvPk)C z?dE(>J2`=3Tp#}*Y6wN)A~dN)@n0G70W^*<25n&SROfGpV9}=Wct4Cw^Lzn_{J;r$ zeQ0`xu<+$KnSNpr7fyj`H7+fa#+x_vh-p8F-#Mx{52w8lDu5j4aDeDzj z;zVGk5yAr$NB<uq{h@Nr2l&K#|mQNQJ;)*c?-J{Q6k{%TuoZoj!}0( zhUB%pm=6cL2WPLUO^}_72hKE1)sh*c9m*C-@-p--B|n%_Ae5I27A><*JX(~@SG34f z#atmyOlzL!PF2%uV>q90PVbVBNe?DE&f|$eU#9L%AY8<#AYSYc{WM(Wpw?u2Ie)$c zrBtr0ax9Sh-8ipcc+UA+IJ zqt6k923X#J(})TJ&NVR&vlc9}BCnxluO;(}Y7#MtJXaM?Z7nBLRLu~iF^ZG9G*b#5@|8FDcp#LrTTk2dU7b9}&Co^jvrzO=tP;l}Zz$gKJBJf zvgCI%mFj?wCX0Zg2Zc2KkWCG9jIjpFX{6FKE0V)Kf1I=<`d38_?H+4C7a4ykI+=Fw z1k};0wq9-Ql$Km)@WCyR=ls@6-)v%^V5!3HFBQLu<@J&mu2Hi8$*Wy3FUy(NWNMzB zrqT$}k*+a5vyelbU*@HTYZAM?hi`|~QLhQt{G5jM}Ctl*9S1V*ZqbaDgu6awiWk427f@j}VXGw!)& zre=fMl}H^vhHbK6B#Doc`g=%y2ML}oD8?Xrtn9Wp$FVtCdK2?9x*YZyYf=i5ft0!9 zxmrgJk)|(%ER{z;9?6*58;s6}q;AH>u~Z3;2P36vU44`k&&$$V%%J{4p2not@9wlO zckef_H393Hw1P%EKgXEwe=>PcVqRFxEb?*@Fud#Cq<$r`J!u&zx57N0XyojNn_2&)OnmY!wO zB^DbC4Phz&nz|iHCX)EiHeaLg3g&(1Hwp%e1e%{u6K6??o7~mhYq(SC%%!WYpS_}$|^SJ!HD_$o%}QVkE0(+3KIH`n#rW! 
z+C`}y0Jb0+3ZL+lt&G+%yqhdFo%2+oXN8_|flaaOy11GLlGGpPtLRu!>vrhidhCA1 zQ~&+J2zy5DTS6PCD)1J+$E)DHIPAUzXQOUAA&{0%omeoMUUPE?%Xa}7zZ<1VWbs3g zJz`-NiJ{j2V~TqkUrA~ghky~>OhAsJ%JJ)Wuh<=gv+)~|m?I#^j99QmGa5WM3)>RR zxN~_59qNYMOLWAXA^=?e@(I2CX1AmDz?x*1IOf|pyTHD+%U3Mv<-Jy%G&_#_g!M9V zH0CNfpxtbE!eVbvX`#HIQQi~Eej((4PyWe95sZ@CrgUGLvWvZf(n7HVyCfW+V^(z( zM{ftxIn_t!xi`6$`o3IUbEYj>5w%oav80J}h`mvv|GC4^Pb8hED>*g-J)i~GGsel^ z{^uNgf5ywYA7%vbu<@D#?)s)Uc!l7(xH<0K&IoUhlYKe-F}A_Q0!*wsGh=U0t&*vu ziR6J2?Rho`-BDcBr#E6j&;m;U8q5EdT@EAgHWQw)u}X0!C#dqjE9x$XR-HhmR6HSM z@A}@)p!bLMntoM03W+ilMaIC_$OvY3=<93=*KsPSY=J^dnsp~1EEPd zt5PH(xQ*#Zvt0Elrhk;&VNT-W6#JM$O?=QP$u1<2 z=K9Yr5B2r+wd2?^C?rMN)iPJeZEyaZ9Iz@UA0^AMA^uM$5Q-`6uP8z73uc;8y!P)Y=Vtk2v+L+eogJy|kvF6%DKYdcyZ^Z2HwvWv0h&*Bt(suk-FbP`r>) z$LxwJ?AZ4l&B(e@Fm5aG~nRGpNK?(Y`SoWw4JAvpC(4ykMTa)v;{7 z3Nr^M)G-*r*%sN!dJj0^W-}t0&X{2fz~W?T_d6v@Ki{4onGBCL2tR`7bb|R!(|?Nl zV5mhq3(9W@9Td$aWOKk>=9_x!rJw5zmVm%?6*pLhwf@(eJQdRMqx~fUYH+qw;|0RT zO<_a0fqEQT-{Snb@D3sP#6yVP#6)#Fb}~7{|3kfR=>F4XR-Q>pTJ)-8^dd0M=91|< zn?P1zfE8Iiisp`XA?Q&*c7I-`SEVpa?}Qu}{QN=j`(z-;`A2zihARF#~Moya2i;f+3h#mv2sPLuk0@mhsN6PE>eA;?^S5pYHKH1qQ>MNe&sUA;4@h6eaYio zFn^`SO_aTBOeHb0aHnFf%{WAZioFUer}Fkx_*?_W;o_vh$Y5@2xzG=WTY?1%^)FIE zJGw{uu{0+HPuq-g_0B0RyKUegyZ68A1fy)2kdHQ!qbsq&s7ChrSNeohWRGaPuRi<>q^SDg(bx!V&@n_M&d9vyG@ zZ+<)qCM;h%!%JL&q~QM2=Tq$c5L>V#>7-82u~44Tgi(0mYe+=TLYvuSu5Zf^h@<|4 zZV>(tHaK?=MzD>;cAMMD`a+Fq`}KOblE}mP*_eokDMY7HPQ!9z|BD{h@P^IjFs~y%WrEue8C<8kz3s_n=1pkyJ(OMVst>O zpTv=vE_V!tZ@+3B(rO{*Z4gLh$ELoY!XdYEK5;S~V-v%j@RTn#~pRXllKb8^PN{g#_dPH&UY$TgB)R@%|e3+2AIxxaLO z5&K-VMea(v)WK|7YF8`f$}V|1_^w~5mTEU;`af?^Xtx>x*PjGOfP_ z)>@w@cz-oc5%lOghrcqSk+Ro8`PZPta0NWq2tTS5(p9psCWQ+E^YW*5VrQ3at4VS_ z5l-opaC31;IaVc}T*KriWmkvL?`mJRk1*#R6~czu9}|lqY`yvXo~Y%mVil@fNu9<| zGo^dU2{I3NfxCogy)=$8b-75x(=16uMEDdPa-%>E%<${EguFEN0=ox`B3%oz-a(ah zT%s4PQiF6(>%>2Hr%|AxqfXiWI|Ba>hF!SK^G)F}Z=r0_F#rVhZkJGDtA(i?F)+mx zJfmPCf?S?d{Og#L47T^Qw|-@};t=n$_t!Jdc9vhIu386P10qpWkJAgr~JLd{RHmO^pm zCDvMDodjwrgkXbc>hrDs(wduNTnpZ1a5iF+@6q9wOlo(%K zh_j>k0_{eGf7!-=RoNd?Mc5x^GB*+VMBO!Uu8YBRCtf8Hwa2^s*!$bz@QQ?dGIcgB z?%~;Wm_Vk{vjZkP_6r#zS9TH%4BPXMr z+Ojnb<@k<3hb+^h;B(eWT9wbg`f6i-!a5~j7E!-eL62;e~;CdH2vrv_ferpO(mZy zMjDg+6L|SYT1z|litp~gQG}N7>TE$N%kfgrQ`cv|X4L--(hl|+I{!9^<+_+YO%3cSI#b`WyifS zT&pb@we=k{rUz$@XboJYF{3tRWzAoe*0@(5xrY5%OUt%I5Pb%V;Rp3}qy54rhyx|~ zPQthpKFXF?l&)&A71gyL?a-b3)4)-YVb+`zAHoeEv8A6bA2F0C;~p!vCvSKnR;6K7 zwab#AGQ}ePwAMD#6kAZaGJfJFv1q$}Osc*)OxNT(JfRf-n2T*@yZrMI=;QIu`K&d& zY(-epvcBBH=3y5)^8+j*sA>jB7Pl1BSGRwK)^_@4U~MxJ75&P)yt?=b{_RxhZZ zPFzTus%eA@C|U|^vEMcMa;BllhR|5=5zxK>XEV&`e7yT)IyA5)*qVV!a^Ib&i1xV- zvW^^?naA216>#zP8lt3 zT=&iCoC>J=E*FNq+avMbw;7kGe9~r0cG7asVP8m!Z?oDStekmYRCXx&e!Xoa+`qav z*yd4RMfd%1#_ih1-~hyG13Qk9HW`PW9zXsZJ!4o0?kCYE-f0onWKB$MCMj#om@6sM zf~I|CXC*RbWW{|??%rk|Gi8&zYS*_=wb?HzUnjH%^liE3DP9R!+^I@k30 z`WZb9hyPp?zm@-jkzvSK=mVzmy@o;d%?coV*x5ndG!3b-ch}? 
zJz|4NCftC_Y1i+dPbG#9ki$Ll*N5&~~AFLc1Evryv#6?v-~i zkdax=S^mI#GV2>!qcwe@+T%JuS&fwKA-22n8B{NzU{r;Vc<_)e;$?a#o;O+++;|XXcqZKGYuGQc+9ii{?_b7{~izA3sNdM#LYizP0mWqQL1WxrYO#_FJXc z68#L0C1{qxoo3C}4_m(Z#g&f^E3ZGTi1Ke{uR(kP8S(;bd6PJ>~O-k?< z-K7^Ja$W&ur{nntZK`{RI&cN_~v*x_!LW_pb+l{G&jbfb*oYPN+U1#6FJ z2ed0ufkg}lFD*xu0y`l|>PI7ILuWm@$Gb7#Coj&3t-8i4=G>aNuS#*-*>{>(PO$!vE!7sjg%Lc?Mey?xVVu4Q5`R6F})_L zfk{dWWnK2bTSIyQS}uG-@ew(Vg+~h|>+*}OxWrbuvN@RSrB5LEj8)aQzOCGY1Qnwn zfT5liW3E~n_J>&?X?c_G`6PQrB=KnDny5V$v>nWRdW-D?-{fH1mNNy^36kzwfr=9l zVJ%vlMu^MJANwZEW{WyChP*O^KfWSPce}BWi8qI5=lhs;7VVx*-GKO;!GEDW8O9Z$ zLbrM1QmSZm96{@(<_iVoS?&yb#C+wkjs5USjPGkSZnA_XN~C_ywz%Dm z?D6`GI$)=8YQkg1BhD}3_pZPd$?&mz?zVnDq~&DhrFC(Lrf&`Yh2j3l_?6c|uf3)N zvXu3SxE%6}&ARaq1w>pW#K{&J7TsUub=-rOoYSljLrfitPx_d$Dy?5e>Mpjz~bXF5=b36%r%r2_Goawu$-@LZ8 zYFVeh1f4j)Vu8&hXL2_(-VP!&RRJ+A>77rvi7f0tO@(?_hJi2TR{o+F3&86-*BZ-X zx_GTSI6S~axfZ$UtKQos{Br(e{#Wa&&IJGD!$j z=e3Qy0N4-3lG?I;+=RW8_$L=&+p!=*( zHv3v;-%~{MwNlSMceGTN@1Wo|;J28OeJ3Le&)poex8`$<9sm3s^`m4hywbc<-f=jZ zC%*X~b0sdo4=`@C+<32jQ%#uo(EqevKP$3&9T6;)YARH-9lYPp*WGsasa5oP1v%To z&io{tgGBSOyH98yPnfcp@b`_@gZ+4cc82?gP zWQ2&&32>B5Iq*#(+eUAWE60X+E~P^5ezWi&o%ZV^gQ}qjHi*P`-?S{VC z8}%~#Dl#DlFcd#nxuL`4vF zj$2!4TR%(}D|ZK@ld>Bjh#cMyaQx^|o*<3ggZ*XG>wsm2Z0FIBI8 zGX)B4@5YB4C56J8unde%CaqiO)TG>8F@XO!N-`FVo7y3CKIku2{%%%^rZiILD!$p` zo%vVzWuN>QVI0RqlZlS77VX|N8*O0y;W4OhAHi3Vn=2LdirzsyKTv)fkB?bu`2zkk zv{#^?;OjW;Q}g6(yVGLnfLgDuO+b4H&&R5msIaI&<&NhF*W^%yvGwpcj|#Yii(2~4 zwHWmP6m#+>ah$-Duz;wg&Tn>4fH#sv;#%KoyuNS7$ZcqUZ-324q4*4!*!~A6g8z3! zHyxbF05lRR>fcO6mW2=!fI)5$f^;Ie5g1j}**!~!*k1(UMLDs`c%k`Xfl>;(ly9~c z;4h4Arw?6v#37ILo7$qUFfo|PbH!1(^)~t)Uy! zeK`i`;7L;E+(pI3{dreGg2_vNsJV5GE+jwlpP1EUGuh^Vb+mQ(47cG`?IX6BQbh6PGtwhZGVe&fGei45-NVE$>T7lRvND-Q9MG3|F z2#lS|Fz)*?ikwZUr|}#KPY;hbvO}PYdbJ2ATn;jfJHQWQ>R_+XZ--0_S~%y>e*aJ` z6|sA7_L$B}+dHwc@)*b}|8&j@^oN5%c$3_*qb%_}|9tRlVJA&&{}(N=LPguvYEQ!T zsm^1_C4S(y-#cJ2dqX=`DNlrc3uzN2ivxVaXJ%NuRveg^W2frg=Y(bIX7F_W{VMpv#$E`fX{tiJ2L+ye8aoBzU>i5;Fh?vTs_6ZZ#F- zu;E*yC}FpX)V%F3D5E0!)c)lp3>^Q*QZsXTAJzd|Z$37?&lXw37=C^{u+o^eM$*xm zu%|hZnoF-yUkbG1AF=#pCgTGV&{N_5`{nkPV|2vf*e}fhOrB}b?-k|)~<=>C6)P7-T2{<=@srFyen6t2p6!l-% z7$KOjt*xzW9}!JNzl;Iq`KrfuDBfY<62tU!{^9!HU&cU$&CNzdw9W`YBYgcqR7pvR zYOxR%At1!h%VslX>}yL=mL{YK|sc#P*;d708|pLv%w`f#Luw3`nlKrAg;oCJ+S zIKEcP1k?gdp6g!{4TnBj7ZgTm?=hytev=&m#(#8i9lr9_{7JTL(}s;p~G%DG^Wkxp~)@Z*u}LSln?uBVFlpbM8%tipg+vffHhz5v{eeX2uVlh z6wKQ6uyt_n-bdzAKr~m%vY0#2f!bATR38+*5=$Vxn4`1BM6ejrDzL>nRK#vgsJ~Io zAB*>~YuVRBX3C_(cSTp6$a&ajD|6(AA|IGIM~q&paSF9l45&Yp#U{;ttrOpbzKH`C z^i8%Krv3x~5n?*3nrJm@Ef%{-j~T}<^{DPnwt0r9YvDlIZk^VH^;as({(pIy3=Q#1 z?uOlXu2FT>c)zevm@CuWW9Z_5}EMRqK&y(fFdO^^t_Y7!t#)#!=qsQm*WTdoM+qb^3wRt_z z+sDBH`o6!>%fEr9LlI~%*SxiI(=grrs;!Mb_6yWEZJ_aE<*x{474Id&`enXU+kP|6_NHlO zeOqjh2Hl){Olg{;bN?BAWh{PZW7HSIZBAPoR|y+HlXvaoMa+eIn&epgPndb7axbQh ztQ0&qBWZx&ZMj+~WTKO99p4abR<}LIlto>tMO(K1qWQ0vUjNhqbp}={WMbPHx0~0y zzV{P^hDD)l^V&%Izz~5a5kokov`iY@<;v3=BuZ6VPt$mW-i57X}Jy4kZ1(5_W?)|v$d*AdYwy(9C zhS3Um#Br(f`P{kmXQ?a3*V8H~@68?V2vXSh)HXZ7sZT8a$Y@?>Acy&fs^6f1saSS` zB4O(UbgInBJ&_ECd95z~MIR7hYt~*GlHKLH`JSpaCG+80FLm+I0B$?Rxuj;}>vW<> zPor-3jH!ZheD~@x?-mC>&p#$fh#^!ycu879QdvZ%P+iZ#mUHj%iP;V~0srICd?SOR z9SWLkM4A!8?*1)i9?eHQKwPw_%E(UC?9Vz9)UK*!_jAAb-97H;q}^KNzl%i)py1#A zj0+RQX&U*`XZOtkg*Q~aVSVaomMZJ)<+mJSUebhzbe3h9m&#+n3`aQ^+Zk~b0a{}C z&C==qQ4P>A?3dg;Az1qvVZF+Rmn;#1qH&o0qcd=xt>;#U+D$RSv*(cYR8WFx!?D!K z1A|L_4f~bmQdOP>`kp!8g0Q3gKT6r(?U(hoJ<;v*8#dev3pt*_%yB#Ukog7uHz&o61H!UeowgQ4J$M_7Kk6$7s*W-xS6ld&^<_E~hYxI`CJ#bX z)Wz**ZkzSB9*)9jEw&HJe>QH;v{`_k-^C_wE|4%%VE)540CK1}ian3Bp~CSmyzO$o 
zh=g%#6Q9|SD)i=3uqGa5cCiYtCCu4FyAUPI2hb#ov}@ikm4gQNqdY6GuaO;}5T$Be zh5tb{Dhwzmz@pOV?vncVTBkGAft?#PCUrR~bL|2>n=IR6bX|`anU*RQGGjq~*SPfB zRpkO-2bUPOtQnSPx6(8jGU`im2Y}Rki&E+{rkq8^T@cUJ`Bir3x1p-JmXNlJ^gY$y zaG_FNeD5QW+S9R&Zy%80&?55=ws7t#sD#`e(MaU6Ma$m%!v!#j+x=+sKXBZQd6x-40RT($ScmruahD;U>e4d0FN} zZObzO>Lyp^yNJW|Go4bT;s&~oF5Sf@Xmpm{>W0?R+8Bh)B^M?qnQ4vNK5mh`vjrv@ z15GOBXAMSc%p}mGF5f7JC$+cs>*dJV+a_lQyMRrAbAUxl+nM2Z9tq3RG3F(GJ$-HM z9|<7CrNU*fu#Tr<>Im|c$FO!?d>Na&)+MJoP-JnoLebsOM?%4VmGngecZV}OR)6Vy z{h8UxC!n6zJAU<9Eddc7ehuKQWa?1nZy1TkMDzZuAgPF;w5g8deYg-84eiwgT{cLM z)G=E|C(l*-ejk|$*N8A*d6kl%5EyS+zeSR|+E&C%M-G`#YcD?NZn?bxX{nyn6y=4~>lsBdj&p)7Z8S9Rn$R7GxXmvZ%fnWFQLUTXgGl$B^eA!v>767k9Tie;J`VHtm_2x;n zD_=w$t53-wMk@YAeyljCxkw(Is0gnpn!h;9?|C_Sau+!TyNNn$v1MJ(sJ+OTR%f<* z%#@f(6CF1i_I6T#fboZ?9SzJJ>^>yZENKZN)#F4SH}!_4r8~CY5Po{A7^BCp-!>* zqWjyxNC}OeS<^TiM94qHq=XWaJ&NDE4x=9=q^`|4y5tUWh1lW_KxD%$&U*YyOh-YG zn%a+g-yJ_EYQG0Tk0JXQ-Y}0eWrW$vQ91e4&HC86HWw1#e6_ZCEal^{YOH2=051#) zpffXgHg4&mSA5T)4qr+Rk*zKFVsW?f_Ds?x;%QZ%;RkhYp!+Js3n$GA$PW;=%T1YZ zWu$r6IsoMs?$vE=7zFF)Q?x6_tg>gm(!NK%Uz<=Iu?tPMsyI|vXxzK(#l_Viq6)SN zf-rzMbh6BNg%ZaDy+J}nL(SkX8%gICW#8-v2>)X?KLdP%qB7b`LU^oL(W|xDgBvcF zP5PDZxUBn=Hp%h1Q*U7Fku+Ch*%D$SNQ5x4tiy0Ug znaYIo+g1HSjgxvo2YRi$&yMe{1#Ysuowp`Dv#uFQtcH|LCM1BB#F94$3oIm^s)|QY z13@!7_T_qbuc0F1G@C6fSyoOxVy)P)b+E2+<22Tewt5#~0}j#3V&x@XH>xOil4WC= zzK0zdh&>|5mieRip-!lON51-EN?<9((blZ59UmUf3=1{~Y7L;lOYViZ!YaJjKRqMj z-)GdDE*wdr{sp8eK>+7BY_c7qa!UDm$lE{XN_r& zSF~v=xJMh`RFM!27nn2$Yl`b_6UcFd4|*si?lvB%PMw)@5s38|S=Z1h&0scP&B)|w zfoztTNzCoDi28oqpE^%Dr-!HSi!d;Oq$dI^oMoV=r0F5xxjfHnbRhgzBoA`nPX6Kh zdJ|1O7U7@0qbRTufgvn2e|@XlHy@T=YXyMj(5A{=-EAQ6*&OflZejH|j6} zCwf*TD!OiqyWB2(_1+Q^dw4UF)DXq-p4Vf2OLM6_UwaoBjgBv`aP`57$zWmKRQ3Ynk43lC2Bs zkJ;+W@?Pr8z84-p&hWT2AzABr;(sEM&j7jNqBvWJs}8mIdQ}@bqlMGv=JCYzEUv^n zw@x&P=-Cwo#+L0QQ$?pR_;F}2u@>pblZ$u!)?(QTp*D@71@CK`mjM1N>*8$io?1|y z!ueTBMp?_?i2Hhvg1uE&KCJ54;c`8Yh@)OwV7W;j#J=3VQieNMfuk%JB1e+mo-%8w@QYj#6-xT9M}YY!PBsNpz}$~~M*a7-J6E2>L>y-xX!W34 z7EJ=PwQs~TL$v;kKe5|kOoFsL$VWsYior{>#_s;5ZOf9pSIOjpcY`mq7V|M?w?U=* zr=SjpbDe4(J0yQMsJR+dG|@Ht8e_-COPRu)88^5ba$NdyAXQL|F&)%bjb(gSEz3Wv zH6vWF@^G9aG-91cl-TJ*H)rGH(;Q1{d3%4aOH?;<$@Q?LF-|fz95Zeay@{SE*C?$3 zQlww{VLwLe?O67}%pUHkR8?mkZ)MH=w7QpWIAS)<(=a+yo5X|W%!COnN1Dy#=~Iv- zTI(z2Jn6Qz9bG*|u6moWtxYt0WIItkG*@1!9(BIH;;D{vvV4+OhL|m4db>=a8oOv3 zx72LgHgvna*|@MsWXjK}uWc$IBK2CsgNWFbW2S7;gF;E@a(XneaX%0@m^eMa7+9J9 zPQ9*#`KcjIbG#+4aaqG8~Wx^uh?inbJ_1$Q|;c-r(s?CcG$!KjCm1F%HdT9yK zT51WYm8E>3u=~DQ2K8a`!~O)Q@6>^?xNjwC$3ILaLSv2AC14wc5+0;|T6Xzr*(?F(p#KAMiX&OBM; z8s8PDg%kyU=Evu@Qkk zdI1^^Lm@aan*Lf6HKZ*^ynW(xIhvUUAdn+gFspJS9iXZw#m)Rd`mE^~A+|uu9~w=R z#TXP;cQMb@C4Nl{!4ztTu#?N>@0M2U7O>V0ys=lBadPEs1xv46C6CyqaF`s2DX3p) z-83~;cPG&)8OH(oG|<+nSGK$Z1FL6d%l1ka5-%Gx*QF~#b9;NP3l??qk#+EGvzcqn zJ`+MfOY4EL)uXapB4EGXLe0x>OUp(eQQ$!hIQNKoqdp0*^m=79n2EW93tvlM_0%bm zi$-awV64og9VEj0vq33wBPYlcT7+Vfesbi%T12NLcorz`0J%TaD4BEf$>}4gM+=r? zWBKc&Rmq6XqNm3Io64hiY-%t`Z4`KAe)VY0>u!HO>*}zE#n4kB380RC5Tt7A*7De2Q z)j{e?(=F8-tR{kOqn1`$e2&9efr;f6)r>NCPG;)ShV7Q2J-UVE)q$Kt#bs&g>X)Y! z3X(6ejgNTwmt06=y^D0?(G$!Ivg+>~1lzyP{4juT z7&R>&;+`NOs$0jZZ^@zsx!pC*>afPI3C)UA{H6RYz~;V(+gF>yXMlXF@x?wU!w9L? 
z{v%aW-)56#5qDF6tW>q}lW6l#?GYiI1)!(;$WgH8ZFZKk!^?2G){W*mhu(75qp7#J z{2zl9JXG$?(oJRtW+_;ho79dR=2{Yp5cAqf3n05@k~WA1mZr6?BDPx&{5PkSU<21i z#n0KNxil!<#9|`@Xe?MsP!(qooD_8tDJrG9w8AqcpE?yhX^k7$T9>~W!6f{Jgspu)_U=WsSS*w-XbJ#0MSfvCA; z)sQfgn1)lXG|9~{Rhzpoa{~ey4Kb!y)D`W=*B;`pd9bE($T14cH}H*Rzi4hQRGHxD z{v0kwyrid*?la?dzNU@B4ekHv;O&!R;$c~BxW$LTP`|4}j-@q(~hMX*F0#q=Gfq(jMbh zN#hh0h?OroyBiIsb3dj|2(?idx9p6G-mYkndn#-~B_>yGBv;ZH)wmCY#53cRCes_L zAPx#2t?RYE2s~qO7}5flUbI#(X(X9s7qql#W^a^NH%gl!KEK=w*4lrZJb}8KGN`$N zrVGRq49&CjjJccEPeIl`_|E$`5_c+`d4hh0##+t8vcS%!0({0xE|_{^NVd$n|v!x)uO#*H%`!>f|yjx~V-Xvzst(%V&I78NVW$zl^$EnGgMMwD*0@ zHKH{5-cW~8dK5z-9u=J+cVF*u@*K|!3!q}L2p=0-Y!rp7ginO_x@N+l&iTnIqJrf| zm#-(?Ff`K4&|T-ocpl#GJ?Ae z(mC%p*4IdC)nweq&Q_*{Vk>`p0WNf(6c>@nqf&P!s(i2?d1F0BqZnesXRKjRz~Rxe zdh}@8da8e_+2FAMJ(pmAvG%;}LCUl_-PS>?@2L{|E6(!mSw=7KdC+F*XUdNU2OJ}t zomiH8W1FP=bKV#|Rhyy%Gu^$`+TBAENMzF&=39$A3^oz_pz^*!4i5*=q?L9?bF=;r z92R%RT&apx^YM>V<;1kX%YMl(HN%a8louFN-p~QSD9hWWY&C{6` zS)U&dPxK#o{3*gOQoSG_P-@=G1?VJH;5=ulsmU)kT>6DNs1h&XOi^k`rE4g{~=) z9ZWoOvf7jlolaw)s(Xgp3aQBNpZ{bQ+q_jAT>=UouroW*5Xc(>*QqC|%`g(F_l(e8 zIJCB8>mOUYKTfb4*!G&6y*3B(MCGut=@SsH-?(E@$@iF(eQsgT?ugV>4(d-(q?uza z{xux?>|TpTPMda+<`S$cL3XAN>f-1AW&BdDDQ-xMt=3)`U)?!(RAAS7&c#}7j`g3pG{qBQ!4f*`jo*YdpJ!ei6BCp zOT#RJPy4XQ&CRs}CgZe)d;}-sBSknZ=p|gcoTkGdxG;$e*_3{z4^#zzl6ZT;GXvx* z;ruR|V|j%oYh)QTQ$qbW@ zC=ia89yK%=CE7o1wmOt8h&BI-ZzyWBNmFwJoPsLwAv>pw06}7qhRPpoo)6Oql*BdULnhB-*D>pX5?rT6F{@jO4x#Dt$r^B;SaXF)6&lTN9Z`s0rzEl_Cq4jhw4b zK4-2BaSL93_To~j;WiogmUiH?g2X=HZtP|t>smHUN;xg@StpvIi)7AZs;@|>tJH<$ z^N(U%uMv{M>p@eyMsY!x=#tfP;u8u?G{8-hCv?ZZM2K%^&>2Wa|-;xg_gFF zlwUIbY;^9iN{n|OrfpMDw|f}2pI0+YL{lVyHJz;C^Uo=rw|MoF4)5(Ls=nknh;a$y zWv-rKxvoFnDAz6JsCGgG+77aE-dRY7Aya43TrpQ34G8+{6bqQW60A>87VY{n?K)gr zg!2O)Ow2x^w`11$%Ccv{Tv#7QTD^SyPZgVPO}&0a$*AA_Nwe3C9n z%3trWyYS`P`$res)k8l=RrE)VKLp5rw|e527Y;HTor|KtCiZ9vE_MHjc90xV09Ulx z7w_ON>#e@-c+4uS9+%oz9Vz-&>6*%EsomPby+JSgHnE5;?QwnXPCNXm0OEktGjQqIpK z}M&hiJ0>eaH8)vwz5 zFnXw}g1kHluWJECcDX@2^Em-0fR_D4$A?=k$AVhI!&hb0e@amxGllDLxTI^FZwsbF zwJ&hcqpUOIX>@q0``HY`bjTe^-b`hMaD!1~{w%-*CF0x-!s62fvJ$cI`& zqHme0F(DV@=rt8Dvb~SB%?SjgHNc$rkl0+d!D-H^YsP6h_G!I^fCh11^d(W3=YqvP z{?{}k<_oLT)WEPL9o+a6mnruJ^B8s!K#j0n4L5EZe*)2fgU%%q0eRK60(P}nVCR@Jc(7P+a_DKhL zpzLCJ*ukRUg$Xex8VO(w)-{!}ZT=AZ4pWa}C)t-;>P*ol^FBxl)R3&8{##x)dp^ z?Sdl=d~u;%zwUqL62?G-6V~KDvguox&W1TWc^QC#0Ic~h;I$MUz#U#1((f{tnWuNz z3U--D+2+PWEs3`3XJgUs?OdQZXyViGu9HuL!8r|fj;GOkSXle@%mcg!1|9(|Qvoc~ z807YU%G1BrC2v9gFpzv%x9I(hA$gi9q=cZ3lrjS;xno;j-CVp`itxd(LNUVC;Z^vf z_XJgSS|A^ykc+Pl&-7*>`?L|j(%s(M5fjpb6qrVKi;9EHA(@1j;gMPY2E*4D`XGapH7dBL*Rm0Z6wQpy~}6K^lCI- z&islk?mPeitpR|#n(r;L$ssopY1q;e8^p4;-$8X}G7p1Dx- zdcnF!K>4EId+Dg#VaKGhuS;jMesvHU5_8PwW%{2`Bh(#D>+Ac9QD8ox{Fan4Xl=>N z(Br)23~oS_w;7?4Q>tgnTd~nD266FxV>0Nc`kwy9uLor~H!AO3V&dYU$l502QXLPQ z;1Zu$IO!QUGL4$SJIPYpLPuf|+yn-<#!yg_$VBhn4>Yz6W6vf7l$v~eUo#tFckr3SO9u+>qWez%L zmMiTD`a*=l;ghJVYqQ867yM*avm#wn#p)t7UvwLZc<@L~xp79gV&3)Y1Xq#U`09w> zv-(4h&uobhG!vSwWQMe7$vlt>0Cy8hV;6{i0+ZoTEn3atBlCO2S>*v&G4F>$)^&M@ zivzq!EY1A(&(>q@t{f%c)62>-ddcbSguz~GWk6Z5nB^sEw^&KZy{>NUu&K1MQCEXy z{H!m6P1}gYJIj`g`yN%g#L6XGMLTm5TB#GBIpL&G28y$_$l&f{kHz23F&ZSJEzYxbEP zb%);Dc=H*BK|fa{)5J7d)WSwWa@cUv&-ydd0%+dp;n#%#v_lKx$f`QUtkPO|t3&s< zEUgS79IyWBrcK|$14hKEc)7n229WqZc?Q&L@i%|kbbtJZPye`K^dE5Pj)SemNT+u{ z=KO70QO$J6J&A5=YGR;vAobuOi-E$3t)4Bzev=eyb5m!D2q-&JcnitfRa_ zarsQ(=dL^wX2qZ4z|qq%PE@w2;}#09ogr>>oULQoNxNp&TXc2Pw}zIx)dI&MDIce_ z`PBn6`sNE&$NU!EKPmj}{SlA2$KI4HA=opm=3Gw}uKHcg{{7fPO862ai=>6QA<%pPH&4<#-!~Q0Pw}NaH9|O#|ogAKw@iqY;0yD8T98C z>@*yu=*mq#5i~27`WEl$L%%)NS}JQ9165eS0BXsIGGkg|fAZ&v9WGioXa?fU)^hU; 
zk&Nv(b6Ntp=kFff+47Y4;j1c`oQS<)8YNa(D%NglSKx3Sr|JpE$xtfCA6Tq5#f&E~ zNH%|c^Q$eh9~XW$0(-4Nn)0EO2Gr5_LEXrwbxc(yS+<1`Sk{Ek(G`=~0hFuSdiwmy z!oS@(1y4RUPNu2gaS$0-82xJEc$=J{q{UHOWa|&Dv>#wqX`1)7m^Kr2q03_5d5RvRI9%!S$n@6L3BIqNwCK@bg*zP@C@Nh)3Ab|iCdxFt3M$B_ zA)nv4drmCDIF8rty?YsL#?{mUKjhZR%-`2>@e?c*5(%zxM6u3?%?*8kwQ4H&P=C~1jJ`JGICeH9KR8Gi*Unbn3gF?d`u zdpB+au;pN*-UXA;&ebGEkCF>{E1df{TU(N$vfLm|$!e;$Pf7OqplZWL?Gh!ZLxUzw zYgNIduG!w?scIijJgxMMTkX|Hm&&(QXT_w|%?-D#wi<_b={mb^IN?sJfncCc-IH>; z?{(SVX6b_Qr~3nF-9tg}8F6Bj!`~3k61LMcgQ84tydQcn6{ZbK`&8Gx@BVXw5Nfb- zBGJO}vRpS~i>W%()g&p`5AJGa_w5(cthdm(G#D21e?v1H|771aqRcYd+BTAb*}=W3 zoLa6 z);_ILiWPYFP(ux1g{8G~x>Z#kVMn|RE;^320SDf_>*Wwhf^QQ9+w@L~urq}|`m$M- zZRD;vq5Oh|M(MXbS{PF{(_R}E9|$$1pYra3d-E(of$-lvFoms&tLWU362*6=pfg^K zAKbj0z{IjRnz4UOoxi`#^SSu+UH(^=a~pMFx*?wCp$knePEeNfO`9--c7!gLw8Qic z4R4?YoyV{v;Q6(b{~wU-0Xv>Y-e-&BRu*4+f?2Td#}6#isKkFU-hV?-pi>?|ZPOhc z@7&`*0spSZJscR)lnz$XxDu$XraQB>K@&8 zG-!kkCwn1W+AAWRs>ux`jB()|i9`ZIwt^fcUqii}CB3&!6B#sWQX#cAcr9k+B#3(o z3!M{+L!?lFH>Z!Pi}c)Y?zuS24t5Dc(kAL+1bYoE<g$r zgbjOL@O(My=AayMIE8zLaby1-#vYd}Np%6RTBz)%Z~N;PB06eqxTde)^lChoC;Cn5 zlojXgRUBqks*Lo)QWt9U z^?v068Oe)_IhTWQ&q0|6KnK3be>}gv<^Y|dWw@>SF&Rg;#W+MVl9-NudRO9Na7gb~ zozUeBmuE^SkW@H1t~Pp)NH_BO;n#cq9xNBlm9pj!xu{->@&-x?c{b6InIsMNK7}tc zDQ0->gqSzUtqq`p3B8 zjgwFL`2f+wwmXIb3H9+$RO1;89uhzc&!rX$d6IYhZ{VnT^X94t8x~yPOGuB_O>yFy$Bw$@XUgLh$pn9{t6s6s9HrGgsQv&dRc?RY=)4eDJCrb4dG`vd&j zOcoj}r^K5v(KrGX72B9p8d&_Dzn5t04ksIE@oj?&*_xqpaux9()AORj6)F@7X8}!H zN(^8O2MI-ko2A!9-rcXHoPPEkh|vR|W1T`~yK*L*yvN#ka*yQy%qiKl&`-B|x09o+zs=Zlu{bN0E!u`k6^0FjYI*Q+R zB|@A73V&VB%-@_RIJ@F4Ln4-H?zSa$d{Zf??}L5vpq;$T_(7!ylc)sk=E5k;tqxsL zF2Dim;K_sM!N3lLV&d~TS8PuZ$L6(gQl3wl33@uM#@z6&Rf}Zmk@6cRK#XT@Q?Z!H zEDN85B=BtZ4|f2f^I8(XO84HasN5S%M6OIAm_=ucaRq1`p*D!{TXK?h z%gyS8DnAX5*5}?qDuT@9F~Z3AAIkNmoh8`Xxj}!Tey}fu<)&s{?%Z6}qisk>vD5R$ z)mYu`dw>NZ_?DA3^g;R-1YM9@@F%-G1@6}~7+)~Z(<5;9EA=?@n_k&f|&S!JQ%Qnh!jAxSrC?|EwZw*vLWE)g3^GCA}Jb@ zMvwD{Ckd?o;)$W8$HxvD?b=&@3sX7L{oP&ak0dTs2I{ib! zm(p!!om|XK??-t5tBwWm0aZ9YY-N7zn9QTX!ovLv{U|byN9~>md22qOI*|gEGF$CG zo+V{tbiQj1Gtb?#BjkUx3(T4w9UB7}m;&?yB6}7iU?`uftBZvc){O9amx*r?|$J!)+SGH3leHAIVLDHPT7(R9~mVHskyxB>%;rWjlkb{(g@MkHeeXH#vD~JY>;*ZHd35y(eGsF zbcCh-bfvxx;Ag_Q#@96ri!6Ocln5R^>37B_wen09V~*U>R^854`W4woqao6|mwT2< zqzVevq+3mYY?h=ttUAsFkasI{mNl7=NsP-;;@wF5q7W&#v)h?kt(Y61Ld_U?sS9e`uXcvB`Z)@yJ?7tM}m>wmtsR9@y~ zyrzqBy}5v?dwDPtH7{^tb>bj3!$y6xz|dO_%2z$i9ZUbvP!4 zg|ABALtO2mK{&^1*L%^Xva0EL>mTkRvdRm$RGI83G&)#>#3y25Y{L^Oqrt9IDv`E* z65hS^E0IN^*BT8iKN_=Q=jjxCeQk=gVMp9^itaCAB@H`Dxul*Of zY67|m|Iw1S9tr0*O`)t!9IW9-sd#E@o6tC;ea7bC(#WJ$8|JbmpS0B^S#7+fC$t7B zOT)f7a3H+SzIa#PUovIj#qYK&eGrF@_WdbPkd}Ld+Ig!CfaSWi1K))LNruAk>J1mG zj*)hbGyLkWCPGw`+%H)|`n0iW>)^bD2H>nUrK!i`2@z6|qwYj6!pAMl1c&VqBHSWk zCL8ycggg-$HS(G_q=)*Q|nt*!?Ek8u=>f$es+k? 
zziNuk$+!oUvQLuJ0gZOIqE1#gxzu%q&3_&mwx3syG3leP70|nCvsGq_uf)?ltjlxr z9j5WzXLq?$wQf)6vGKchS(=)gJv5au{Cc6y8jsTls)=LT>SK`nw0ny%)h7Ca?-gA-Klf$;@@fXT)C7&?U#+FMN%Fc?HW9M zQ=Iuj0GzBERZOh3f4zB)dK3svR)n7ZX!&@pxuU&71zkrC!{lmqjr z3ES5+@+o<^6sT(AX7D$_`!eUJOO-eXnv0HJ>vYMKN>7>dMR^qVOnJ-_R0Lz^&)^iO zStz#oUbf;fXYN#SnR_P%4|nhSU`)rh!VmXkQdgNeCnw&zVS^Fb7MZQR3K#dULWxHRdpGZA zt*&UQ7JI1N+Gww`)*n?br@i$JJU&S3fcZaGb!u=fXUMOuDePxXWIW^2kbmrY z+ARKr&*>+Q2fCg40y1KyBEUtzFbM(%5eNkKCo2ZGDO>+Spa63~K8{S7G_O|#rllRE z7+h>`I{7@7E)*YX%&gQPWpP>SErg2EIw3G39XmBm8lid6<*}#JqEL`DHJwX4X96p{ zI9pgWnZ=Gw0KSa6-TtTCoOp|p4Z8oFt*|Q^ewkiU-Rw*q?c?VDG5cvt>GgdLgK{^y zdJwr$Rov8Pwwt8)<&#Z@PzM5+f12jg;wYY@UQCu;2y($4qHnG}W5>)_ss=rK4g3w< z!#P!aG9M~5yGf2<_^$0aaXk)??z#2cUCa+jIo&(C{5SU0$-}M~Nu{yF(k+CIJ)-A+ zwf_FTrNZQ<$lA^0Z_O_#ps!K#%oX9Yxp|IBNd??s=k>ShV;n`X!7O0GQ-+HIkSh&9Gb%_og-j8zHD9DJmrbl=nfhd(3* zWmuWUyf&&bs9{XT<$+bAh21iC3)b-o(E9(Fid(A@YJ`~Vu}Zh44}A^EdC(UR_~QY+ z+?vY#+~U@)PfV@FKUM*QQP+cWGGBEn>!UH#@o%f|Yj9EqOXEeAmKT3^f#Zb<@&t>2$&;$~mP1B`Lh8&ZSCFr^q2^^e=Y zrnR^acyMN?k1BqSI5e`n0X2e)r^y&=!Fjc+q2}(zMdUZHWs7Zs={3eW6G9AJdi1Y1 zo1>UYT2fA4f7_syO_L?Wz0Zp@W9elizk{3FKHJE4qmjhoO5%Hhn(2N6?oRQ8x@rl+ z+BJ>WhAmt%@L!NqQ&U&+5qh+K!z1w!d;jI{tpfI5i9AN-+V$Jm7>v-#3wV!X71ZUP z?3;W%#u6(byQC@iZ?bhQ@B$|PrV<}}_c>zOAyyBPFHwxQ4Ots!I_WxRyjcbpId9o`OuHF4!)Y*2iQ6 z9(VPu7b7Juu=;uBEZ#gG737Za)vN@VH}QMC3#CZbR)_Q2g2O(ID?K%LuRJzkx9tU0~XN z5+QDQj9=+tTe|&&Pb4aBBnbVY)4Yiyvd_nlH>;Dr4D&8}J4*OlQXIfA z6d30Jj$!Xw2U*~Q24c<+K*n~*Qv>Nw0j6~1tK!>dh$jiJYJepn_q2aa)B5vJ;WU8i z-L=tS#EE<^LGH4uaz(FbbXOiK1_X0g+O6&}hr55h6&Ni82(#XpC)|+(`1LLAaKq(D zLZt^D^Ze8kA1yNKPt)N|@o)mbBjGYTI4RPiAYM@ggu{UW+IKfG%K=$|6Kuywd|@fC zF&`dMtBP;CHQi{`MZwx+)-`nUzO0YLEgA&pR;{;aPh}JOl?k}3Qhxe&E+RA7t2ENX z13$d8$K;#H3fuYz+9ge|z8UJQzEyjW|KRX#ut8=j0p+PYkqd-)bU&%~##SWCCJ~## zUtOh4+}kXhRHcpdgo{k-wF4ClDrh#p)Ei3_Kpij{|o9*)Qv=7 zfo3R_pCBb+G9mt%Uh>)a02wMOP^1I-?i#}6+`oc$1^%;@@;24vjaJiNR|e_w^#nGB z{s&mA44g_3c=}Q_tH58Ud%M|>0?R<*7pgvt^+HLt1GLn0YS{uhi@roe&5STEyBfUm z5|y5iB6D6f%@Na{?VFmo`0FfnN$0_=&=?XgQyD?k3tsWALcHD@WEs9b8*Ei|m%o zXLdc)x}ks09S^Py)MjYw3gejgGN8)!E+gMM!?(oTFm$MGxNWhI>^B^?Y}BqcN**`% zChJdp-lWF_6hPxNKis@n{;B_{z+R^FvYYr1N|x=~=rrnZ^@%=)-W zy?(V37rnu(xc$l!XcZ-{3wx$lc=9&?5o7t5++(%Yg-;r?b(4Gr<;w1rvrdKOE}%G+ z2T2^0$u$qwf|H~8>#9Msd_T9dQ4viN*)O)Lc(tK>3?3YCb#`>jT_7NXWj^7#bG4Te zIu^1CfVdZT*+IA2&~igj@Z8Pde6-d-Y5!1gyy9~Q?gQTHtbI?M#$K6Jf+_JYDN{9J z)H;-Gxo;NRyZ0B;(sWxrBs)rF#vZ0SxN_UpCAOb?xikiv8Lw1@sM_rKFVyWM7zLF> zI`PJNEj$${7Q0@t0Yleju7~#-<$}=O^kHSdyDXo8eh=n z`(357-@Mp6z)(y%m)=~6ML3e=Vb4@yNK5jcShWC=YkGaKv|H4WSG4kO*5;V4TK$s1 zfiOYXB-g0IcBP6-1KoE9=3>cp>kj*P1(K=o&($?1Ax*z(%FWf=eBPb@vyvt{tg+Er zzttlzV{fj;o|l0shEqQ+jeD!#p_ zUfo(s2>&>-!@*l@Aj&5Iyji?a!p1Y?7OWV2lhxMBvFoYga6wI%6`!Q!v1Rlh0>(V( zywSSbfN0xPX>(_j+>4IQ3nt6&#i@5Iox0fM#_aWnn-mK9Ose5{nbntJ=A@Yi!y_L4 zyFfL9`U=S79@k$&dvb*a&-UB%=8Kk`J%zOC#YJt#`tYpX_&&Ut9eaR#-FMst>-%&R z_x_s-4G&impSNWzgs76Q(l+&NjHg9L!=6s__denUei)Fav8nM12fSyPn{XjDG^!Hm zNWQm~lkVw=68Mw+#nLfB&xn%OpBa~q)Gyw2mu}Sapqpzk+J#y}O2Aur1`|C`m+~{q zX7}R+z8=dCV2Lu0rS#y=$ZB{fY*acLA(KYPLtO8}RY^BNu~E@S^EMNf@&vPc`Fr)dA{aSH61e2qj8n8|u-7%uzKK9@y=yG)T-odSGSPKv?m zQY8Btk#2uT{;hlDUAU3z0m7L)bZpE8HeLGwk2y`i`?x=ueQhwEh|6Ag(ZE-K439cp z$fID{DzjOHYf(&RPUR-HjH?iwv83Vn?7f#pyIheIS-jNH!LTI zqtwbIJmopuHKG07{)LVtFfJ>RqzINsgu`62p;laL_HG@ucwBJw7*--?t+Tw}EbC#3 zplPGIJ}!0e)$G`Nd0$G_4tsTg(b;$>oqE7>_H)BZ+d&~V?wU@e4+o6_==@LU4;qjY zg?=1K4|=DN^lt{j6FdttE>AsX-t_3B|hhrjnULQE!38H_>L$ z!)uVOXAg?B$hFu+IkW5WVsQyIwY7Z)hbVI-mDe-i%b+?HtRfqZeOf^ z`vs)^4hsXh$rlFjt{{!yuDwqn19UThGXSb0IWl9{o*e;SM2?Dp;^_{MW znKSrIU#)-OvlwdeS2Wl6tiaOFliUF^;kQXFE^)(gaqN6!OD<8yQt=fVVTnU5*r&hq 
zhn+S#W5~PW<k-XQ^$>=U!D14)c$BE3yIbl;g2eo?IRU zte6s`#lwBG>XNMqw|AynB_X|6Whqg4uo{x`+pdJIZlgQ$1jvwYw^>$GE0ANd0$mo* z-t%_9-<3c`|FHdk#{0l@>(?VP{HLyWj^lR)@huQRKgPnJ`d3%~mMg-5Y$N{7?#3PX z;`c|$w@C%ni>kXZ_w8E3ShrKF_7=w6?t02!@%`KwdlK*S) z+?|+&{Qs>}_g4fhBX|AAU^v; zn*F^Uw?a_e4kyFH5Y)Sy-rRn7pl=#rMIcN5o_}*ETah*f9!^E-({#79_?NsQKvML7 zc_38Gfxw*7?=WHk0OQM%A?HYm9Qz;R3Pq)|>gyo|a#V)_2uSArAG05kls#%?a%Fmr zjZFC}AVlQJ4-!WcfM%~0`*(Z|AOkrmTBzSG*Wjh)Fw~(v9RzYeKiqgzK}Zyz9)MSx`dYl0^{Jiy#xa#7yPo5cPyn4Fba2MY|KRAAj;{2m8A#$BblM+J*rG2F=<)jIFqmecy5k~i*Cg~|6O-- zG(<{Hai)k11z>aUt&vuZV)tc&+})QMoJ_5^kgQis1zivtb5aJ z^wSK4_IGly?@xd+ZtufE*n7*1!BM%D-&ErR_E)l@jD=y`!zTAjxe{shVmdqxN L5HAxoeEa_ZfajY( literal 0 HcmV?d00001 From b38fc1136ef4f846a54117d8d9f1deb3aebe302e Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 1 May 2022 00:32:11 +0800 Subject: [PATCH 9/9] =?UTF-8?q?1.=E4=BF=AE=E6=94=B9ProgressBart=E5=9C=A8Tr?= =?UTF-8?q?ainer=E4=B8=AD=E7=9A=84=E4=B8=80=E4=B8=AAbug;2=E4=BF=AE?= =?UTF-8?q?=E5=A4=8Dpytest=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callbacks/callback_manager.py | 54 ++++++++++++++----- fastNLP/core/callbacks/progress_callback.py | 2 - fastNLP/core/controllers/trainer.py | 16 ++---- .../test_checkpoint_callback_torch.py | 11 ++-- .../test_load_best_model_callback_torch.py | 4 +- .../callbacks/test_more_evaluate_callback.py | 3 +- tests/helpers/utils.py | 9 ++-- 7 files changed, 61 insertions(+), 38 deletions(-) diff --git a/fastNLP/core/callbacks/callback_manager.py b/fastNLP/core/callbacks/callback_manager.py index 90d2e1b1..f63c6088 100644 --- a/fastNLP/core/callbacks/callback_manager.py +++ b/fastNLP/core/callbacks/callback_manager.py @@ -9,6 +9,8 @@ __all__ = [ from .callback_events import Events from .callback import Callback from fastNLP.core.log import logger +from .progress_callback import ProgressCallback, choose_progress_callback +from fastNLP.envs import rank_zero_call def _transfer(func): @@ -26,6 +28,43 @@ def _transfer(func): return wrapper +def prepare_callbacks(callbacks, progress_bar): + """ + + :param callbacks: + :param progress_bar: + :return: + """ + _callbacks = [] + if callbacks is not None: + if isinstance(callbacks, Callback): + callbacks = [callbacks] + if not isinstance(callbacks, Sequence): + raise ValueError("Parameter `callbacks` should be type 'List' or 'Tuple'.") + callbacks = list(callbacks) + for _callback in callbacks: + if not isinstance(_callback, Callback): + raise TypeError(f"callbacks must be of Callback type, instead of `{type(_callback)}`") + _callbacks += callbacks + + has_no_progress = False + for _callback in _callbacks: + if isinstance(_callback, ProgressCallback): + has_no_progress = True + if not has_no_progress: + callback = choose_progress_callback(progress_bar) + if callback is not None: + _callbacks.append(callback) + elif progress_bar is not None and progress_bar != 'auto': + logger.warning(f"Since you have passed in ProgressBar callback, progress_bar will be ignored.") + + if has_no_progress and progress_bar is None: + rank_zero_call(logger.warning)("No progress bar is provided, there will have no information output " + "during training.") + + return _callbacks + + class CallbackManager: r""" 用来管理训练过程中的所有的 callback 实例; @@ -45,24 +84,13 @@ class CallbackManager: """ self._need_reproducible_sampler = False - _callbacks = [] - if callbacks is not None: - if isinstance(callbacks, Callback): - callbacks = [callbacks] - if not isinstance(callbacks, Sequence): - raise ValueError("Parameter 
diff --git a/fastNLP/core/callbacks/progress_callback.py b/fastNLP/core/callbacks/progress_callback.py
index bacdea48..335345e0 100644
--- a/fastNLP/core/callbacks/progress_callback.py
+++ b/fastNLP/core/callbacks/progress_callback.py
@@ -11,8 +11,6 @@ __all__ = [
 from .has_monitor_callback import HasMonitorCallback
 from fastNLP.core.utils import f_rich_progress
 from fastNLP.core.log import logger
-from fastNLP.core.utils.utils import is_notebook
-
 
 
 class ProgressCallback(HasMonitorCallback):
diff --git a/fastNLP/core/controllers/trainer.py b/fastNLP/core/controllers/trainer.py
index 307901b1..5223c9d8 100644
--- a/fastNLP/core/controllers/trainer.py
+++ b/fastNLP/core/controllers/trainer.py
@@ -19,8 +19,8 @@ from .evaluator import Evaluator
 from fastNLP.core.controllers.utils.utils import TrainerEventTrigger, _TruncatedDataLoader
 from fastNLP.core.callbacks import Callback, CallbackManager, Events, EventsList
 from fastNLP.core.callbacks.callback import _CallbackWrapper
+from fastNLP.core.callbacks.callback_manager import prepare_callbacks
 from fastNLP.core.callbacks.callback_events import _SingleEventState
-from fastNLP.core.callbacks.progress_callback import choose_progress_callback
 from fastNLP.core.drivers import Driver
 from fastNLP.core.drivers.utils import choose_driver
 from fastNLP.core.utils import get_fn_arg_names, match_and_substitute_params, nullcontext
@@ -133,7 +133,7 @@ class Trainer(TrainerEventTrigger):
         ["all", "ignore", "only_error"];当该参数的值不是以上值时,该值应当表示一个文件夹的名字,我们会将其他 rank 的输出流重定向到
         log 文件中,然后将 log 文件保存在通过该参数值设定的文件夹中;默认为 "only_error";
         progress_bar: 以哪种方式显示 progress ,目前支持[None, 'raw', 'rich', 'auto'] 或者 RichCallback, RawTextCallback对象,
-            默认为 auto , auto 表示如果检测到当前 terminal 为交互型 则使用 RichCallback,否则使用 RawTextCallback对象。如果
+            默认为 auto , auto 表示如果检测到当前 terminal 为交互型则使用 RichCallback,否则使用 RawTextCallback对象。如果
             需要定制 progress bar 的参数,例如打印频率等,可以传入 RichCallback, RawTextCallback 对象。
         train_input_mapping: 与 input_mapping 一致,但是只用于 train 中。与 input_mapping 互斥。
         train_output_mapping: 与 output_mapping 一致,但是只用于 train 中。与 output_mapping 互斥。
@@ -212,17 +212,7 @@ class Trainer(TrainerEventTrigger):
         self.driver.set_optimizers(optimizers=optimizers)
 
         # 根据 progress_bar 参数选择 ProgressBarCallback
-        progress_bar_callback = choose_progress_callback(kwargs.get('progress_bar', 'auto'))
-        if progress_bar_callback is not None:
-            if callbacks is None:
-                callbacks = []
-            elif not isinstance(callbacks, Sequence):
-                callbacks = [callbacks]
-
-            callbacks = list(callbacks) + [progress_bar_callback]
-        else:
-            rank_zero_call(logger.warning)("No progress bar is provided, there will have no information output "
-                                           "during training.")
+        callbacks = prepare_callbacks(callbacks, kwargs.get('progress_bar', 'auto'))
         # 初始化 callback manager;
         self.callback_manager = CallbackManager(callbacks)
         # 添加所有的函数式 callbacks;
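The progress_bar values documented in the Trainer docstring above map onto concrete ProgressCallback classes through choose_progress_callback. A small sketch of that mapping follows; it is illustrative only and assumes choose_progress_callback is accessible from fastNLP.core.callbacks.progress_callback, the module this patch imports it from, and that it returns None when given None.

# Illustrative only -- probes the progress_bar -> ProgressCallback mapping described above.
from fastNLP.core.callbacks.progress_callback import choose_progress_callback

for value in (None, 'raw', 'rich', 'auto'):
    cb = choose_progress_callback(value)
    print(f"{value!r:8} -> {None if cb is None else type(cb).__name__}")

# Per the docstring: 'raw' -> RawTextCallback, 'rich' -> RichCallback, and
# 'auto' -> RichCallback in an interactive terminal, otherwise RawTextCallback.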
diff --git a/tests/core/callbacks/test_checkpoint_callback_torch.py b/tests/core/callbacks/test_checkpoint_callback_torch.py
index ca2a3292..0ae9e801 100644
--- a/tests/core/callbacks/test_checkpoint_callback_torch.py
+++ b/tests/core/callbacks/test_checkpoint_callback_torch.py
@@ -73,7 +73,7 @@ def model_and_optimizers(request):
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("version", [0, 1])
 @pytest.mark.parametrize("only_state_dict", [True, False])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_model_checkpoint_callback_1(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -193,7 +193,7 @@ def test_model_checkpoint_callback_1(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
-
+            trainer.driver.barrier()
     finally:
         rank_zero_rm(path)
 
@@ -203,7 +203,7 @@ def test_model_checkpoint_callback_1(
 
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("only_state_dict", [True])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_model_checkpoint_callback_2(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -283,6 +283,7 @@ def test_model_checkpoint_callback_2(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
@@ -295,7 +296,7 @@ def test_model_checkpoint_callback_2(
 @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 0)])  # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)
 @pytest.mark.parametrize("version", [0, 1])
 @pytest.mark.parametrize("only_state_dict", [True, False])
-@magic_argv_env_context
+@magic_argv_env_context(timeout=100)
 def test_trainer_checkpoint_callback_1(
     model_and_optimizers: TrainerParameters,
     driver,
@@ -413,6 +414,7 @@ def test_trainer_checkpoint_callback_1(
 
             trainer.load(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
@@ -661,6 +663,7 @@ def test_trainer_checkpoint_callback_2(
 
             trainer.load(folder, model_load_fn=model_load_fn)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
diff --git a/tests/core/callbacks/test_load_best_model_callback_torch.py b/tests/core/callbacks/test_load_best_model_callback_torch.py
index 0bc63bd5..f5b67f95 100644
--- a/tests/core/callbacks/test_load_best_model_callback_torch.py
+++ b/tests/core/callbacks/test_load_best_model_callback_torch.py
@@ -16,7 +16,6 @@ from fastNLP.core.controllers.trainer import Trainer
 from fastNLP.core.metrics.accuracy import Accuracy
 from fastNLP.core.callbacks.load_best_model_callback import LoadBestModelCallback
 from fastNLP.core import Evaluator
-from fastNLP.core.utils.utils import safe_rm
 from fastNLP.core.drivers.torch_driver import TorchSingleDriver
 from tests.helpers.models.torch_model import TorchNormalModel_Classification_1
 from tests.helpers.datasets.torch_data import TorchArgMaxDataset
@@ -112,7 +111,8 @@ def test_load_best_model_callback(
     results = evaluator.run()
     assert np.allclose(callbacks[0].monitor_value, results['acc#acc#dl1'])
     if save_folder:
-        safe_rm(save_folder)
+        import shutil
+        shutil.rmtree(save_folder, ignore_errors=True)
     if dist.is_initialized():
         dist.destroy_process_group()
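The test above now cleans up with the standard library instead of the removed safe_rm helper. The same pattern in isolation, illustrative only (the file name is a placeholder, not the callback's actual output name):

# Illustrative only -- the stdlib cleanup now used in test_load_best_model_callback.
import os
import shutil
import tempfile

save_folder = tempfile.mkdtemp()
open(os.path.join(save_folder, "dummy.bin"), "wb").close()   # placeholder file

# ignore_errors=True keeps cleanup from raising if the folder is already gone,
# e.g. when several test processes race to delete it.
shutil.rmtree(save_folder, ignore_errors=True)
assert not os.path.exists(save_folder)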
diff --git a/tests/core/callbacks/test_more_evaluate_callback.py b/tests/core/callbacks/test_more_evaluate_callback.py
index 16ee3e17..115f519a 100644
--- a/tests/core/callbacks/test_more_evaluate_callback.py
+++ b/tests/core/callbacks/test_more_evaluate_callback.py
@@ -171,7 +171,7 @@ def test_model_more_evaluate_callback_1(
 
             trainer.load_model(folder, only_state_dict=only_state_dict)
             trainer.run()
-
+            trainer.driver.barrier()
     finally:
         rank_zero_rm(path)
 
@@ -255,6 +255,7 @@ def test_trainer_checkpoint_callback_1(
 
             trainer.load(folder, only_state_dict=only_state_dict)
             trainer.run()
+            trainer.driver.barrier()
 
     finally:
         rank_zero_rm(path)
diff --git a/tests/helpers/utils.py b/tests/helpers/utils.py
index c0b51a8b..7e02ca0d 100644
--- a/tests/helpers/utils.py
+++ b/tests/helpers/utils.py
@@ -33,6 +33,8 @@ def recover_logger(fn):
 def magic_argv_env_context(fn=None, timeout=600):
     """
     用来在测试时包裹每一个单独的测试函数,使得 ddp 测试正确;
+    会丢掉 pytest 中的 arg 参数。
+
     :param timeout: 表示一个测试如果经过多久还没有通过的话就主动将其 kill 掉,默认为 10 分钟,单位为秒;
     :return:
     """
@@ -46,9 +48,10 @@ def magic_argv_env_context(fn=None, timeout=600):
     env = deepcopy(os.environ.copy())
 
     used_args = []
-    for each_arg in sys.argv[1:]:
-        if "test" not in each_arg:
-            used_args.append(each_arg)
+    # for each_arg in sys.argv[1:]:
+    #     # warning,否则 可能导致 pytest -s . 中的点混入其中,导致多卡启动的 collect tests items 不为 1
+    #     if each_arg.startswith('-'):
+    #         used_args.append(each_arg)
     pytest_current_test = os.environ.get('PYTEST_CURRENT_TEST')
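A short sketch of how the updated decorator is used by the tests in this patch. It is illustrative only: the decorator supports both the bare form and the parameterized form because its first parameter is fn=None, and the snippet assumes it is run from the fastNLP repository root so that tests.helpers.utils is importable.

# Illustrative only -- mirrors the decorator usage introduced in the test changes above.
import pytest
from tests.helpers.utils import magic_argv_env_context

@pytest.mark.parametrize("driver,device", [("torch", "cpu")])
@magic_argv_env_context(timeout=100)   # kill the test if it has not finished after 100 seconds
def test_decorator_smoke(driver, device):
    assert driver == "torch" and device == "cpu"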