| @@ -126,7 +126,8 @@ class Callback: | |||
:param trainer: `fastNLP.Trainer`
:param batch: the data of the batch, already passed through input_mapping (if any) and moved to the target device.
:param list[int] indices: which samples of the dataset make up the current batch
:param list[int] indices: which samples of the dataset make up the current batch. Only has a value when the DataLoader
supports retrieving the indices of the current batch; otherwise it is None.
| """ | |||
| pass | |||
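# Illustrative sketch (hypothetical user code, not part of the library): a
# callback that logs which dataset samples the current batch was drawn from,
# when the DataLoader can provide that information.
from fastNLP.core.log import logger

class LogIndicesCallback(Callback):
    def on_train_batch_begin(self, trainer, batch, indices):
        if indices is not None:
            logger.debug(f"Current batch was built from dataset samples {indices}.")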
| @@ -9,6 +9,8 @@ __all__ = [ | |||
| from .callback_events import Events | |||
| from .callback import Callback | |||
| from fastNLP.core.log import logger | |||
| from .progress_callback import ProgressCallback, choose_progress_callback | |||
| from fastNLP.envs import rank_zero_call | |||
| def _transfer(func): | |||
| @@ -26,6 +28,43 @@ def _transfer(func): | |||
| return wrapper | |||
| def prepare_callbacks(callbacks, progress_bar): | |||
| """ | |||
| :param callbacks: | |||
| :param progress_bar: | |||
| :return: | |||
| """ | |||
| _callbacks = [] | |||
| if callbacks is not None: | |||
| if isinstance(callbacks, Callback): | |||
| callbacks = [callbacks] | |||
| if not isinstance(callbacks, Sequence): | |||
| raise ValueError("Parameter `callbacks` should be type 'List' or 'Tuple'.") | |||
| callbacks = list(callbacks) | |||
| for _callback in callbacks: | |||
| if not isinstance(_callback, Callback): | |||
| raise TypeError(f"callbacks must be of Callback type, instead of `{type(_callback)}`") | |||
| _callbacks += callbacks | |||
has_progress_callback = False
for _callback in _callbacks:
    if isinstance(_callback, ProgressCallback):
        has_progress_callback = True
if not has_progress_callback:
    callback = choose_progress_callback(progress_bar)
    if callback is not None:
        _callbacks.append(callback)
elif progress_bar is not None and progress_bar != 'auto':
    logger.warning("Since you have passed in a ProgressCallback, the `progress_bar` argument will be ignored.")
if not has_progress_callback and progress_bar is None:
    rank_zero_call(logger.warning)("No progress bar is provided, there will be no information output "
                                   "during training.")
| return _callbacks | |||
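# A hedged usage sketch (illustrative): a single Callback is wrapped into a
# list, and since no ProgressCallback is present, one chosen from the
# `progress_bar` option is appended.
callbacks = prepare_callbacks(callbacks=Callback(), progress_bar='raw')
assert isinstance(callbacks, list)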
| class CallbackManager: | |||
| r""" | |||
Manages all the callback instances used during training;
| @@ -45,24 +84,13 @@ class CallbackManager: | |||
| """ | |||
| self._need_reproducible_sampler = False | |||
| _callbacks = [] | |||
| if callbacks is not None: | |||
| if isinstance(callbacks, Callback): | |||
| callbacks = [callbacks] | |||
| if not isinstance(callbacks, Sequence): | |||
| raise ValueError("Parameter `callbacks` should be type 'List' or 'Tuple'.") | |||
| callbacks = list(callbacks) | |||
| for _callback in callbacks: | |||
| if not isinstance(_callback, Callback): | |||
| raise TypeError(f"callbacks must be of Callback type, instead of `{type(_callback)}`") | |||
| _callbacks += callbacks | |||
| self.callback_fns = defaultdict(list) | |||
# Since users can in principle only trigger these via 'trainer.on_train_begin' or 'trainer.callback_manager.on_train_begin',
# that is, there is no way to invoke one specific callback function without also invoking the other same-named callback
# functions, we only need to record the timing of each concrete Event;
| self.callback_counter = defaultdict(lambda: 0) | |||
| if len(_callbacks): | |||
| if len(callbacks): | |||
# This object keeps the original class-based callback objects to help users debug; in normal usage you should not need it;
| self.class_callbacks = _callbacks | |||
| self.class_callbacks = callbacks | |||
| else: | |||
| self.class_callbacks: Optional[List[Callback]] = [] | |||
| @@ -94,20 +94,21 @@ class LoadBestModelCallback(HasMonitorCallback): | |||
| else: | |||
| self.buffer.seek(0) | |||
| trainer.load_model(folder=self.buffer, only_state_dict=self.only_state_dict) | |||
| self._delete_after_after(trainer) | |||
| def _delete_after_after(self, trainer): | |||
| trainer.driver.barrier() | |||
| if self.delete_after_after: | |||
| if self.real_save_folder: | |||
| logger.info(f"Deleting {self.real_save_folder}...") | |||
| shutil.rmtree(self.real_save_folder, ignore_errors=True) | |||
| try: | |||
# if it is empty, it will be removed
| os.rmdir(self.save_folder) | |||
| except: | |||
| pass | |||
| elif hasattr(self, 'buffer'): | |||
| self.buffer.close() | |||
| del self.buffer | |||
| trainer.driver.barrier() | |||
| self._delete_folder() | |||
| trainer.driver.barrier() | |||
| def _delete_folder(self): | |||
| if self.real_save_folder: | |||
| logger.info(f"Deleting {self.real_save_folder}...") | |||
| shutil.rmtree(self.real_save_folder, ignore_errors=True) | |||
| try: | |||
    # if it is empty, it will be removed
    os.rmdir(self.save_folder)
    logger.debug(f"Since {self.save_folder} is an empty folder, it has been removed.")
except OSError:
    pass
| elif hasattr(self, 'buffer'): | |||
| self.buffer.close() | |||
| del self.buffer | |||
| @@ -11,8 +11,6 @@ __all__ = [ | |||
| from .has_monitor_callback import HasMonitorCallback | |||
| from fastNLP.core.utils import f_rich_progress | |||
| from fastNLP.core.log import logger | |||
| from fastNLP.core.utils.utils import is_notebook | |||
| class ProgressCallback(HasMonitorCallback): | |||
| @@ -6,7 +6,7 @@ from .padders.get_padder import get_padder | |||
| import re | |||
| from .utils import unpack_batch_mapping, unpack_batch_nested_mapping, pack_batch_nested_mapping, unpack_batch_sequence, \ | |||
| pack_batch_sequence, NESTED_DICT_SEPARATOR | |||
| pack_batch_sequence | |||
sequence_idx_str = re.compile(r'^_\d+$')  # matches names like _0, _1
| SUPPORTED_BACKENDS = ['torch', 'jittor', 'paddle', 'numpy', 'raw', None] | |||
| @@ -16,10 +16,11 @@ class Collator: | |||
| def __init__(self, backend='torch'): | |||
| """ | |||
An object used to pad data. It automatically pads every field that fastNLP judges pad-able; the default pad value is 0.
This can be adjusted with set_pad(). Fields that should not be output can be excluded with set_ignore().
This can be adjusted with set_pad(). Fields that should not be output can be excluded with set_ignore(). The first time the Collator
pads a batch, it picks a padder for each field according to the settings and the data; every later call reuses that Padder for the field.
:param backend: which kind of tensor to use for pad-able fields; supports ['torch','jittor','paddle','numpy','raw',None].
    If None, no padding is performed.
:param backend: which kind of tensor to use for pad-able fields; supports ['torch','jittor','paddle','numpy','raw',None].
    If None, no padding is performed. This argument has no effect on data that cannot be padded; such data is always returned as a list.
| """ | |||
| self.unpack_batch_func = None | |||
| self.pack_batch_func = None | |||
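# A hedged usage sketch (illustrative, not part of the diff): for a batch of
# dict samples, pad-able fields are padded with 0 and converted to the chosen
# backend; the exact output types depend on the installed backend.
collator = Collator(backend='numpy')
batch = [{'x': [1, 2], 'y': 1}, {'x': [3], 'y': 2}]
padded = collator(batch)  # roughly {'x': array([[1, 2], [3, 0]]), 'y': array([1, 2])}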
| @@ -54,22 +55,25 @@ class Collator: | |||
| else: | |||
| self.batch_data_type = 's' | |||
logger.debug(f"Since batch[0] has type {type(batch[0])}, the batch_data_type "
| f"is {self.batch_data_type}") | |||
| f"is `{self.batch_data_type}`.") | |||
| if self.batch_data_type == 's': | |||
self.unpack_batch_func = lambda x: {'_single': x}  # no adjustment needed
self.pack_batch_func = lambda x: x['_single']
self.unpack_batch_func = lambda batch, ignore_fields: {'_single': batch}  # no adjustment needed
self.pack_batch_func = lambda x: x['_single']
| elif self.batch_data_type == 'l': | |||
| self.unpack_batch_func = unpack_batch_sequence | |||
| self.pack_batch_func = pack_batch_sequence | |||
| elif self.batch_data_type == 'd': | |||
if any([isinstance(v, Mapping) for v in batch[0].values()]):  # there may be nested dicts: {'a': {'b': xx}} -> {'a@@b': value}
if any([isinstance(v, Mapping) for v in batch[0].values()]):  # there may be nested dicts: {'a': {'b': xx}} -> {('a', 'b'): value}
| self.unpack_batch_func = unpack_batch_nested_mapping | |||
| self.pack_batch_func = pack_batch_nested_mapping | |||
| else: | |||
| self.unpack_batch_func = unpack_batch_mapping | |||
| self.pack_batch_func = lambda x:x | |||
unpack_batch: Dict = self.unpack_batch_func(batch)  # group the fields into batch form.
if self.unpack_batch_func is unpack_batch_nested_mapping:  # special case: must keep it from unpacking deeper
    unpack_batch: Dict = self.unpack_batch_func(batch, self.ignore_fields, set(self.input_fields.keys()))
else:
    unpack_batch: Dict = self.unpack_batch_func(batch, self.ignore_fields)  # group the fields into batch form.
| pad_batch = {} | |||
if len(self.padders) == 0:  # first run: prepare the padders
| @@ -96,13 +100,13 @@ class Collator: | |||
return self.pack_batch_func(pad_batch)  # restore a type consistent with the input when possible
| def set_pad(self, field_name:str, pad_val:Union[int, float, None]=0, dtype=None, backend=None, | |||
| def set_pad(self, field_name:Union[str, tuple], pad_val:Union[int, float, None]=0, dtype=None, backend=None, | |||
| pad_fn:Callable=None) -> "Collator": | |||
| """ | |||
Use this function if the content of a particular field needs special handling.

:param field_name: the name of the field to adjust. If the Dataset's __getitem__ method returns a dict, the field's
    key can be used directly; for a nested dict, the keys of different levels can be joined with @@, e.g. a@@b for {'a': {'b': 1}};
    key can be used directly; for a nested dict, a tuple can express the multi-level key, e.g. ('a', 'b') for {'a': {'b': 1}};
    if __getitem__ returns a Sequence, '_0', '_1' denote the 0th or 1st element of the sequence. If the field is not found
    in the data, an error is raised; if __getitem__ returns the content as a whole, use "_single".
:param pad_val: the default pad value for this field. If set to None, the field will not be padded; by default fastNLP only pads the
| @@ -126,11 +130,11 @@ class Collator: | |||
| f"index, but other field is set as dict mode." | |||
| elif self.batch_data_type == 'l': | |||
| assert sequence_idx_str.match(field_name) is not None, f"Other field is set as list mode. But the new " \ | |||
| f"field name is {field_name}" | |||
| f"field name is {field_name}." | |||
| if field_name == '_single': | |||
| self.batch_data_type = 's' | |||
| elif sequence_idx_str.match(field_name): | |||
| elif isinstance(field_name, str) and sequence_idx_str.match(field_name): | |||
| self.batch_data_type = 'l' | |||
| else: | |||
| self.batch_data_type = 'd' | |||
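# A hedged sketch of the tuple field names introduced above (illustrative):
# pad the nested field {'a': {'b': ...}} with -1 and skip field 'c' entirely.
collator = Collator(backend='torch')
collator.set_pad(field_name=('a', 'b'), pad_val=-1)
collator.set_ignore('c')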
| @@ -165,8 +169,8 @@ class Collator: | |||
| collator.set_ignore('field1', 'field2') | |||
:param field_names: the names of the fields to ignore. If the Dataset's __getitem__ method returns a dict, the field's
    key can be used directly; for a nested dict, the keys of different levels can be joined with @@, e.g. a@@b for {'a': {'b': 1}};
    if __getitem__ returns a Sequence, '_0', '_1' denote the 0th or 1st element of the sequence.
    key can be used directly; for a nested dict, a tuple can be used, e.g. ('a', 'b') for {'a': {'b': 1}}; if
    __getitem__ returns a Sequence, '_0', '_1' denote the 0th or 1st element of the sequence.
:return: the Collator itself
| """ | |||
| for field_name in field_names: | |||
| @@ -149,6 +149,7 @@ def is_number(dtype): | |||
| if dtype in (float, int, complex, bool) and not is_numpy_generic_class(dtype) \ | |||
| and not is_numpy_number_dtype(dtype): | |||
| return True | |||
| return False | |||
| except: | |||
| return False | |||
| @@ -161,6 +162,7 @@ if __name__ == '__main__': | |||
| # print(type(b[0])) | |||
| # print(b) | |||
| # import torch | |||
print(is_number(type('a')))  # False
| print(is_number_or_numpy_number(type(3))) # True | |||
| print(is_number_or_numpy_number(type(3.1))) # True | |||
| print(is_number_or_numpy_number(type('3'))) # False | |||
| @@ -2,54 +2,58 @@ from collections import defaultdict | |||
| from functools import reduce | |||
| from typing import Sequence, Mapping, Dict | |||
| NESTED_DICT_SEPARATOR = '@@' | |||
| def unpack_batch_mapping(batch:Sequence[Mapping])->Dict: | |||
| def unpack_batch_mapping(batch:Sequence[Mapping], ignore_fields:set)->Dict: | |||
| """ | |||
Converts a Sequence[Mapping] into a Dict. E.g. [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] -> {'a': [[1, 2], [3]], 'b': [1, 2]}
:param batch: the list of samples.
:param ignore_fields: the fields to skip.
:return:
| """ | |||
| dict_batch = defaultdict(list) | |||
| for sample in batch: | |||
| for key, value in sample.items(): | |||
| if key in ignore_fields: | |||
| continue | |||
| dict_batch[key].append(value) | |||
| return dict_batch | |||
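# Illustration of the new ignore_fields argument: 'b' is dropped while the
# remaining field is batched as before.
example = unpack_batch_mapping([{'a': 1, 'b': 2}, {'a': 3, 'b': 4}], ignore_fields={'b'})
assert dict(example) == {'a': [1, 3]}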
| def unpack_batch_nested_mapping(batch:Sequence[Mapping], _parent='')->Dict: | |||
| def unpack_batch_nested_mapping(batch:Sequence[Mapping], ignore_fields:set, stop_deep_fields:set)->Dict: | |||
| """ | |||
Flattens the contents of nested dicts into a single flat dict.
:param batch:
:param _parent: for internal use
:param ignore_fields: the fields to ignore.
:param stop_deep_fields: fields whose values should not be unpacked any further
:return:
| """ | |||
| dict_batch = defaultdict(list) | |||
| if _parent != '': | |||
| _parent += NESTED_DICT_SEPARATOR | |||
| for sample in batch: | |||
| for key, value in sample.items(): | |||
| if isinstance(value, Mapping): | |||
| _dict_batch = _unpack_batch_nested_mapping(value, _parent=_parent + key) | |||
| if key in ignore_fields: | |||
| continue | |||
| if isinstance(value, Mapping) and key not in stop_deep_fields: | |||
| _dict_batch = _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent=(key,)) | |||
| for key, value in _dict_batch.items(): | |||
| dict_batch[key].append(value) | |||
| else: | |||
| dict_batch[_parent + key].append(value) | |||
| dict_batch[key].append(value) | |||
| return dict_batch | |||
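# Illustration: nested keys become tuples of keys, and ignored fields are dropped.
example = unpack_batch_nested_mapping(
    [{'a': {'b': 1}, 'c': 2}, {'a': {'b': 3}, 'c': 4}],
    ignore_fields=set(), stop_deep_fields=set())
assert dict(example) == {('a', 'b'): [1, 3], 'c': [2, 4]}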
| def _unpack_batch_nested_mapping(value, _parent)->Dict: | |||
| def _unpack_batch_nested_mapping(value, ignore_fields, stop_deep_fields, _parent)->Dict: | |||
| _dict = {} | |||
| _parent += NESTED_DICT_SEPARATOR | |||
| for k, v in value.items(): | |||
| if isinstance(v, Mapping): | |||
| __dict = _unpack_batch_nested_mapping(v, _parent=_parent + k) | |||
| _k = _parent + (k,) | |||
| if _k in ignore_fields: | |||
| continue | |||
| if isinstance(v, Mapping) and _k not in stop_deep_fields: | |||
| __dict = _unpack_batch_nested_mapping(v, ignore_fields, stop_deep_fields, _parent=_k) | |||
| _dict.update(__dict) | |||
| else: | |||
| _dict[_parent + k] = v | |||
| _dict[_k] = v | |||
| return _dict | |||
| @@ -63,10 +67,11 @@ def pack_batch_nested_mapping(batch:Mapping) -> Dict: | |||
| dicts = [] | |||
| for key, value in batch.items(): | |||
| keys = key.split(NESTED_DICT_SEPARATOR) | |||
| d = {keys[-1]: value} | |||
| for key in keys[:-1:][::-1]: | |||
| d = {key: d} | |||
| if not isinstance(key, tuple): | |||
| key = [key] | |||
| d = {key[-1]: value} | |||
| for k in key[:-1:][::-1]: | |||
| d = {k: d} | |||
| dicts.append(d) | |||
| return reduce(_merge_dict, dicts) | |||
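# Round-trip illustration: tuple keys produced by the nested unpacking above
# are folded back into nested dicts.
assert pack_batch_nested_mapping({('a', 'b'): [1, 3], 'c': [2, 4]}) == {'a': {'b': [1, 3]}, 'c': [2, 4]}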
| @@ -85,17 +90,21 @@ def _merge_dict(a, b, path=None): | |||
| return a | |||
| def unpack_batch_sequence(batch:Sequence[Sequence])->Dict: | |||
| def unpack_batch_sequence(batch:Sequence[Sequence], ignore_fields)->Dict: | |||
| """ | |||
Converts a Sequence[Sequence] into a Mapping. E.g. [[[1, 2], 2], [[3], 2]] -> {'_0': [[1, 2], [3]], '_1': [2, 2]}
:param batch:
:param ignore_fields: the fields to ignore
| :return: | |||
| """ | |||
| dict_batch = defaultdict(list) | |||
| for sample in batch: | |||
| for i, content in enumerate(sample): | |||
| dict_batch[f'_{i}'].append(content) | |||
| field_name = f'_{i}' | |||
| if field_name in ignore_fields: | |||
| continue | |||
| dict_batch[field_name].append(content) | |||
| return dict_batch | |||
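# Illustration: positional fields are named '_0', '_1', ... and can be ignored by name.
example = unpack_batch_sequence([[[1, 2], 2], [[3], 2]], ignore_fields={'_1'})
assert dict(example) == {'_0': [[1, 2], [3]]}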
| @@ -19,8 +19,8 @@ from .evaluator import Evaluator | |||
| from fastNLP.core.controllers.utils.utils import TrainerEventTrigger, _TruncatedDataLoader | |||
| from fastNLP.core.callbacks import Callback, CallbackManager, Events, EventsList | |||
| from fastNLP.core.callbacks.callback import _CallbackWrapper | |||
| from fastNLP.core.callbacks.callback_manager import prepare_callbacks | |||
| from fastNLP.core.callbacks.callback_events import _SingleEventState | |||
| from fastNLP.core.callbacks.progress_callback import choose_progress_callback | |||
| from fastNLP.core.drivers import Driver | |||
| from fastNLP.core.drivers.utils import choose_driver | |||
| from fastNLP.core.utils import get_fn_arg_names, match_and_substitute_params, nullcontext | |||
| @@ -133,7 +133,7 @@ class Trainer(TrainerEventTrigger): | |||
| ["all", "ignore", "only_error"];当该参数的值不是以上值时,该值应当表示一个文件夹的名字,我们会将其他 rank 的输出流重定向到 | |||
| log 文件中,然后将 log 文件保存在通过该参数值设定的文件夹中;默认为 "only_error"; | |||
| progress_bar: 以哪种方式显示 progress ,目前支持[None, 'raw', 'rich', 'auto'] 或者 RichCallback, RawTextCallback对象, | |||
| 默认为 auto , auto 表示如果检测到当前 terminal 为交互型 则使用 RichCallback,否则使用 RawTextCallback对象。如果 | |||
| 默认为 auto , auto 表示如果检测到当前 terminal 为交互型则使用 RichCallback,否则使用 RawTextCallback对象。如果 | |||
| 需要定制 progress bar 的参数,例如打印频率等,可以传入 RichCallback, RawTextCallback 对象。 | |||
| train_input_mapping: 与 input_mapping 一致,但是只用于 train 中。与 input_mapping 互斥。 | |||
| train_output_mapping: 与 output_mapping 一致,但是只用于 train 中。与 output_mapping 互斥。 | |||
| @@ -212,17 +212,7 @@ class Trainer(TrainerEventTrigger): | |||
| self.driver.set_optimizers(optimizers=optimizers) | |||
# pick a ProgressCallback according to the progress_bar argument
| progress_bar_callback = choose_progress_callback(kwargs.get('progress_bar', 'auto')) | |||
| if progress_bar_callback is not None: | |||
| if callbacks is None: | |||
| callbacks = [] | |||
| elif not isinstance(callbacks, Sequence): | |||
| callbacks = [callbacks] | |||
| callbacks = list(callbacks) + [progress_bar_callback] | |||
| else: | |||
rank_zero_call(logger.warning)("No progress bar is provided, there will be no information output "
                               "during training.")
| callbacks = prepare_callbacks(callbacks, kwargs.get('progress_bar', 'auto')) | |||
# initialize the callback manager;
| self.callback_manager = CallbackManager(callbacks) | |||
# register all the function-style callbacks;
| @@ -1,7 +0,0 @@ | |||
| __all__ = [ | |||
| 'FDataLoader' | |||
| ] | |||
| class FDataLoader: | |||
| pass | |||
| @@ -17,7 +17,7 @@ if _NEED_IMPORT_TORCH: | |||
| from torch.utils.data import DataLoader, Sampler | |||
| from torch.utils.data._utils.collate import default_collate | |||
| else: | |||
| from ..fdataloader import FDataLoader as DataLoader | |||
| from fastNLP.core.utils.dummy_class import DummyClass as DataLoader | |||
| class _FDataSet: | |||
| @@ -14,7 +14,7 @@ if _NEED_IMPORT_PADDLE: | |||
| import paddle | |||
| def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]], | |||
| model: paddle.nn.Layer, **kwargs) -> PaddleDriver: | |||
| model: "paddle.nn.Layer", **kwargs) -> PaddleDriver: | |||
| r""" | |||
Determines and initializes a concrete `Driver` instance according to the arguments `driver` and `device`, and returns it;
1. If the current process is detected to have been launched by the user via `python -m paddle.distributed.launch xxx.py`, then
| @@ -11,8 +11,8 @@ from fastNLP.core.log import logger | |||
| from fastNLP.envs import FASTNLP_BACKEND_LAUNCH | |||
| def initialize_torch_driver(driver: str, device: Optional[Union[str, torch.device, int, List[int]]], | |||
| model: torch.nn.Module, **kwargs) -> TorchDriver: | |||
| def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.device", int, List[int]]], | |||
| model: "torch.nn.Module", **kwargs) -> TorchDriver: | |||
| r""" | |||
Determines and initializes a concrete `Driver` instance according to the arguments `driver` and `device`, and returns it;
Note: if the given `device` does not match the `driver`, an error is raised immediately;
| @@ -28,7 +28,7 @@ class Accuracy(Metric): | |||
| def get_metric(self) -> dict: | |||
| r""" | |||
The get_metric function computes the final evaluation result from the metric statistics accumulated by the evaluate function.
The get_metric function computes the final evaluation result from the metric statistics accumulated by the update function.
| :return dict evaluate_result: {"acc": float} | |||
| """ | |||
| @@ -37,7 +37,7 @@ class Accuracy(Metric): | |||
| def update(self, pred, target, seq_len=None): | |||
| r""" | |||
The evaluate function accumulates the evaluation metrics over one batch of predictions
The update function accumulates the evaluation metrics over one batch of predictions

:param torch.Tensor pred: the prediction tensor; its shape can be torch.Size([B,]), torch.Size([B, n_classes]),
    torch.Size([B, max_len]), or torch.Size([B, max_len, n_classes])
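# A hedged usage sketch of the update/get_metric split (illustrative; assumes
# torch is installed and that this import path is available):
import torch
from fastNLP.core.metrics import Accuracy

metric = Accuracy()
metric.update(pred=torch.tensor([1, 0, 1]), target=torch.tensor([1, 1, 1]))
print(metric.get_metric())  # roughly {'acc': 0.666667}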
| @@ -56,7 +56,7 @@ class ClassifyFPreRecMetric(Metric): | |||
| def get_metric(self) -> dict: | |||
| r""" | |||
The get_metric function computes the final evaluation result from the metric statistics accumulated by the evaluate function.
The get_metric function computes the final evaluation result from the metric statistics accumulated by the update function.
| :return dict evaluate_result: {"acc": float} | |||
| """ | |||
| @@ -117,7 +117,7 @@ class ClassifyFPreRecMetric(Metric): | |||
| def update(self, pred, target, seq_len=None): | |||
| r""" | |||
The evaluate function accumulates the evaluation metrics over one batch of predictions
The update function accumulates the evaluation metrics over one batch of predictions

:param torch.Tensor pred: the prediction tensor; its shape can be torch.Size([B,]), torch.Size([B, n_classes]),
    torch.Size([B, max_len]), or torch.Size([B, max_len, n_classes])
| @@ -11,9 +11,8 @@ _IS_ALLENNLP_AVAILABLE = _module_available('allennlp') | |||
| if _IS_ALLENNLP_AVAILABLE: | |||
| from allennlp.training.metrics import Metric as allennlp_Metric | |||
| if _NEED_IMPORT_TORCH and _IS_TORCHMETRICS_AVAILABLE: | |||
| if _IS_TORCHMETRICS_AVAILABLE: | |||
| from torchmetrics import Metric as torchmetrics_Metric | |||
| if _IS_TORCHMETRICS_AVAILABLE: | |||
| from torchmetrics import Metric as torchmetrics_Metric | |||
| if _NEED_IMPORT_PADDLE: | |||
| from paddle.metric import Metric as paddle_Metric | |||
| @@ -1,4 +1,5 @@ | |||
| import functools | |||
| class DummyClass: | |||
| pass | |||
| def __call__(self, *args, **kwargs): | |||
| return | |||
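# Behavioural sketch (illustrative): with __call__ defined, a DummyClass
# instance silently swallows any call, so backend-free code paths degrade to
# no-ops instead of failing at import time.
placeholder = DummyClass()
assert placeholder(1, 2, key='value') is None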
| @@ -0,0 +1 @@ | |||
| """基于 transformers-4.11.3 版本迁移""" | |||
| @@ -0,0 +1,9 @@ | |||
| """ | |||
In order to avoid incompatibilities caused by version changes of
https://github.com/huggingface/transformers, this folder and its subfolders
are copied from version 4.11.3 of https://github.com/huggingface/transformers.
| """ | |||
| __version__ = "4.11.3" | |||
| from .models import * | |||
| @@ -0,0 +1,125 @@ | |||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import math | |||
| from packaging import version | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| from torch import nn, tanh, sigmoid | |||
| from torch.nn.functional import relu | |||
| else: | |||
| from fastNLP.core.utils.dummy_class import ( | |||
| DummyClass as relu, | |||
| DummyClass as tanh, | |||
| DummyClass as sigmoid, | |||
| ) | |||
| def _gelu_python(x): | |||
| """ | |||
| Original Implementation of the GELU activation function in Google BERT repo when initially created. For | |||
| information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + | |||
| torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional | |||
| Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 | |||
| """ | |||
| return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) | |||
| def gelu_new(x): | |||
| """ | |||
| Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see | |||
| the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 | |||
| """ | |||
| return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) | |||
| if _NEED_IMPORT_TORCH: | |||
| if version.parse(torch.__version__) < version.parse("1.4"): | |||
| gelu = _gelu_python | |||
| else: | |||
| gelu = nn.functional.gelu | |||
| else: | |||
| from fastNLP.core.utils.dummy_class import DummyClass as gelu | |||
| def gelu_fast(x): | |||
| return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) | |||
| def quick_gelu(x): | |||
| return x * torch.sigmoid(1.702 * x) | |||
| def _silu_python(x): | |||
| """ | |||
| See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear | |||
| Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function | |||
| Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated | |||
| Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with | |||
| later. | |||
| """ | |||
| return x * torch.sigmoid(x) | |||
| if _NEED_IMPORT_TORCH: | |||
| if version.parse(torch.__version__) < version.parse("1.7"): | |||
| silu = _silu_python | |||
| else: | |||
| silu = nn.functional.silu | |||
| else: | |||
| from fastNLP.core.utils.dummy_class import DummyClass as silu | |||
| def _mish_python(x): | |||
| """ | |||
| See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also | |||
| visit the official repository for the paper: https://github.com/digantamisra98/Mish | |||
| """ | |||
| return x * torch.tanh(nn.functional.softplus(x)) | |||
| if _NEED_IMPORT_TORCH: | |||
| if version.parse(torch.__version__) < version.parse("1.9"): | |||
| mish = _mish_python | |||
| else: | |||
| mish = nn.functional.mish | |||
| else: | |||
| from fastNLP.core.utils.dummy_class import DummyClass as mish | |||
| def linear_act(x): | |||
| return x | |||
| ACT2FN = { | |||
| "relu": relu, | |||
| "silu": silu, | |||
| "swish": silu, | |||
| "gelu": gelu, | |||
| "tanh": tanh, | |||
| "gelu_new": gelu_new, | |||
| "gelu_fast": gelu_fast, | |||
| "quick_gelu": quick_gelu, | |||
| "mish": mish, | |||
| "linear": linear_act, | |||
| "sigmoid": sigmoid, | |||
| } | |||
| def get_activation(activation_string): | |||
| if activation_string in ACT2FN: | |||
| return ACT2FN[activation_string] | |||
| else: | |||
| raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") | |||
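# A hedged usage sketch of the ACT2FN lookup (runs only when torch is available):
if _NEED_IMPORT_TORCH:
    act = get_activation("gelu_new")
    y = act(torch.randn(4))  # elementwise activation; output has the input's shape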
| @@ -0,0 +1,777 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ Configuration base class and utilities.""" | |||
| import copy | |||
| import json | |||
| import os | |||
| from typing import Any, Dict, Tuple, Union | |||
| from . import __version__ | |||
| from .file_utils import ( | |||
| CONFIG_NAME, | |||
| cached_path, | |||
| hf_bucket_url, | |||
| is_offline_mode, | |||
| is_remote_url, | |||
| ) | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| class PretrainedConfig: | |||
| r""" | |||
| Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as | |||
| methods for loading/downloading/saving configurations. | |||
| Note: | |||
| A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to | |||
| initialize a model does **not** load the model weights. It only affects the model's configuration. | |||
| Class attributes (overridden by derived classes) | |||
| - **model_type** (:obj:`str`) -- An identifier for the model type, serialized into the JSON file, and used to | |||
| recreate the correct object in :class:`~transformers.AutoConfig`. | |||
| - **is_composition** (:obj:`bool`) -- Whether the config class is composed of multiple sub-configs. In this | |||
| case the config has to be initialized from two or more configs of type | |||
| :class:`~transformers.PretrainedConfig` like: :class:`~transformers.EncoderDecoderConfig` or | |||
| :class:`~RagConfig`. | |||
| - **keys_to_ignore_at_inference** (:obj:`List[str]`) -- A list of keys to ignore by default when looking at | |||
| dictionary outputs of the model during inference. | |||
| - **attribute_map** (:obj:`Dict[str, str]`) -- A dict that maps model specific attribute names to the | |||
| standardized naming of attributes. | |||
| Common attributes (present in all subclasses) | |||
| - **vocab_size** (:obj:`int`) -- The number of tokens in the vocabulary, which is also the first dimension of | |||
| the embeddings matrix (this attribute may be missing for models that don't have a text modality like ViT). | |||
| - **hidden_size** (:obj:`int`) -- The hidden size of the model. | |||
| - **num_attention_heads** (:obj:`int`) -- The number of attention heads used in the multi-head attention layers | |||
| of the model. | |||
| - **num_hidden_layers** (:obj:`int`) -- The number of blocks in the model. | |||
| Args: | |||
| name_or_path (:obj:`str`, `optional`, defaults to :obj:`""`): | |||
| Store the string that was passed to :func:`~transformers.PreTrainedModel.from_pretrained` or | |||
| :func:`~transformers.TFPreTrainedModel.from_pretrained` as ``pretrained_model_name_or_path`` if the | |||
| configuration was created with such a method. | |||
| output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the model should return all hidden-states. | |||
| output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
Whether or not the model should return all attentions.
| return_dict (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not the model should return a :class:`~transformers.file_utils.ModelOutput` instead of a plain | |||
| tuple. | |||
| is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether the model is used as an encoder/decoder or not. | |||
| is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether the model is used as decoder or not (in which case it's used as an encoder). | |||
| add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether cross-attention layers should be added to the model. Note, this option is only relevant for models | |||
that can be used as decoder models within the :class:`~transformers.EncoderDecoderModel` class, which
| consists of all models in ``AUTO_MODELS_FOR_CAUSAL_LM``. | |||
| tie_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder | |||
| and decoder model to have the exact same parameter names. | |||
| prune_heads (:obj:`Dict[int, List[int]]`, `optional`, defaults to :obj:`{}`): | |||
| Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of | |||
| heads to prune in said layer. | |||
| For instance ``{1: [0, 2], 2: [2, 3]}`` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2. | |||
| chunk_size_feed_forward (:obj:`int`, `optional`, defaults to :obj:`0`): | |||
| The chunk size of all feed forward layers in the residual attention blocks. A chunk size of :obj:`0` means | |||
| that the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes | |||
| :obj:`n` < sequence_length embeddings at a time. For more information on feed forward chunking, see `How | |||
| does Feed Forward Chunking work? <../glossary.html#feed-forward-chunking>`__ . | |||
| Parameters for sequence generation | |||
| - **max_length** (:obj:`int`, `optional`, defaults to 20) -- Maximum length that will be used by default in the | |||
| :obj:`generate` method of the model. | |||
| - **min_length** (:obj:`int`, `optional`, defaults to 10) -- Minimum length that will be used by default in the | |||
| :obj:`generate` method of the model. | |||
| - **do_sample** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default in the | |||
:obj:`generate` method of the model. Whether or not to use sampling; use greedy decoding otherwise.
| - **early_stopping** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Flag that will be used by default | |||
| in the :obj:`generate` method of the model. Whether to stop the beam search when at least ``num_beams`` | |||
| sentences are finished per batch or not. | |||
| - **num_beams** (:obj:`int`, `optional`, defaults to 1) -- Number of beams for beam search that will be used by | |||
| default in the :obj:`generate` method of the model. 1 means no beam search. | |||
| - **num_beam_groups** (:obj:`int`, `optional`, defaults to 1) -- Number of groups to divide :obj:`num_beams` | |||
| into in order to ensure diversity among different groups of beams that will be used by default in the | |||
| :obj:`generate` method of the model. 1 means no group beam search. | |||
| - **diversity_penalty** (:obj:`float`, `optional`, defaults to 0.0) -- Value to control diversity for group | |||
| beam search. that will be used by default in the :obj:`generate` method of the model. 0 means no diversity | |||
| penalty. The higher the penalty, the more diverse are the outputs. | |||
- **temperature** (:obj:`float`, `optional`, defaults to 1) -- The value used to modulate the next token
| probabilities that will be used by default in the :obj:`generate` method of the model. Must be strictly | |||
| positive. | |||
| - **top_k** (:obj:`int`, `optional`, defaults to 50) -- Number of highest probability vocabulary tokens to keep | |||
| for top-k-filtering that will be used by default in the :obj:`generate` method of the model. | |||
| - **top_p** (:obj:`float`, `optional`, defaults to 1) -- Value that will be used by default in the | |||
| :obj:`generate` method of the model for ``top_p``. If set to float < 1, only the most probable tokens with | |||
| probabilities that add up to ``top_p`` or higher are kept for generation. | |||
| - **repetition_penalty** (:obj:`float`, `optional`, defaults to 1) -- Parameter for repetition penalty that | |||
| will be used by default in the :obj:`generate` method of the model. 1.0 means no penalty. | |||
| - **length_penalty** (:obj:`float`, `optional`, defaults to 1) -- Exponential penalty to the length that will | |||
| be used by default in the :obj:`generate` method of the model. | |||
| - **no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by default in the | |||
| :obj:`generate` method of the model for ``no_repeat_ngram_size``. If set to int > 0, all ngrams of that size | |||
| can only occur once. | |||
| - **encoder_no_repeat_ngram_size** (:obj:`int`, `optional`, defaults to 0) -- Value that will be used by | |||
| default in the :obj:`generate` method of the model for ``encoder_no_repeat_ngram_size``. If set to int > 0, | |||
| all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the ``decoder_input_ids``. | |||
| - **bad_words_ids** (:obj:`List[int]`, `optional`) -- List of token ids that are not allowed to be generated | |||
| that will be used by default in the :obj:`generate` method of the model. In order to get the tokens of the | |||
| words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, | |||
| add_prefix_space=True)`. | |||
| - **num_return_sequences** (:obj:`int`, `optional`, defaults to 1) -- Number of independently computed returned | |||
| sequences for each element in the batch that will be used by default in the :obj:`generate` method of the | |||
| model. | |||
| - **output_scores** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should return the | |||
| logits when used for generation | |||
| - **return_dict_in_generate** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether the model should | |||
| return a :class:`~transformers.file_utils.ModelOutput` instead of a :obj:`torch.LongTensor` | |||
| - **forced_bos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the first generated token | |||
| after the :obj:`decoder_start_token_id`. Useful for multilingual models like :doc:`mBART | |||
| <../model_doc/mbart>` where the first generated token needs to be the target language token. | |||
| - **forced_eos_token_id** (:obj:`int`, `optional`) -- The id of the token to force as the last generated token | |||
| when :obj:`max_length` is reached. | |||
| - **remove_invalid_values** (:obj:`bool`, `optional`) -- Whether to remove possible `nan` and `inf` outputs of | |||
| the model to prevent the generation method to crash. Note that using ``remove_invalid_values`` can slow down | |||
| generation. | |||
| Parameters for fine-tuning tasks | |||
| - **architectures** (:obj:`List[str]`, `optional`) -- Model architectures that can be used with the model | |||
| pretrained weights. | |||
| - **finetuning_task** (:obj:`str`, `optional`) -- Name of the task used to fine-tune the model. This can be | |||
| used when converting from an original (TensorFlow or PyTorch) checkpoint. | |||
| - **id2label** (:obj:`Dict[int, str]`, `optional`) -- A map from index (for instance prediction index, or | |||
| target index) to label. | |||
| - **label2id** (:obj:`Dict[str, int]`, `optional`) -- A map from label to index for the model. | |||
| - **num_labels** (:obj:`int`, `optional`) -- Number of labels to use in the last layer added to the model, | |||
| typically for a classification task. | |||
| - **task_specific_params** (:obj:`Dict[str, Any]`, `optional`) -- Additional keyword arguments to store for the | |||
| current task. | |||
| - **problem_type** (:obj:`str`, `optional`) -- Problem type for :obj:`XxxForSequenceClassification` models. Can | |||
| be one of (:obj:`"regression"`, :obj:`"single_label_classification"`, :obj:`"multi_label_classification"`). | |||
| Please note that this parameter is only available in the following models: `AlbertForSequenceClassification`, | |||
| `BertForSequenceClassification`, `BigBirdForSequenceClassification`, `ConvBertForSequenceClassification`, | |||
| `DistilBertForSequenceClassification`, `ElectraForSequenceClassification`, `FunnelForSequenceClassification`, | |||
| `LongformerForSequenceClassification`, `MobileBertForSequenceClassification`, | |||
| `ReformerForSequenceClassification`, `RobertaForSequenceClassification`, | |||
| `SqueezeBertForSequenceClassification`, `XLMForSequenceClassification` and `XLNetForSequenceClassification`. | |||
| Parameters linked to the tokenizer | |||
| - **tokenizer_class** (:obj:`str`, `optional`) -- The name of the associated tokenizer class to use (if none is | |||
| set, will use the tokenizer associated to the model by default). | |||
| - **prefix** (:obj:`str`, `optional`) -- A specific prompt that should be added at the beginning of each text | |||
| before calling the model. | |||
- **bos_token_id** (:obj:`int`, `optional`) -- The id of the `beginning-of-stream` token.
- **pad_token_id** (:obj:`int`, `optional`) -- The id of the `padding` token.
- **eos_token_id** (:obj:`int`, `optional`) -- The id of the `end-of-stream` token.
- **decoder_start_token_id** (:obj:`int`, `optional`) -- If an encoder-decoder model starts decoding with a
different token than `bos`, the id of that token.
- **sep_token_id** (:obj:`int`, `optional`) -- The id of the `separation` token.
| PyTorch specific parameters | |||
| - **torchscript** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should be | |||
| used with Torchscript. | |||
| - **tie_word_embeddings** (:obj:`bool`, `optional`, defaults to :obj:`True`) -- Whether the model's input and | |||
output word embeddings should be tied. Note that this is only relevant if the model has an output word
| embedding layer. | |||
| - **torch_dtype** (:obj:`str`, `optional`) -- The :obj:`dtype` of the weights. This attribute can be used to | |||
| initialize the model to a non-default ``dtype`` (which is normally ``float32``) and thus allow for optimal | |||
| storage allocation. For example, if the saved model is ``float16``, ideally we want to load it back using the | |||
| minimal amount of memory needed to load ``float16`` weights. Since the config object is stored in plain text, | |||
| this attribute contains just the floating type string without the ``torch.`` prefix. For example, for | |||
| ``torch.float16`` ``torch_dtype`` is the ``"float16"`` string. | |||
| This attribute is currently not being used during model loading time, but this may change in the future | |||
| versions. But we can already start preparing for the future by saving the dtype with save_pretrained. | |||
| TensorFlow specific parameters | |||
| - **use_bfloat16** (:obj:`bool`, `optional`, defaults to :obj:`False`) -- Whether or not the model should use | |||
| BFloat16 scalars (only used by some TensorFlow models). | |||
| """ | |||
| model_type: str = "" | |||
| is_composition: bool = False | |||
| attribute_map: Dict[str, str] = {} | |||
| def __setattr__(self, key, value): | |||
| if key in super().__getattribute__("attribute_map"): | |||
| key = super().__getattribute__("attribute_map")[key] | |||
| super().__setattr__(key, value) | |||
| def __getattribute__(self, key): | |||
| if key != "attribute_map" and key in super().__getattribute__("attribute_map"): | |||
| key = super().__getattribute__("attribute_map")[key] | |||
| return super().__getattribute__(key) | |||
| def __init__(self, **kwargs): | |||
| # Attributes with defaults | |||
| self.return_dict = kwargs.pop("return_dict", True) | |||
| self.output_hidden_states = kwargs.pop("output_hidden_states", False) | |||
| self.output_attentions = kwargs.pop("output_attentions", False) | |||
| self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models | |||
| self.torch_dtype = kwargs.pop("torch_dtype", None) # Only used by PyTorch models | |||
| self.use_bfloat16 = kwargs.pop("use_bfloat16", False) | |||
| self.pruned_heads = kwargs.pop("pruned_heads", {}) | |||
| self.tie_word_embeddings = kwargs.pop( | |||
| "tie_word_embeddings", True | |||
| ) # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models. | |||
| # Is decoder is used in encoder-decoder models to differentiate encoder from decoder | |||
| self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False) | |||
| self.is_decoder = kwargs.pop("is_decoder", False) | |||
| self.add_cross_attention = kwargs.pop("add_cross_attention", False) | |||
| self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False) | |||
| # Parameters for sequence generation | |||
| self.max_length = kwargs.pop("max_length", 20) | |||
| self.min_length = kwargs.pop("min_length", 0) | |||
| self.do_sample = kwargs.pop("do_sample", False) | |||
| self.early_stopping = kwargs.pop("early_stopping", False) | |||
| self.num_beams = kwargs.pop("num_beams", 1) | |||
| self.num_beam_groups = kwargs.pop("num_beam_groups", 1) | |||
| self.diversity_penalty = kwargs.pop("diversity_penalty", 0.0) | |||
| self.temperature = kwargs.pop("temperature", 1.0) | |||
| self.top_k = kwargs.pop("top_k", 50) | |||
| self.top_p = kwargs.pop("top_p", 1.0) | |||
| self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0) | |||
| self.length_penalty = kwargs.pop("length_penalty", 1.0) | |||
| self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0) | |||
| self.encoder_no_repeat_ngram_size = kwargs.pop("encoder_no_repeat_ngram_size", 0) | |||
| self.bad_words_ids = kwargs.pop("bad_words_ids", None) | |||
| self.num_return_sequences = kwargs.pop("num_return_sequences", 1) | |||
| self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0) | |||
| self.output_scores = kwargs.pop("output_scores", False) | |||
| self.return_dict_in_generate = kwargs.pop("return_dict_in_generate", False) | |||
| self.forced_bos_token_id = kwargs.pop("forced_bos_token_id", None) | |||
| self.forced_eos_token_id = kwargs.pop("forced_eos_token_id", None) | |||
| self.remove_invalid_values = kwargs.pop("remove_invalid_values", False) | |||
| # Fine-tuning task arguments | |||
| self.architectures = kwargs.pop("architectures", None) | |||
| self.finetuning_task = kwargs.pop("finetuning_task", None) | |||
| self.id2label = kwargs.pop("id2label", None) | |||
| self.label2id = kwargs.pop("label2id", None) | |||
| if self.id2label is not None: | |||
| kwargs.pop("num_labels", None) | |||
| self.id2label = dict((int(key), value) for key, value in self.id2label.items()) | |||
| # Keys are always strings in JSON so convert ids to int here. | |||
| else: | |||
| self.num_labels = kwargs.pop("num_labels", 2) | |||
| if self.torch_dtype is not None and isinstance(self.torch_dtype, str): | |||
| # we will start using self.torch_dtype in v5, but to be consistent with | |||
| # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| self.torch_dtype = getattr(torch, self.torch_dtype) | |||
| # Tokenizer arguments TODO: eventually tokenizer and models should share the same config | |||
| self.tokenizer_class = kwargs.pop("tokenizer_class", None) | |||
| self.prefix = kwargs.pop("prefix", None) | |||
| self.bos_token_id = kwargs.pop("bos_token_id", None) | |||
| self.pad_token_id = kwargs.pop("pad_token_id", None) | |||
| self.eos_token_id = kwargs.pop("eos_token_id", None) | |||
| self.sep_token_id = kwargs.pop("sep_token_id", None) | |||
| self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) | |||
| # task specific arguments | |||
| self.task_specific_params = kwargs.pop("task_specific_params", None) | |||
| # regression / multi-label classification | |||
| self.problem_type = kwargs.pop("problem_type", None) | |||
| allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification") | |||
| if self.problem_type is not None and self.problem_type not in allowed_problem_types: | |||
| raise ValueError( | |||
| f"The config parameter `problem_type` was not understood: received {self.problem_type}" | |||
| "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid." | |||
| ) | |||
| # TPU arguments | |||
| if kwargs.pop("xla_device", None) is not None: | |||
| logger.warning( | |||
| "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can " | |||
| "safely remove it from your `config.json` file." | |||
| ) | |||
| # Name or path to the pretrained checkpoint | |||
| self._name_or_path = str(kwargs.pop("name_or_path", "")) | |||
| # Drop the transformers version info | |||
| self.transformers_version = kwargs.pop("transformers_version", None) | |||
| # Deal with gradient checkpointing | |||
| if kwargs.get("gradient_checkpointing", False): | |||
| logger.warn( | |||
| "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 " | |||
| "Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the " | |||
| "`Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`." | |||
| ) | |||
| # Additional attributes without default values | |||
| for key, value in kwargs.items(): | |||
| try: | |||
| setattr(self, key, value) | |||
| except AttributeError as err: | |||
| logger.error(f"Can't set {key} with value {value} for {self}") | |||
| raise err | |||
| @property | |||
| def name_or_path(self) -> str: | |||
| return self._name_or_path | |||
| @name_or_path.setter | |||
| def name_or_path(self, value): | |||
| self._name_or_path = str(value) # Make sure that name_or_path is a string (for JSON encoding) | |||
| @property | |||
| def use_return_dict(self) -> bool: | |||
| """ | |||
| :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples. | |||
| """ | |||
| # If torchscript is set, force `return_dict=False` to avoid jit errors | |||
| return self.return_dict and not self.torchscript | |||
| @property | |||
| def num_labels(self) -> int: | |||
| """ | |||
| :obj:`int`: The number of labels for classification models. | |||
| """ | |||
| return len(self.id2label) | |||
| @num_labels.setter | |||
| def num_labels(self, num_labels: int): | |||
| if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels: | |||
| self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)} | |||
| self.label2id = dict(zip(self.id2label.values(), self.id2label.keys())) | |||
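# Illustration of the setter above (doctest-style): setting num_labels
# synthesizes default label maps.
# >>> config = PretrainedConfig(num_labels=3)
# >>> config.id2label
# {0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}
# >>> config.label2id
# {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}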
| def save_pretrained(self, save_directory: Union[str, os.PathLike], **kwargs): | |||
| """ | |||
| Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the | |||
| :func:`~transformers.PretrainedConfig.from_pretrained` class method. | |||
| Args: | |||
| save_directory (:obj:`str` or :obj:`os.PathLike`): | |||
| Directory where the configuration JSON file will be saved (will be created if it does not exist). | |||
| push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to push your model to the Hugging Face model hub after saving it. | |||
| .. warning:: | |||
| Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with | |||
| :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are | |||
| pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory | |||
| instead. | |||
| kwargs: | |||
| Additional key word arguments passed along to the | |||
| :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method. | |||
| """ | |||
| if os.path.isfile(save_directory): | |||
| raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file") | |||
| os.makedirs(save_directory, exist_ok=True) | |||
| # If we save using the predefined names, we can load using `from_pretrained` | |||
| output_config_file = os.path.join(save_directory, CONFIG_NAME) | |||
| self.to_json_file(output_config_file, use_diff=True) | |||
| logger.info(f"Configuration saved in {output_config_file}") | |||
| @classmethod | |||
| def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": | |||
| r""" | |||
| Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model | |||
| configuration. | |||
| Args: | |||
| pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): | |||
| This can be either: | |||
| - a string, the `model id` of a pretrained model configuration hosted inside a model repo on | |||
| huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or | |||
| namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. | |||
| - a path to a `directory` containing a configuration file saved using the | |||
| :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``. | |||
| - a path or url to a saved configuration JSON `file`, e.g., | |||
| ``./my_model_directory/configuration.json``. | |||
| cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): | |||
| Path to a directory in which a downloaded pretrained model configuration should be cached if the | |||
| standard cache should not be used. | |||
| force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to force to (re-)download the configuration files and override the cached versions if | |||
| they exist. | |||
| resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to delete incompletely received file. Attempts to resume the download if such a file | |||
| exists. | |||
| proxies (:obj:`Dict[str, str]`, `optional`): | |||
| A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', | |||
| 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. | |||
| use_auth_token (:obj:`str` or `bool`, `optional`): | |||
| The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token | |||
| generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`). | |||
| revision(:obj:`str`, `optional`, defaults to :obj:`"main"`): | |||
| The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a | |||
| git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any | |||
| identifier allowed by git. | |||
| return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| If :obj:`False`, then this function returns just the final configuration object. | |||
If :obj:`True`, then this function returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
| is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., | |||
| the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. | |||
| kwargs (:obj:`Dict[str, Any]`, `optional`): | |||
| The values in kwargs of any keys which are configuration attributes will be used to override the loaded | |||
| values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled | |||
| by the ``return_unused_kwargs`` keyword parameter. | |||
| .. note:: | |||
| Passing :obj:`use_auth_token=True` is required when you want to use a private model. | |||
| Returns: | |||
| :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model. | |||
| Examples:: | |||
| # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a | |||
| # derived class: BertConfig | |||
| config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from huggingface.co and cache. | |||
| config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')` | |||
| config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json') | |||
| config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) | |||
| assert config.output_attentions == True | |||
| config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, | |||
| foo=False, return_unused_kwargs=True) | |||
| assert config.output_attentions == True | |||
| assert unused_kwargs == {'foo': False} | |||
| """ | |||
| config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) | |||
| if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: | |||
| logger.warn( | |||
| f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " | |||
| f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." | |||
| ) | |||
| return cls.from_dict(config_dict, **kwargs) | |||
| @classmethod | |||
| def get_config_dict( | |||
| cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs | |||
| ) -> Tuple[Dict[str, Any], Dict[str, Any]]: | |||
| """ | |||
| From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a | |||
| :class:`~transformers.PretrainedConfig` using ``from_dict``. | |||
| Parameters: | |||
| pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): | |||
| The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. | |||
| Returns: | |||
| :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. | |||
| """ | |||
| cache_dir = kwargs.pop("cache_dir", None) | |||
| force_download = kwargs.pop("force_download", False) | |||
| resume_download = kwargs.pop("resume_download", False) | |||
| proxies = kwargs.pop("proxies", None) | |||
| use_auth_token = kwargs.pop("use_auth_token", None) | |||
| local_files_only = kwargs.pop("local_files_only", False) | |||
| revision = kwargs.pop("revision", None) | |||
| from_pipeline = kwargs.pop("_from_pipeline", None) | |||
| from_auto_class = kwargs.pop("_from_auto", False) | |||
| user_agent = {"file_type": "config", "from_auto_class": from_auto_class} | |||
| if from_pipeline is not None: | |||
| user_agent["using_pipeline"] = from_pipeline | |||
| if is_offline_mode() and not local_files_only: | |||
| logger.info("Offline mode: forcing local_files_only=True") | |||
| local_files_only = True | |||
| pretrained_model_name_or_path = str(pretrained_model_name_or_path) | |||
| if os.path.isdir(pretrained_model_name_or_path): | |||
| config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) | |||
| elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): | |||
| config_file = pretrained_model_name_or_path | |||
| else: | |||
| config_file = hf_bucket_url( | |||
| pretrained_model_name_or_path, filename=CONFIG_NAME, revision=revision, mirror=None | |||
| ) | |||
| try: | |||
| # Load from URL or cache if already cached | |||
| resolved_config_file = cached_path( | |||
| config_file, | |||
| cache_dir=cache_dir, | |||
| force_download=force_download, | |||
| proxies=proxies, | |||
| resume_download=resume_download, | |||
| local_files_only=local_files_only, | |||
| use_auth_token=use_auth_token, | |||
| user_agent=user_agent, | |||
| ) | |||
| # Load config dict | |||
| config_dict = cls._dict_from_json_file(resolved_config_file) | |||
| except EnvironmentError as err: | |||
| logger.error(err) | |||
| msg = ( | |||
| f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n" | |||
| f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n" | |||
| f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" | |||
| ) | |||
| if revision is not None: | |||
| msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" | |||
| raise EnvironmentError(msg) | |||
| except (json.JSONDecodeError, UnicodeDecodeError): | |||
| msg = ( | |||
| f"Couldn't reach server at '{config_file}' to download configuration file or " | |||
| "configuration file is not a valid JSON file. " | |||
| f"Please check network or file content here: {resolved_config_file}." | |||
| ) | |||
| raise EnvironmentError(msg) | |||
| if resolved_config_file == config_file: | |||
| logger.info(f"loading configuration file {config_file}") | |||
| else: | |||
| logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") | |||
| return config_dict, kwargs | |||
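| # Editorial sketch (not part of the original diff): `get_config_dict` resolves a name | |||
| # or path to the raw config dict and hands unconsumed kwargs back unchanged. Assumes | |||
| # network access to huggingface.co for the model id used below. | |||
| @staticmethod | |||
| def _example_get_config_dict(): | |||
|     config_dict, remaining = PretrainedConfig.get_config_dict("bert-base-uncased", num_labels=2) | |||
|     assert config_dict["model_type"] == "bert"  # read from the hosted config.json | |||
|     assert remaining == {"num_labels": 2}  # not consumed here, handed back to the caller | |||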
| @classmethod | |||
| def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig": | |||
| """ | |||
| Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters. | |||
| Args: | |||
| config_dict (:obj:`Dict[str, Any]`): | |||
| Dictionary that will be used to instantiate the configuration object. Such a dictionary can be | |||
| retrieved from a pretrained checkpoint by leveraging the | |||
| :func:`~transformers.PretrainedConfig.get_config_dict` method. | |||
| kwargs (:obj:`Dict[str, Any]`): | |||
| Additional parameters from which to initialize the configuration object. | |||
| Returns: | |||
| :class:`PretrainedConfig`: The configuration object instantiated from those parameters. | |||
| """ | |||
| return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) | |||
| config = cls(**config_dict) | |||
| if hasattr(config, "pruned_heads"): | |||
| config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) | |||
| # Update config with kwargs if needed | |||
| to_remove = [] | |||
| for key, value in kwargs.items(): | |||
| if hasattr(config, key): | |||
| setattr(config, key, value) | |||
| if key != "torch_dtype": | |||
| to_remove.append(key) | |||
| for key in to_remove: | |||
| kwargs.pop(key, None) | |||
| logger.info(f"Model config {config}") | |||
| if return_unused_kwargs: | |||
| return config, kwargs | |||
| else: | |||
| return config | |||
| @classmethod | |||
| def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig": | |||
| """ | |||
| Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters. | |||
| Args: | |||
| json_file (:obj:`str` or :obj:`os.PathLike`): | |||
| Path to the JSON file containing the parameters. | |||
| Returns: | |||
| :class:`PretrainedConfig`: The configuration object instantiated from that JSON file. | |||
| """ | |||
| config_dict = cls._dict_from_json_file(json_file) | |||
| return cls(**config_dict) | |||
| @classmethod | |||
| def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): | |||
| with open(json_file, "r", encoding="utf-8") as reader: | |||
| text = reader.read() | |||
| return json.loads(text) | |||
| def __eq__(self, other): | |||
| return self.__dict__ == other.__dict__ | |||
| def __repr__(self): | |||
| return f"{self.__class__.__name__} {self.to_json_string()}" | |||
| def to_diff_dict(self) -> Dict[str, Any]: | |||
| """ | |||
| Removes all attributes from the config which correspond to the default config attributes, for better readability, | |||
| and serializes to a Python dictionary. | |||
| Returns: | |||
| :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. | |||
| """ | |||
| config_dict = self.to_dict() | |||
| # get the default config dict | |||
| default_config_dict = PretrainedConfig().to_dict() | |||
| # get class specific config dict | |||
| class_config_dict = self.__class__().to_dict() if not self.is_composition else {} | |||
| serializable_config_dict = {} | |||
| # only serialize values that differ from the default config | |||
| for key, value in config_dict.items(): | |||
| if ( | |||
| key not in default_config_dict | |||
| or key == "transformers_version" | |||
| or value != default_config_dict[key] | |||
| or (key in class_config_dict and value != class_config_dict[key]) | |||
| ): | |||
| serializable_config_dict[key] = value | |||
| self.dict_torch_dtype_to_str(serializable_config_dict) | |||
| return serializable_config_dict | |||
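| # Editorial sketch (not in the original diff): only values that differ from the | |||
| # defaults survive `to_diff_dict`. `_DemoConfig` is a hypothetical minimal subclass | |||
| # defined purely for this illustration. | |||
| @staticmethod | |||
| def _example_to_diff_dict(): | |||
|     class _DemoConfig(PretrainedConfig): | |||
|         model_type = "demo" | |||
|     config = _DemoConfig(output_attentions=True) | |||
|     diff = config.to_diff_dict() | |||
|     assert diff["output_attentions"] is True  # changed from the default False | |||
|     assert "output_hidden_states" not in diff  # still at the shared default | |||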
| def to_dict(self) -> Dict[str, Any]: | |||
| """ | |||
| Serializes this instance to a Python dictionary. | |||
| Returns: | |||
| :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. | |||
| """ | |||
| output = copy.deepcopy(self.__dict__) | |||
| if hasattr(self.__class__, "model_type"): | |||
| output["model_type"] = self.__class__.model_type | |||
| # Transformers version when serializing the model | |||
| output["transformers_version"] = __version__ | |||
| self.dict_torch_dtype_to_str(output) | |||
| return output | |||
| def to_json_string(self, use_diff: bool = True) -> str: | |||
| """ | |||
| Serializes this instance to a JSON string. | |||
| Args: | |||
| use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| If set to ``True``, only the difference between the config instance and the default | |||
| ``PretrainedConfig()`` is serialized to JSON string. | |||
| Returns: | |||
| :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format. | |||
| """ | |||
| if use_diff is True: | |||
| config_dict = self.to_diff_dict() | |||
| else: | |||
| config_dict = self.to_dict() | |||
| return json.dumps(config_dict, indent=2, sort_keys=True) + "\n" | |||
| def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True): | |||
| """ | |||
| Save this instance to a JSON file. | |||
| Args: | |||
| json_file_path (:obj:`str` or :obj:`os.PathLike`): | |||
| Path to the JSON file in which this configuration instance's parameters will be saved. | |||
| use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| If set to ``True``, only the difference between the config instance and the default | |||
| ``PretrainedConfig()`` is serialized to JSON file. | |||
| """ | |||
| with open(json_file_path, "w", encoding="utf-8") as writer: | |||
| writer.write(self.to_json_string(use_diff=use_diff)) | |||
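| # Editorial sketch (not in the original diff): a save/reload round trip through JSON. | |||
| # `use_diff=False` writes the full attribute set rather than only the diff. | |||
| @staticmethod | |||
| def _example_json_round_trip(): | |||
|     import os, tempfile | |||
|     config = PretrainedConfig(output_attentions=True) | |||
|     with tempfile.TemporaryDirectory() as tmp: | |||
|         path = os.path.join(tmp, "config.json") | |||
|         config.to_json_file(path, use_diff=False) | |||
|         reloaded = PretrainedConfig.from_json_file(path) | |||
|     assert reloaded.output_attentions is True | |||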
| def update(self, config_dict: Dict[str, Any]): | |||
| """ | |||
| Updates attributes of this class with attributes from ``config_dict``. | |||
| Args: | |||
| config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class. | |||
| """ | |||
| for key, value in config_dict.items(): | |||
| setattr(self, key, value) | |||
| def update_from_string(self, update_str: str): | |||
| """ | |||
| Updates attributes of this class with attributes from ``update_str``. | |||
| The expected format is ints, floats and strings as is, and for booleans use ``true`` or ``false``. For example: | |||
| "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index" | |||
| The keys to change have to already exist in the config object. | |||
| Args: | |||
| update_str (:obj:`str`): String with attributes that should be updated for this class. | |||
| """ | |||
| d = dict(x.split("=") for x in update_str.split(",")) | |||
| for k, v in d.items(): | |||
| if not hasattr(self, k): | |||
| raise ValueError(f"key {k} isn't in the original config dict") | |||
| old_v = getattr(self, k) | |||
| if isinstance(old_v, bool): | |||
| if v.lower() in ["true", "1", "y", "yes"]: | |||
| v = True | |||
| elif v.lower() in ["false", "0", "n", "no"]: | |||
| v = False | |||
| else: | |||
| raise ValueError(f"can't derive true or false from {v} (key {k})") | |||
| elif isinstance(old_v, int): | |||
| v = int(v) | |||
| elif isinstance(old_v, float): | |||
| v = float(v) | |||
| elif not isinstance(old_v, str): | |||
| raise ValueError( | |||
| f"You can only update int, float, bool or string values in the config, got {v} for key {k}" | |||
| ) | |||
| setattr(self, k, v) | |||
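| # Editorial sketch (not in the original diff): values are coerced to the type of the | |||
| # existing attribute, so "false" becomes a real bool and "10" a real int. `_DemoConfig` | |||
| # is a hypothetical subclass defined purely for this illustration. | |||
| @staticmethod | |||
| def _example_update_from_string(): | |||
|     class _DemoConfig(PretrainedConfig): | |||
|         def __init__(self, n_embd=768, scale_attn_weights=True, **kwargs): | |||
|             super().__init__(**kwargs) | |||
|             self.n_embd = n_embd | |||
|             self.scale_attn_weights = scale_attn_weights | |||
|     config = _DemoConfig() | |||
|     config.update_from_string("n_embd=10,scale_attn_weights=false") | |||
|     assert config.n_embd == 10 and config.scale_attn_weights is False | |||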
| def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None: | |||
| """ | |||
| Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a | |||
| string of just the type. For example, :obj:`torch.float32` gets converted into the `"float32"` string, which can | |||
| then be stored in the json format. | |||
| """ | |||
| if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): | |||
| d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] | |||
| @@ -0,0 +1,388 @@ | |||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ | |||
| Integration with Deepspeed | |||
| """ | |||
| import importlib.util | |||
| import io | |||
| import json | |||
| import weakref | |||
| from copy import deepcopy | |||
| from functools import partialmethod | |||
| from .utils.versions import require_version | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| def is_deepspeed_available(): | |||
| return importlib.util.find_spec("deepspeed") is not None | |||
| class HfDeepSpeedConfig: | |||
| """ | |||
| This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage. | |||
| A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where | |||
| things like the Trainer object are not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``). | |||
| Therefore it's important that this object remains alive while the program is still running. | |||
| :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to | |||
| sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder | |||
| values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way. | |||
| Args: | |||
| config_file_or_dict (:obj:`Union[str, Dict]`): path to DeepSpeed config file or dict. | |||
| """ | |||
| def __init__(self, config_file_or_dict): | |||
| # set global weakref object | |||
| set_hf_deepspeed_config(self) | |||
| require_version("deepspeed>=0.5.3") | |||
| if isinstance(config_file_or_dict, dict): | |||
| # Don't modify user's data should they want to reuse it (e.g. in tests), because once we | |||
| # modified it, it will not be accepted here again, since `auto` values would have been overridden | |||
| config = deepcopy(config_file_or_dict) | |||
| elif isinstance(config_file_or_dict, str): | |||
| with io.open(config_file_or_dict, "r", encoding="utf-8") as f: | |||
| config = json.load(f) | |||
| else: | |||
| raise ValueError("expecting either a path to a DeepSpeed config file or a pre-populated dict") | |||
| self.config = config | |||
| # zero stage - this is done as early as possible, before model is created, to allow | |||
| # the ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object | |||
| # during ``zero.Init()``, which needs to know whether fp16 is enabled, the dtype, etc. | |||
| self._stage = self.get_value("zero_optimization.stage", -1) | |||
| # offload | |||
| self._offload = False | |||
| if self.is_zero2() or self.is_zero3(): | |||
| offload_devices_valid = set(["cpu", "nvme"]) | |||
| offload_devices = set( | |||
| [ | |||
| self.get_value("zero_optimization.offload_optimizer.device"), | |||
| self.get_value("zero_optimization.offload_param.device"), | |||
| ] | |||
| ) | |||
| if len(offload_devices & offload_devices_valid) > 0: | |||
| self._offload = True | |||
| def find_config_node(self, ds_key_long): | |||
| config = self.config | |||
| # find the config node of interest if it exists | |||
| nodes = ds_key_long.split(".") | |||
| ds_key = nodes.pop() | |||
| for node in nodes: | |||
| config = config.get(node) | |||
| if config is None: | |||
| return None, ds_key | |||
| return config, ds_key | |||
| def get_value(self, ds_key_long, default=None): | |||
| """ | |||
| Returns the set value or ``default`` if no value is set | |||
| """ | |||
| config, ds_key = self.find_config_node(ds_key_long) | |||
| if config is None: | |||
| return default | |||
| return config.get(ds_key, default) | |||
| def is_true(self, ds_key_long): | |||
| """ | |||
| Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to | |||
| ask the very specific question of whether the value is set to :obj:`True` (and it's not set to :obj:`False` or | |||
| isn't set). | |||
| """ | |||
| value = self.get_value(ds_key_long) | |||
| return False if value is None else bool(value) | |||
| def is_false(self, ds_key_long): | |||
| """ | |||
| Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to | |||
| ask the very specific question of whether the value is set to :obj:`False` (and it's not set to :obj:`True` or | |||
| isn't set). | |||
| """ | |||
| value = self.get_value(ds_key_long) | |||
| return False if value is None else not bool(value) | |||
| def is_zero2(self): | |||
| return self._stage == 2 | |||
| def is_zero3(self): | |||
| return self._stage == 3 | |||
| def is_offload(self): | |||
| return self._offload | |||
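| # Editorial sketch (not in the original diff): dotted-key queries against a ZeRO-3 | |||
| # config. The constructor requires deepspeed>=0.5.3 to be installed. | |||
| @staticmethod | |||
| def _example_queries(): | |||
|     ds_config = {"zero_optimization": {"stage": 3, "offload_param": {"device": "cpu"}}} | |||
|     hf_ds = HfDeepSpeedConfig(ds_config) | |||
|     assert hf_ds.is_zero3() and not hf_ds.is_zero2() | |||
|     assert hf_ds.is_offload()  # "cpu" is one of the valid offload devices | |||
|     assert hf_ds.get_value("zero_optimization.offload_param.device") == "cpu" | |||
|     assert hf_ds.get_value("missing.key", 42) == 42  # default for absent nodes | |||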
| class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig): | |||
| """ | |||
| The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has | |||
| the same lifespan as the latter. | |||
| """ | |||
| def __init__(self, config_file_or_dict): | |||
| super().__init__(config_file_or_dict) | |||
| self._dtype = torch.float16 | |||
| self.mismatches = [] | |||
| def dtype(self): | |||
| return self._dtype | |||
| def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True): | |||
| """ | |||
| A utility method that massages the config file and can optionally verify that the values match. | |||
| 1. Replace "auto" values with ``TrainingArguments`` value. | |||
| 2. If it wasn't "auto" and ``must_match`` is true, then check that DS config matches Trainer | |||
| config values and, if mismatched, add the entry to ``self.mismatches`` - will raise during | |||
| ``trainer_config_finalize`` for one or more mismatches. | |||
| """ | |||
| config, ds_key = self.find_config_node(ds_key_long) | |||
| if config is None: | |||
| return | |||
| if config.get(ds_key) == "auto": | |||
| config[ds_key] = hf_val | |||
| return | |||
| if not must_match: | |||
| return | |||
| ds_val = config.get(ds_key) | |||
| if ds_val is not None and ds_val != hf_val: | |||
| self.mismatches.append(f"- ds {ds_key_long}={ds_val} vs hf {hf_key}={hf_val}") | |||
| fill_only = partialmethod(fill_match, must_match=False) | |||
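| # Editorial sketch (not in the original diff): "auto" placeholders are replaced by the | |||
| # Trainer-side value, while explicit conflicting values are recorded in `mismatches` | |||
| # instead of being overwritten. Assumes torch and deepspeed are available. | |||
| @staticmethod | |||
| def _example_fill_match(): | |||
|     hf_ds = HfTrainerDeepSpeedConfig({"optimizer": {"params": {"lr": "auto"}}}) | |||
|     hf_ds.fill_match("optimizer.params.lr", 5e-5, "learning_rate") | |||
|     assert hf_ds.config["optimizer"]["params"]["lr"] == 5e-5 | |||
|     hf_ds = HfTrainerDeepSpeedConfig({"optimizer": {"params": {"lr": 0.1}}}) | |||
|     hf_ds.fill_match("optimizer.params.lr", 5e-5, "learning_rate") | |||
|     assert hf_ds.mismatches == ["- ds optimizer.params.lr=0.1 vs hf learning_rate=5e-05"] | |||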
| def trainer_config_process(self, args): | |||
| """ | |||
| Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object | |||
| creation. | |||
| """ | |||
| # DeepSpeed does: | |||
| # train_batch_size = world_size * train_micro_batch_size_per_gpu * gradient_accumulation_steps | |||
| train_batch_size = args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps | |||
| self.fill_match( | |||
| "train_micro_batch_size_per_gpu", args.per_device_train_batch_size, "per_device_train_batch_size" | |||
| ) | |||
| self.fill_match("gradient_accumulation_steps", args.gradient_accumulation_steps, "gradient_accumulation_steps") | |||
| self.fill_match("train_batch_size", train_batch_size, "train_batch_size (calculated)") | |||
| self.fill_match("gradient_clipping", args.max_grad_norm, "max_grad_norm") | |||
| self.fill_match("optimizer.params.lr", args.learning_rate, "learning_rate") | |||
| self.fill_match("optimizer.params.betas", [args.adam_beta1, args.adam_beta2], "adam_beta1+adam_beta2") | |||
| self.fill_match("optimizer.params.eps", args.adam_epsilon, "adam_epsilon") | |||
| self.fill_match("optimizer.params.weight_decay", args.weight_decay, "weight_decay") | |||
| self.fill_only("scheduler.params.warmup_min_lr", 0) # not a trainer arg | |||
| self.fill_match("scheduler.params.warmup_max_lr", args.learning_rate, "learning_rate") | |||
| # total_num_steps - will get set in trainer_config_finalize | |||
| # fp16 | |||
| if args.fp16: | |||
| fp16_backend = "apex" if args.fp16_backend == "apex" else "amp" | |||
| else: | |||
| fp16_backend = None | |||
| # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set | |||
| # any here unless the user did the work | |||
| self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16+fp16_backend(amp)") | |||
| # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any | |||
| # ZeRO features | |||
| self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)") | |||
| self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level") | |||
| # only if we have an explicit fp16.enabled = False then it's fp32, if it's True or this | |||
| # whole config section is missing then the fallback is fp16 | |||
| if self.is_false("fp16.enabled"): | |||
| self._dtype = torch.float32 | |||
| # later there will be other dtypes besides just fp16 and fp32 | |||
| # also not quite sure what dtype should be under apex, defaulting to fp16 for now | |||
| def trainer_config_finalize(self, args, model, num_training_steps): | |||
| """ | |||
| This stage is run after we have the model and know num_training_steps. | |||
| Now we can complete the configuration process. | |||
| """ | |||
| # zero | |||
| if self.is_zero3(): | |||
| # automatically assign the optimal config values based on model config | |||
| hidden_size = model.config.hidden_size | |||
| self.fill_only("zero_optimization.reduce_bucket_size", hidden_size * hidden_size) | |||
| self.fill_only("zero_optimization.stage3_prefetch_bucket_size", 0.9 * hidden_size * hidden_size) | |||
| self.fill_only("zero_optimization.stage3_param_persistence_threshold", 10 * hidden_size) | |||
| # scheduler | |||
| self.fill_match("scheduler.params.total_num_steps", num_training_steps, "num_training_steps (calculated)") | |||
| self.fill_match("scheduler.params.warmup_num_steps", args.get_warmup_steps(num_training_steps), "warmup_steps") | |||
| if len(self.mismatches) > 0: | |||
| mismatches = "\n".join(self.mismatches) | |||
| raise ValueError( | |||
| f"Please correct the following DeepSpeed config values that mismatch TrainingArguments values:\n{mismatches}\n" | |||
| "The easiest method is to set these DeepSpeed config values to 'auto'." | |||
| ) | |||
| # keep the config object global to be able to access it anywhere during TrainingArguments life-cycle | |||
| _hf_deepspeed_config_weak_ref = None | |||
| def set_hf_deepspeed_config(hf_deepspeed_config_obj): | |||
| # this is a special weakref global object to allow us to get to Deepspeed config from APIs | |||
| # that don't have an easy way to get to the Deepspeed config outside of the Trainer domain. | |||
| global _hf_deepspeed_config_weak_ref | |||
| # will go away automatically when HfDeepSpeedConfig is destroyed (when TrainingArguments is destroyed) | |||
| _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) | |||
| def is_deepspeed_zero3_enabled(): | |||
| if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | |||
| return _hf_deepspeed_config_weak_ref().is_zero3() | |||
| else: | |||
| return False | |||
| def deepspeed_config(): | |||
| if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: | |||
| return _hf_deepspeed_config_weak_ref().config | |||
| else: | |||
| return None | |||
| def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): | |||
| """ | |||
| Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. | |||
| If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. | |||
| Args: | |||
| trainer: Trainer object | |||
| num_training_steps: number of training steps, per single gpu | |||
| resume_from_checkpoint: path to a checkpoint, if resuming after the normal DeepSpeedEngine load | |||
| Returns: model, optimizer, lr_scheduler | |||
| """ | |||
| import deepspeed | |||
| from deepspeed.utils import logger as ds_logger | |||
| model = trainer.model | |||
| args = trainer.args | |||
| hf_deepspeed_config = args.hf_deepspeed_config | |||
| hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) | |||
| # resume config update - some bits like `model` and `num_training_steps` only become available during train | |||
| config = hf_deepspeed_config.config | |||
| # Optimizer + Scheduler | |||
| # Currently supported combos: | |||
| # 1. DS scheduler + DS optimizer: Yes | |||
| # 2. HF scheduler + HF optimizer: Yes | |||
| # 3. DS scheduler + HF optimizer: Yes | |||
| # 4. HF scheduler + DS optimizer: Yes | |||
| # | |||
| # Unless Offload is enabled in which case it's: | |||
| # 1. DS scheduler + DS optimizer: Yes | |||
| # 2. HF scheduler + HF optimizer: Mostly* | |||
| # 3. DS scheduler + HF optimizer: Mostly* | |||
| # 4. HF scheduler + DS optimizer: Yes | |||
| # | |||
| # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) | |||
| optimizer = None | |||
| if "optimizer" in config: | |||
| if args.adafactor: | |||
| raise ValueError( | |||
| "--adafactor was passed, but also found `optimizer` configured in the DeepSpeed config. " | |||
| "Only one optimizer can be configured." | |||
| ) | |||
| else: | |||
| if hf_deepspeed_config.is_offload(): | |||
| logger.info( | |||
| "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)" | |||
| ) | |||
| # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. | |||
| # But trainer uses AdamW by default. | |||
| optimizer = trainer.create_optimizer() | |||
| # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` | |||
| config["zero_allow_untested_optimizer"] = True | |||
| def _lr_scheduler_callable(optimizer): | |||
| return trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) | |||
| lr_scheduler = None | |||
| if "scheduler" not in config: | |||
| if optimizer is None: | |||
| # Optimizer is not available, so use callable to defer lr_scheduler creation to DS init | |||
| lr_scheduler = _lr_scheduler_callable | |||
| else: | |||
| lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) | |||
| # keep for quick debug: | |||
| # from pprint import pprint; pprint(config) | |||
| # set the Deepspeed log level consistent with the trainer | |||
| ds_logger.setLevel(args.get_process_log_level()) | |||
| model_parameters = filter(lambda p: p.requires_grad, model.parameters()) | |||
| model, optimizer, _, lr_scheduler = deepspeed.initialize( | |||
| model=model, | |||
| model_parameters=model_parameters, | |||
| config_params=config, | |||
| optimizer=optimizer, | |||
| lr_scheduler=lr_scheduler, | |||
| ) | |||
| if resume_from_checkpoint is not None: | |||
| # it's possible that the user is trying to resume from model_path, which doesn't necessarily | |||
| # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's | |||
| # a resume from a checkpoint and not just a local pretrained weight. So we check here if the | |||
| # path contains what looks like a deepspeed checkpoint | |||
| import glob | |||
| deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) | |||
| if len(deepspeed_checkpoint_dirs) > 0: | |||
| logger.info(f"Attempting to resume from {resume_from_checkpoint}") | |||
| # this magically updates self.optimizer and self.lr_scheduler | |||
| load_path, _ = model.load_checkpoint( | |||
| resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True | |||
| ) | |||
| if load_path is None: | |||
| raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") | |||
| else: | |||
| logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") | |||
| return model, optimizer, lr_scheduler | |||
| @@ -0,0 +1,934 @@ | |||
| import copy | |||
| import fnmatch | |||
| import importlib.util | |||
| import io | |||
| import json | |||
| import os | |||
| import re | |||
| import shutil | |||
| import sys | |||
| import tarfile | |||
| import tempfile | |||
| import operator | |||
| from collections import OrderedDict, UserDict | |||
| from contextlib import contextmanager | |||
| from dataclasses import fields | |||
| from enum import Enum | |||
| from functools import partial | |||
| from hashlib import sha256 | |||
| from pathlib import Path | |||
| from typing import Any, BinaryIO, Dict, Optional, Tuple, Union | |||
| from urllib.parse import urlparse | |||
| from uuid import uuid4 | |||
| from zipfile import ZipFile, is_zipfile | |||
| import numpy as np | |||
| # from tqdm.auto import tqdm | |||
| import requests | |||
| from . import __version__ | |||
| from .utils.versions import importlib_metadata | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH, _TORCH_GREATER_EQUAL_1_8 | |||
| from fastNLP.envs.utils import _compare_version | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| _torch_version = importlib_metadata.version("torch") | |||
| hf_cache_home = os.path.expanduser( | |||
| os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface")) | |||
| ) | |||
| default_cache_path = os.path.join(hf_cache_home, "transformers") | |||
| PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path) | |||
| PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE) | |||
| TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE) | |||
| SESSION_ID = uuid4().hex | |||
| ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"} | |||
| DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", "NO").upper() in ENV_VARS_TRUE_VALUES | |||
| WEIGHTS_NAME = "pytorch_model.bin" | |||
| DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] | |||
| _staging_mode = os.environ.get("HUGGINGFACE_CO_STAGING", "NO").upper() in ENV_VARS_TRUE_VALUES | |||
| _default_endpoint = "https://moon-staging.huggingface.co" if _staging_mode else "https://huggingface.co" | |||
| HUGGINGFACE_CO_RESOLVE_ENDPOINT = os.environ.get("HUGGINGFACE_CO_RESOLVE_ENDPOINT", _default_endpoint) | |||
| HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}" | |||
| CONFIG_NAME = "config.json" | |||
| _is_offline_mode = True if os.environ.get("TRANSFORMERS_OFFLINE", "0").upper() in ENV_VARS_TRUE_VALUES else False | |||
| @contextmanager | |||
| def filelock(path): | |||
|     fd = None | |||
|     try: | |||
|         import fcntl  # not available on Windows; fall back to no locking | |||
|         open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC | |||
|         fd = os.open(path, open_mode) | |||
|         fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB) | |||
|     except (ImportError, OSError): | |||
|         # lock unavailable or already held by another process; proceed unlocked | |||
|         if fd is not None: | |||
|             os.close(fd) | |||
|             fd = None | |||
|     try: | |||
|         yield | |||
|     finally: | |||
|         if fd is not None: | |||
|             fcntl.flock(fd, fcntl.LOCK_UN) | |||
|             os.close(fd) | |||
| def is_offline_mode(): | |||
| return _is_offline_mode | |||
| def is_training_run_on_sagemaker(): | |||
| return "SAGEMAKER_JOB_NAME" in os.environ | |||
| def add_start_docstrings(*docstr): | |||
| def docstring_decorator(fn): | |||
| fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") | |||
| return fn | |||
| return docstring_decorator | |||
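| # Editorial sketch (not in the original diff): the decorator prepends shared text to | |||
| # the wrapped function's own docstring. | |||
| def _example_add_start_docstrings(): | |||
|     @add_start_docstrings("Shared intro. ") | |||
|     def forward(): | |||
|         """Model-specific details.""" | |||
|     assert forward.__doc__ == "Shared intro. Model-specific details." | |||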
| def add_start_docstrings_to_model_forward(*docstr): | |||
| def docstring_decorator(fn): | |||
| class_name = f":class:`~transformers.{fn.__qualname__.split('.')[0]}`" | |||
| intro = f" The {class_name} forward method, overrides the :func:`__call__` special method." | |||
| note = r""" | |||
| .. note:: | |||
| Although the recipe for forward pass needs to be defined within this function, one should call the | |||
| :class:`Module` instance afterwards instead of this since the former takes care of running the pre and post | |||
| processing steps while the latter silently ignores them. | |||
| """ | |||
| fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "") | |||
| return fn | |||
| return docstring_decorator | |||
| def add_end_docstrings(*docstr): | |||
| def docstring_decorator(fn): | |||
| fn.__doc__ = fn.__doc__ + "".join(docstr) | |||
| return fn | |||
| return docstring_decorator | |||
| PT_RETURN_INTRODUCTION = r""" | |||
| Returns: | |||
| :class:`~{full_output_type}` or :obj:`tuple(torch.FloatTensor)`: A :class:`~{full_output_type}` or a tuple of | |||
| :obj:`torch.FloatTensor` (if ``return_dict=False`` is passed or when ``config.return_dict=False``) comprising | |||
| various elements depending on the configuration (:class:`~transformers.{config_class}`) and inputs. | |||
| """ | |||
| def _get_indent(t): | |||
| """Returns the indentation in the first line of t""" | |||
| search = re.search(r"^(\s*)\S", t) | |||
| return "" if search is None else search.groups()[0] | |||
| def _convert_output_args_doc(output_args_doc): | |||
| """Convert output_args_doc to display properly.""" | |||
| # Split output_arg_doc in blocks argument/description | |||
| indent = _get_indent(output_args_doc) | |||
| blocks = [] | |||
| current_block = "" | |||
| for line in output_args_doc.split("\n"): | |||
| # If the indent is the same as the beginning, the line is the name of new arg. | |||
| if _get_indent(line) == indent: | |||
| if len(current_block) > 0: | |||
| blocks.append(current_block[:-1]) | |||
| current_block = f"{line}\n" | |||
| else: | |||
| # Otherwise it's part of the description of the current arg. | |||
| # We need to remove 2 spaces from the indentation. | |||
| current_block += f"{line[2:]}\n" | |||
| blocks.append(current_block[:-1]) | |||
| # Format each block for proper rendering | |||
| for i in range(len(blocks)): | |||
| blocks[i] = re.sub(r"^(\s+)(\S+)(\s+)", r"\1- **\2**\3", blocks[i]) | |||
| blocks[i] = re.sub(r":\s*\n\s*(\S)", r" -- \1", blocks[i]) | |||
| return "\n".join(blocks) | |||
| def _prepare_output_docstrings(output_type, config_class): | |||
| """ | |||
| Prepares the return part of the docstring using `output_type`. | |||
| """ | |||
| docstrings = output_type.__doc__ | |||
| # Remove the head of the docstring to keep the list of args only | |||
| lines = docstrings.split("\n") | |||
| i = 0 | |||
| while i < len(lines) and re.search(r"^\s*(Args|Parameters):\s*$", lines[i]) is None: | |||
| i += 1 | |||
| if i < len(lines): | |||
| docstrings = "\n".join(lines[(i + 1) :]) | |||
| docstrings = _convert_output_args_doc(docstrings) | |||
| # Add the return introduction | |||
| full_output_type = f"{output_type.__module__}.{output_type.__name__}" | |||
| intro = PT_RETURN_INTRODUCTION | |||
| intro = intro.format(full_output_type=full_output_type, config_class=config_class) | |||
| return intro + docstrings | |||
| PT_TOKEN_CLASSIFICATION_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |||
| >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 | |||
| >>> outputs = model(**inputs, labels=labels) | |||
| >>> loss = outputs.loss | |||
| >>> logits = outputs.logits | |||
| """ | |||
| PT_QUESTION_ANSWERING_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" | |||
| >>> inputs = tokenizer(question, text, return_tensors='pt') | |||
| >>> start_positions = torch.tensor([1]) | |||
| >>> end_positions = torch.tensor([3]) | |||
| >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions) | |||
| >>> loss = outputs.loss | |||
| >>> start_scores = outputs.start_logits | |||
| >>> end_scores = outputs.end_logits | |||
| """ | |||
| PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |||
| >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 | |||
| >>> outputs = model(**inputs, labels=labels) | |||
| >>> loss = outputs.loss | |||
| >>> logits = outputs.logits | |||
| """ | |||
| PT_MASKED_LM_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt") | |||
| >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] | |||
| >>> outputs = model(**inputs, labels=labels) | |||
| >>> loss = outputs.loss | |||
| >>> logits = outputs.logits | |||
| """ | |||
| PT_BASE_MODEL_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |||
| >>> outputs = model(**inputs) | |||
| >>> last_hidden_states = outputs.last_hidden_state | |||
| """ | |||
| PT_MULTIPLE_CHOICE_SAMPLE = r""" | |||
| Example:: | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> import torch | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." | |||
| >>> choice0 = "It is eaten with a fork and a knife." | |||
| >>> choice1 = "It is eaten while held in the hand." | |||
| >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1 | |||
| >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True) | |||
| >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 | |||
| >>> # the linear classifier still needs to be trained | |||
| >>> loss = outputs.loss | |||
| >>> logits = outputs.logits | |||
| """ | |||
| PT_CAUSAL_LM_SAMPLE = r""" | |||
| Example:: | |||
| >>> import torch | |||
| >>> from transformers import {tokenizer_class}, {model_class} | |||
| >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}') | |||
| >>> model = {model_class}.from_pretrained('{checkpoint}') | |||
| >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") | |||
| >>> outputs = model(**inputs, labels=inputs["input_ids"]) | |||
| >>> loss = outputs.loss | |||
| >>> logits = outputs.logits | |||
| """ | |||
| PT_SAMPLE_DOCSTRINGS = { | |||
| "SequenceClassification": PT_SEQUENCE_CLASSIFICATION_SAMPLE, | |||
| "QuestionAnswering": PT_QUESTION_ANSWERING_SAMPLE, | |||
| "TokenClassification": PT_TOKEN_CLASSIFICATION_SAMPLE, | |||
| "MultipleChoice": PT_MULTIPLE_CHOICE_SAMPLE, | |||
| "MaskedLM": PT_MASKED_LM_SAMPLE, | |||
| "LMHead": PT_CAUSAL_LM_SAMPLE, | |||
| "BaseModel": PT_BASE_MODEL_SAMPLE, | |||
| } | |||
| def add_code_sample_docstrings( | |||
| *docstr, tokenizer_class=None, checkpoint=None, output_type=None, config_class=None, mask=None, model_cls=None | |||
| ): | |||
| def docstring_decorator(fn): | |||
| # model_class defaults to function's class if not specified otherwise | |||
| model_class = fn.__qualname__.split(".")[0] if model_cls is None else model_cls | |||
| sample_docstrings = PT_SAMPLE_DOCSTRINGS | |||
| doc_kwargs = dict(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint) | |||
| if "SequenceClassification" in model_class: | |||
| code_sample = sample_docstrings["SequenceClassification"] | |||
| elif "QuestionAnswering" in model_class: | |||
| code_sample = sample_docstrings["QuestionAnswering"] | |||
| elif "TokenClassification" in model_class: | |||
| code_sample = sample_docstrings["TokenClassification"] | |||
| elif "MultipleChoice" in model_class: | |||
| code_sample = sample_docstrings["MultipleChoice"] | |||
| elif "MaskedLM" in model_class or model_class in ["FlaubertWithLMHeadModel", "XLMWithLMHeadModel"]: | |||
| doc_kwargs["mask"] = "[MASK]" if mask is None else mask | |||
| code_sample = sample_docstrings["MaskedLM"] | |||
| elif "LMHead" in model_class or "CausalLM" in model_class: | |||
| code_sample = sample_docstrings["LMHead"] | |||
| elif "Model" in model_class or "Encoder" in model_class: | |||
| code_sample = sample_docstrings["BaseModel"] | |||
| else: | |||
| raise ValueError(f"Docstring can't be built for model {model_class}") | |||
| output_doc = _prepare_output_docstrings(output_type, config_class) if output_type is not None else "" | |||
| built_doc = code_sample.format(**doc_kwargs) | |||
| fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + output_doc + built_doc | |||
| return fn | |||
| return docstring_decorator | |||
| def replace_return_docstrings(output_type=None, config_class=None): | |||
| def docstring_decorator(fn): | |||
| docstrings = fn.__doc__ | |||
| lines = docstrings.split("\n") | |||
| i = 0 | |||
| while i < len(lines) and re.search(r"^\s*Returns?:\s*$", lines[i]) is None: | |||
| i += 1 | |||
| if i < len(lines): | |||
| lines[i] = _prepare_output_docstrings(output_type, config_class) | |||
| docstrings = "\n".join(lines) | |||
| else: | |||
| raise ValueError( | |||
| f"The function {fn} should have an empty 'Return:' or 'Returns:' in its docstring as placeholder, current docstring is:\n{docstrings}" | |||
| ) | |||
| fn.__doc__ = docstrings | |||
| return fn | |||
| return docstring_decorator | |||
| def is_remote_url(url_or_filename): | |||
| parsed = urlparse(url_or_filename) | |||
| return parsed.scheme in ("http", "https") | |||
| def hf_bucket_url( | |||
| model_id: str, filename: str, subfolder: Optional[str] = None, revision: Optional[str] = None, mirror=None | |||
| ) -> str: | |||
| """ | |||
| Resolve a model identifier, a file name, and an optional revision id, to a huggingface.co-hosted url, redirecting | |||
| to Cloudfront (a Content Delivery Network, or CDN) for large files. | |||
| Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our | |||
| bandwidth costs). | |||
| Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here | |||
| because we migrated to a git-based versioning system on huggingface.co, so we now store the files on S3/Cloudfront | |||
| in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache | |||
| can't ever be stale. | |||
| In terms of client-side caching from this library, we base our caching on the objects' ETag. An object's ETag is: | |||
| its sha1 if stored in git, or its sha256 if stored in git-lfs. Files cached locally from transformers before v3.5.0 | |||
| are not shared with those new files, because the cached file's name contains a hash of the url (which changed). | |||
| """ | |||
| if subfolder is not None: | |||
| filename = f"{subfolder}/{filename}" | |||
| if mirror: | |||
| if mirror in ["tuna", "bfsu"]: | |||
| raise ValueError("The Tuna and BFSU mirrors are no longer available. Try removing the mirror argument.") | |||
| legacy_format = "/" not in model_id | |||
| if legacy_format: | |||
| return f"{mirror}/{model_id}-{filename}" | |||
| else: | |||
| return f"{mirror}/{model_id}/{filename}" | |||
| if revision is None: | |||
| revision = "main" | |||
| return HUGGINGFACE_CO_PREFIX.format(model_id=model_id, revision=revision, filename=filename) | |||
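| # Editorial sketch (not in the original diff): the URLs resolved under the default | |||
| # (non-staging) endpoint, without a mirror. | |||
| def _example_hf_bucket_url(): | |||
|     url = hf_bucket_url("bert-base-uncased", "config.json") | |||
|     assert url == "https://huggingface.co/bert-base-uncased/resolve/main/config.json" | |||
|     url = hf_bucket_url("bert-base-uncased", "config.json", revision="v1.0") | |||
|     assert url == "https://huggingface.co/bert-base-uncased/resolve/v1.0/config.json" | |||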
| def url_to_filename(url: str, etag: Optional[str] = None) -> str: | |||
| """ | |||
| Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, | |||
| delimited by a period. If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can | |||
| identify it as a HDF5 file (see | |||
| https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) | |||
| """ | |||
| url_bytes = url.encode("utf-8") | |||
| filename = sha256(url_bytes).hexdigest() | |||
| if etag: | |||
| etag_bytes = etag.encode("utf-8") | |||
| filename += "." + sha256(etag_bytes).hexdigest() | |||
| if url.endswith(".h5"): | |||
| filename += ".h5" | |||
| return filename | |||
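| # Editorial sketch (not in the original diff): the cache file name is the sha256 of | |||
| # the url, with the sha256 of the etag appended when one is known. | |||
| def _example_url_to_filename(): | |||
|     url = "https://huggingface.co/bert-base-uncased/resolve/main/config.json" | |||
|     name = url_to_filename(url, etag='"abc"') | |||
|     url_hash, _, etag_hash = name.partition(".") | |||
|     assert url_hash == sha256(url.encode("utf-8")).hexdigest() | |||
|     assert etag_hash == sha256('"abc"'.encode("utf-8")).hexdigest() | |||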
| def cached_path( | |||
| url_or_filename, | |||
| cache_dir=None, | |||
| force_download=False, | |||
| proxies=None, | |||
| resume_download=False, | |||
| user_agent: Union[Dict, str, None] = None, | |||
| extract_compressed_file=False, | |||
| force_extract=False, | |||
| use_auth_token: Union[bool, str, None] = None, | |||
| local_files_only=False, | |||
| ) -> Optional[str]: | |||
| """ | |||
| Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file | |||
| and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and | |||
| then return the path. | |||
| Args: | |||
| cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). | |||
| force_download: if True, re-download the file even if it's already cached in the cache dir. | |||
| resume_download: if True, resume the download if incompletely received file is found. | |||
| user_agent: Optional string or dict that will be appended to the user-agent on remote requests. | |||
| use_auth_token: Optional string or boolean to use as Bearer token for remote files. If True, | |||
| will get token from ~/.huggingface. | |||
| extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed | |||
| file in a folder along the archive. | |||
| force_extract: if True when extract_compressed_file is True and the archive was already extracted, | |||
| re-extract the archive and override the folder where it was extracted. | |||
| Return: | |||
| Local path (string) of the file, or, if networking is off, the last version of the file cached on disk. | |||
| Raises: | |||
| In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). | |||
| """ | |||
| if cache_dir is None: | |||
| cache_dir = TRANSFORMERS_CACHE | |||
| if isinstance(url_or_filename, Path): | |||
| url_or_filename = str(url_or_filename) | |||
| if isinstance(cache_dir, Path): | |||
| cache_dir = str(cache_dir) | |||
| if is_offline_mode() and not local_files_only: | |||
| logger.info("Offline mode: forcing local_files_only=True") | |||
| local_files_only = True | |||
| if is_remote_url(url_or_filename): | |||
| # URL, so get it from the cache (downloading if necessary) | |||
| output_path = get_from_cache( | |||
| url_or_filename, | |||
| cache_dir=cache_dir, | |||
| force_download=force_download, | |||
| proxies=proxies, | |||
| resume_download=resume_download, | |||
| user_agent=user_agent, | |||
| use_auth_token=use_auth_token, | |||
| local_files_only=local_files_only, | |||
| ) | |||
| elif os.path.exists(url_or_filename): | |||
| # File, and it exists. | |||
| output_path = url_or_filename | |||
| elif urlparse(url_or_filename).scheme == "": | |||
| # File, but it doesn't exist. | |||
| raise EnvironmentError(f"file {url_or_filename} not found") | |||
| else: | |||
| # Something unknown | |||
| raise ValueError(f"unable to parse {url_or_filename} as a URL or as a local path") | |||
| if extract_compressed_file: | |||
| if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path): | |||
| return output_path | |||
| # Path where we extract compressed archives | |||
| # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/" | |||
| output_dir, output_file = os.path.split(output_path) | |||
| output_extract_dir_name = output_file.replace(".", "-") + "-extracted" | |||
| output_path_extracted = os.path.join(output_dir, output_extract_dir_name) | |||
| if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract: | |||
| return output_path_extracted | |||
| # Prevent parallel extractions | |||
| lock_path = output_path + ".lock" | |||
| with filelock(lock_path): | |||
| shutil.rmtree(output_path_extracted, ignore_errors=True) | |||
| os.makedirs(output_path_extracted) | |||
| if is_zipfile(output_path): | |||
|     with ZipFile(output_path, "r") as zip_file: | |||
|         zip_file.extractall(output_path_extracted) | |||
| elif tarfile.is_tarfile(output_path): | |||
|     with tarfile.open(output_path) as tar_file: | |||
|         tar_file.extractall(output_path_extracted) | |||
| else: | |||
| raise EnvironmentError(f"Archive format of {output_path} could not be identified") | |||
| return output_path_extracted | |||
| return output_path | |||
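| # Editorial sketch (not in the original diff): download (or reuse the cached copy of) | |||
| # a hosted config file. Assumes network access on first use. | |||
| def _example_cached_path(): | |||
|     config_url = hf_bucket_url("bert-base-uncased", CONFIG_NAME) | |||
|     local_path = cached_path(config_url) | |||
|     with open(local_path, "r", encoding="utf-8") as f: | |||
|         config_dict = json.load(f) | |||
|     assert config_dict["model_type"] == "bert" | |||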
| def define_sagemaker_information(): | |||
| try: | |||
| instance_data = requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json() | |||
| dlc_container_used = instance_data["Image"] | |||
| dlc_tag = instance_data["Image"].split(":")[1] | |||
| except Exception: | |||
| dlc_container_used = None | |||
| dlc_tag = None | |||
| sagemaker_params = json.loads(os.getenv("SM_FRAMEWORK_PARAMS", "{}")) | |||
| runs_distributed_training = "sagemaker_distributed_dataparallel_enabled" in sagemaker_params | |||
| account_id = os.getenv("TRAINING_JOB_ARN").split(":")[4] if "TRAINING_JOB_ARN" in os.environ else None | |||
| sagemaker_object = { | |||
| "sm_framework": os.getenv("SM_FRAMEWORK_MODULE", None), | |||
| "sm_region": os.getenv("AWS_REGION", None), | |||
| "sm_number_gpu": os.getenv("SM_NUM_GPUS", 0), | |||
| "sm_number_cpu": os.getenv("SM_NUM_CPUS", 0), | |||
| "sm_distributed_training": runs_distributed_training, | |||
| "sm_deep_learning_container": dlc_container_used, | |||
| "sm_deep_learning_container_tag": dlc_tag, | |||
| "sm_account_id": account_id, | |||
| } | |||
| return sagemaker_object | |||
| def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str: | |||
| """ | |||
| Formats a user-agent string with basic info about a request. | |||
| """ | |||
| ua = f"transformers/{__version__}; python/{sys.version.split()[0]}; session_id/{SESSION_ID}" | |||
| if _NEED_IMPORT_TORCH: | |||
| ua += f"; torch/{_torch_version}" | |||
| if DISABLE_TELEMETRY: | |||
| return ua + "; telemetry/off" | |||
| if is_training_run_on_sagemaker(): | |||
| ua += "; " + "; ".join(f"{k}/{v}" for k, v in define_sagemaker_information().items()) | |||
| # CI will set this value to True | |||
| if os.environ.get("TRANSFORMERS_IS_CI", "").upper() in ENV_VARS_TRUE_VALUES: | |||
| ua += "; is_ci/true" | |||
| if isinstance(user_agent, dict): | |||
| ua += "; " + "; ".join(f"{k}/{v}" for k, v in user_agent.items()) | |||
| elif isinstance(user_agent, str): | |||
| ua += "; " + user_agent | |||
| return ua | |||
| def http_get(url: str, temp_file: BinaryIO, proxies=None, resume_size=0, headers: Optional[Dict[str, str]] = None): | |||
| """ | |||
| Download remote file. Do not gobble up errors. | |||
| """ | |||
| headers = copy.deepcopy(headers) | |||
| if resume_size > 0: | |||
| headers["Range"] = f"bytes={resume_size}-" | |||
| r = requests.get(url, stream=True, proxies=proxies, headers=headers) | |||
| r.raise_for_status() | |||
| content_length = r.headers.get("Content-Length") | |||
| total = resume_size + int(content_length) if content_length is not None else None | |||
| # progress = tqdm( | |||
| # unit="B", | |||
| # unit_scale=True, | |||
| # unit_divisor=1024, | |||
| # total=total, | |||
| # initial=resume_size, | |||
| # desc="Downloading", | |||
| # disable=bool(logging.get_verbosity() == logging.NOTSET), | |||
| # ) | |||
| for chunk in r.iter_content(chunk_size=1024): | |||
| if chunk: # filter out keep-alive new chunks | |||
| # progress.update(len(chunk)) | |||
| temp_file.write(chunk) | |||
| # progress.close() | |||
| def get_from_cache( | |||
| url: str, | |||
| cache_dir=None, | |||
| force_download=False, | |||
| proxies=None, | |||
| etag_timeout=10, | |||
| resume_download=False, | |||
| user_agent: Union[Dict, str, None] = None, | |||
| use_auth_token: Union[bool, str, None] = None, | |||
| local_files_only=False, | |||
| ) -> Optional[str]: | |||
| """ | |||
| Given a URL, look for the corresponding file in the local cache. If it's not there, download it. Then return the | |||
| path to the cached file. | |||
| Return: | |||
| Local path (string) of the file, or, if networking is off, the last version of the file cached on disk. | |||
| Raises: | |||
| In case of non-recoverable file (non-existent or inaccessible url + no cache on disk). | |||
| """ | |||
| if cache_dir is None: | |||
| cache_dir = TRANSFORMERS_CACHE | |||
| if isinstance(cache_dir, Path): | |||
| cache_dir = str(cache_dir) | |||
| os.makedirs(cache_dir, exist_ok=True) | |||
| headers = {"user-agent": http_user_agent(user_agent)} | |||
| if isinstance(use_auth_token, str): | |||
| headers["authorization"] = f"Bearer {use_auth_token}" | |||
| elif use_auth_token: | |||
| raise RuntimeError("`use_auth_token=True` is not supported in FastNLP now") | |||
| # token = HfFolder.get_token() | |||
| # if token is None: | |||
| # raise EnvironmentError("You specified use_auth_token=True, but a huggingface token was not found.") | |||
| # headers["authorization"] = f"Bearer {token}" | |||
| url_to_download = url | |||
| etag = None | |||
| if not local_files_only: | |||
| try: | |||
| r = requests.head(url, headers=headers, allow_redirects=False, proxies=proxies, timeout=etag_timeout) | |||
| r.raise_for_status() | |||
| etag = r.headers.get("X-Linked-Etag") or r.headers.get("ETag") | |||
| # We favor a custom header indicating the etag of the linked resource, and | |||
| # we fallback to the regular etag header. | |||
| # If we don't have any of those, raise an error. | |||
| if etag is None: | |||
| raise OSError( | |||
| "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility." | |||
| ) | |||
| # In case of a redirect, | |||
| # save an extra redirect on the request.get call, | |||
| # and ensure we download the exact atomic version even if it changed | |||
| # between the HEAD and the GET (unlikely, but hey). | |||
| if 300 <= r.status_code <= 399: | |||
| url_to_download = r.headers["Location"] | |||
| except (requests.exceptions.SSLError, requests.exceptions.ProxyError): | |||
| # Actually raise for those subclasses of ConnectionError | |||
| raise | |||
| except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): | |||
| # Otherwise, our Internet connection is down. | |||
| # etag is None | |||
| pass | |||
| filename = url_to_filename(url, etag) | |||
| # get cache path to put the file | |||
| cache_path = os.path.join(cache_dir, filename) | |||
| # etag is None == we don't have a connection or we passed local_files_only. | |||
| # try to get the last downloaded one | |||
| if etag is None: | |||
| if os.path.exists(cache_path): | |||
| return cache_path | |||
| else: | |||
| matching_files = [ | |||
| file | |||
| for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*") | |||
| if not file.endswith(".json") and not file.endswith(".lock") | |||
| ] | |||
| if len(matching_files) > 0: | |||
| return os.path.join(cache_dir, matching_files[-1]) | |||
| else: | |||
| # If files cannot be found and local_files_only=True, | |||
| # the models might've been found if local_files_only=False | |||
| # Notify the user about that | |||
| if local_files_only: | |||
| raise FileNotFoundError( | |||
| "Cannot find the requested files in the cached path and outgoing traffic has been" | |||
| " disabled. To enable model look-ups and downloads online, set 'local_files_only'" | |||
| " to False." | |||
| ) | |||
| else: | |||
| raise ValueError( | |||
| "Connection error, and we cannot find the requested files in the cached path." | |||
| " Please try again or make sure your Internet connection is on." | |||
| ) | |||
| # From now on, etag is not None. | |||
| if os.path.exists(cache_path) and not force_download: | |||
| return cache_path | |||
| # Prevent parallel downloads of the same file with a lock. | |||
| lock_path = cache_path + ".lock" | |||
| with filelock(lock_path): | |||
| # If the download just completed while the lock was activated. | |||
| if os.path.exists(cache_path) and not force_download: | |||
| # Even if returning early like here, the lock will be released. | |||
| return cache_path | |||
| if resume_download: | |||
| incomplete_path = cache_path + ".incomplete" | |||
| @contextmanager | |||
| def _resumable_file_manager() -> "io.BufferedWriter": | |||
| with open(incomplete_path, "ab") as f: | |||
| yield f | |||
| temp_file_manager = _resumable_file_manager | |||
| if os.path.exists(incomplete_path): | |||
| resume_size = os.stat(incomplete_path).st_size | |||
| else: | |||
| resume_size = 0 | |||
| else: | |||
| temp_file_manager = partial(tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False) | |||
| resume_size = 0 | |||
| # Download to temporary file, then copy to cache dir once finished. | |||
| # Otherwise you get corrupt cache entries if the download gets interrupted. | |||
| with temp_file_manager() as temp_file: | |||
| logger.info(f"{url} not found in cache or force_download set to True, downloading to {temp_file.name}") | |||
| http_get(url_to_download, temp_file, proxies=proxies, resume_size=resume_size, headers=headers) | |||
| logger.info(f"storing {url} in cache at {cache_path}") | |||
| os.replace(temp_file.name, cache_path) | |||
| # NamedTemporaryFile creates a file with hardwired 0600 perms (ignoring umask), so fixing it. | |||
| umask = os.umask(0o666) | |||
| os.umask(umask) | |||
| os.chmod(cache_path, 0o666 & ~umask) | |||
| logger.info(f"creating metadata file for {cache_path}") | |||
| meta = {"url": url, "etag": etag} | |||
| meta_path = cache_path + ".json" | |||
| with open(meta_path, "w") as meta_file: | |||
| json.dump(meta, meta_file) | |||
| return cache_path | |||
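| # The cache layout above pairs each downloaded file with a sidecar `<file>.json` holding its | |||
| # source url and ETag. A minimal sketch for walking such a cache directory; the helper name | |||
| # `list_cache_entries` is hypothetical and assumes only the layout established above. | |||
| import json | |||
| import os | |||
| def list_cache_entries(cache_dir: str): | |||
|     """Yield (file_path, url, etag) for every completed cache entry in ``cache_dir``.""" | |||
|     for name in os.listdir(cache_dir): | |||
|         if name.endswith(".json"): | |||
|             with open(os.path.join(cache_dir, name)) as meta_file: | |||
|                 meta = json.load(meta_file) | |||
|             yield os.path.join(cache_dir, name[: -len(".json")]), meta["url"], meta.get("etag") | |||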
| def is_torch_fx_available(): | |||
| return _TORCH_GREATER_EQUAL_1_8 and _compare_version("torch", operator.lt, "1.9.0") | |||
| def is_torch_fx_proxy(x): | |||
| if is_torch_fx_available(): | |||
| import torch.fx | |||
| return isinstance(x, torch.fx.Proxy) | |||
| return False | |||
| def is_sentencepiece_available(): | |||
| return importlib.util.find_spec("sentencepiece") is not None | |||
| def is_tokenizers_available(): | |||
| return importlib.util.find_spec("tokenizers") is not None | |||
| def is_tensor(x): | |||
| """ | |||
| Tests if ``x`` is a :obj:`torch.Tensor`, a :obj:`torch.fx.Proxy` or a :obj:`np.ndarray`. | |||
| """ | |||
| if is_torch_fx_proxy(x): | |||
| return True | |||
| if isinstance(x, torch.Tensor): | |||
| return True | |||
| return isinstance(x, np.ndarray) | |||
| def to_py_obj(obj): | |||
| """ | |||
| Recursively convert a PyTorch tensor, Numpy array, dict, list or tuple to plain python objects. | |||
| """ | |||
| if isinstance(obj, (dict, UserDict)): | |||
| return {k: to_py_obj(v) for k, v in obj.items()} | |||
| elif isinstance(obj, (list, tuple)): | |||
| return [to_py_obj(o) for o in obj] | |||
| elif _NEED_IMPORT_TORCH and _is_torch(obj): | |||
| return obj.detach().cpu().tolist() | |||
| elif isinstance(obj, np.ndarray): | |||
| return obj.tolist() | |||
| else: | |||
| return obj | |||
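| # A hedged usage sketch for `to_py_obj`: nested containers are converted recursively and | |||
| # tensors are detached and moved to CPU first (assumes torch and numpy are importable). | |||
| import numpy as np | |||
| import torch | |||
| nested = {"a": torch.tensor([1, 2]), "b": [np.array([3.0]), 4]} | |||
| assert to_py_obj(nested) == {"a": [1, 2], "b": [[3.0], 4]} | |||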
| def _is_numpy(x): | |||
| return isinstance(x, np.ndarray) | |||
| def _is_torch(x): | |||
| import torch | |||
| return isinstance(x, torch.Tensor) | |||
| def _is_torch_device(x): | |||
| import torch | |||
| return isinstance(x, torch.device) | |||
| class ModelOutput(OrderedDict): | |||
| """ | |||
| Base class for all model outputs as dataclass. Has a ``__getitem__`` that allows indexing by integer or slice (like | |||
| a tuple) or strings (like a dictionary) that will ignore the ``None`` attributes. Otherwise behaves like a regular | |||
| python dictionary. | |||
| .. warning:: | |||
| You can't unpack a :obj:`ModelOutput` directly. Use the :meth:`~transformers.file_utils.ModelOutput.to_tuple` | |||
| method to convert it to a tuple first. | |||
| """ | |||
| def __post_init__(self): | |||
| class_fields = fields(self) | |||
| # Safety and consistency checks | |||
| assert len(class_fields), f"{self.__class__.__name__} has no fields." | |||
| assert all( | |||
| field.default is None for field in class_fields[1:] | |||
| ), f"{self.__class__.__name__} should not have more than one required field." | |||
| first_field = getattr(self, class_fields[0].name) | |||
| other_fields_are_none = all(getattr(self, field.name) is None for field in class_fields[1:]) | |||
| if other_fields_are_none and not is_tensor(first_field): | |||
| if isinstance(first_field, dict): | |||
| iterator = first_field.items() | |||
| first_field_iterator = True | |||
| else: | |||
| try: | |||
| iterator = iter(first_field) | |||
| first_field_iterator = True | |||
| except TypeError: | |||
| first_field_iterator = False | |||
| # if we provided an iterator as first field and the iterator is a (key, value) iterator | |||
| # set the associated fields | |||
| if first_field_iterator: | |||
| for element in iterator: | |||
| if ( | |||
| not isinstance(element, (list, tuple)) | |||
| or not len(element) == 2 | |||
| or not isinstance(element[0], str) | |||
| ): | |||
| break | |||
| setattr(self, element[0], element[1]) | |||
| if element[1] is not None: | |||
| self[element[0]] = element[1] | |||
| elif first_field is not None: | |||
| self[class_fields[0].name] = first_field | |||
| else: | |||
| for field in class_fields: | |||
| v = getattr(self, field.name) | |||
| if v is not None: | |||
| self[field.name] = v | |||
| def __delitem__(self, *args, **kwargs): | |||
| raise Exception(f"You cannot use ``__delitem__`` on a {self.__class__.__name__} instance.") | |||
| def setdefault(self, *args, **kwargs): | |||
| raise Exception(f"You cannot use ``setdefault`` on a {self.__class__.__name__} instance.") | |||
| def pop(self, *args, **kwargs): | |||
| raise Exception(f"You cannot use ``pop`` on a {self.__class__.__name__} instance.") | |||
| def update(self, *args, **kwargs): | |||
| raise Exception(f"You cannot use ``update`` on a {self.__class__.__name__} instance.") | |||
| def __getitem__(self, k): | |||
| if isinstance(k, str): | |||
| inner_dict = {k: v for (k, v) in self.items()} | |||
| return inner_dict[k] | |||
| else: | |||
| return self.to_tuple()[k] | |||
| def __setattr__(self, name, value): | |||
| if name in self.keys() and value is not None: | |||
| # Don't call self.__setitem__ to avoid recursion errors | |||
| super().__setitem__(name, value) | |||
| super().__setattr__(name, value) | |||
| def __setitem__(self, key, value): | |||
| # Will raise a KeyError if needed | |||
| super().__setitem__(key, value) | |||
| # Don't call self.__setattr__ to avoid recursion errors | |||
| super().__setattr__(key, value) | |||
| def to_tuple(self) -> Tuple[Any]: | |||
| """ | |||
| Convert self to a tuple containing all the attributes/keys that are not ``None``. | |||
| """ | |||
| return tuple(self[k] for k in self.keys()) | |||
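| # A hedged usage sketch for `ModelOutput`: subclasses are dataclasses indexable by key or by | |||
| # position, and fields left as ``None`` are skipped. `DemoOutput` is hypothetical, for illustration. | |||
| from dataclasses import dataclass | |||
| @dataclass | |||
| class DemoOutput(ModelOutput): | |||
|     loss: object = None | |||
|     logits: object = None | |||
| out = DemoOutput(logits=[0.1, 0.9]) | |||
| assert out["logits"] == [0.1, 0.9]  # dict-style access | |||
| assert out[0] == [0.1, 0.9]  # tuple-style access skips the None loss field | |||
| assert out.to_tuple() == ([0.1, 0.9],) | |||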
| class ExplicitEnum(Enum): | |||
| """ | |||
| Enum with more explicit error message for missing values. | |||
| """ | |||
| @classmethod | |||
| def _missing_(cls, value): | |||
| raise ValueError( | |||
| f"{value} is not a valid {cls.__name__}, please select one of {list(cls._value2member_map_.keys())}" | |||
| ) | |||
| class PaddingStrategy(ExplicitEnum): | |||
| """ | |||
| Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion | |||
| in an IDE. | |||
| """ | |||
| LONGEST = "longest" | |||
| MAX_LENGTH = "max_length" | |||
| DO_NOT_PAD = "do_not_pad" | |||
| class TensorType(ExplicitEnum): | |||
| """ | |||
| Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for | |||
| tab-completion in an IDE. | |||
| """ | |||
| PYTORCH = "pt" | |||
| NUMPY = "np" | |||
| @@ -0,0 +1,393 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2020 The HuggingFace Inc. team | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| from abc import ABC, abstractmethod | |||
| from collections import UserDict | |||
| from typing import Optional, Tuple | |||
| from .file_utils import add_start_docstrings | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| PROCESS_INPUTS_DOCSTRING = r""" | |||
| Args: | |||
| input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): | |||
| Indices of input sequence tokens in the vocabulary. | |||
| Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See | |||
| :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for | |||
| details. | |||
| `What are input IDs? <../glossary.html#input-ids>`__ | |||
| next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`): | |||
| Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses. | |||
| next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): | |||
| :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses. | |||
| next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`): | |||
| Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond. | |||
| pad_token_id (:obj:`int`, `optional`): | |||
| The id of the `padding` token. | |||
| eos_token_id (:obj:`int`, `optional`): | |||
| The id of the `end-of-sequence` token. | |||
| Return: | |||
| :obj:`UserDict`: A dictionary composed of the fields as defined above: | |||
| - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated | |||
| scores of all non-finished beams. | |||
| - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens | |||
| to be added to the non-finished beam_hypotheses. | |||
| - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices | |||
| indicating to which beam the next tokens shall be added. | |||
| """ | |||
| FINALIZE_INPUTS_DOCSTRING = r""" | |||
| Args: | |||
| input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`): | |||
| Indices of input sequence tokens in the vocabulary. | |||
| Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See | |||
| :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for | |||
| details. | |||
| `What are input IDs? <../glossary.html#input-ids>`__ | |||
| final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): | |||
| The final scores of all non-finished beams. | |||
| final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): | |||
| The last tokens to be added to the non-finished beam_hypotheses. | |||
| final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`): | |||
| The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added. | |||
| pad_token_id (:obj:`int`, `optional`): | |||
| The id of the `padding` token. | |||
| eos_token_id (:obj:`int`, `optional`): | |||
| The id of the `end-of-sequence` token. | |||
| Return: | |||
| :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated | |||
| sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all | |||
| batches finished early due to the :obj:`eos_token_id`. | |||
| """ | |||
| class BeamScorer(ABC): | |||
| """ | |||
| Abstract base class for all beam scorers that are used for :meth:`~transformers.PreTrainedModel.beam_search` and | |||
| :meth:`~transformers.PreTrainedModel.beam_sample`. | |||
| """ | |||
| @abstractmethod | |||
| @add_start_docstrings(PROCESS_INPUTS_DOCSTRING) | |||
| def process( | |||
| self, | |||
| input_ids: "torch.LongTensor", | |||
| next_scores: "torch.FloatTensor", | |||
| next_tokens: "torch.LongTensor", | |||
| next_indices: "torch.LongTensor", | |||
| **kwargs | |||
| ) -> Tuple["torch.Tensor"]: | |||
| raise NotImplementedError("This is an abstract method.") | |||
| @abstractmethod | |||
| @add_start_docstrings(FINALIZE_INPUTS_DOCSTRING) | |||
| def finalize( | |||
| self, | |||
| input_ids: "torch.LongTensor", | |||
| next_scores: "torch.FloatTensor", | |||
| next_tokens: "torch.LongTensor", | |||
| next_indices: "torch.LongTensor", | |||
| max_length: int, | |||
| **kwargs | |||
| ) -> "torch.LongTensor": | |||
| raise NotImplementedError("This is an abstract method.") | |||
| class BeamSearchScorer(BeamScorer): | |||
| r""" | |||
| :class:`transformers.BeamScorer` implementing standard beam search decoding. | |||
| Adapted in part from `Facebook's XLM beam search code | |||
| <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__. | |||
| Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation | |||
| <https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua>`__ | |||
| Args: | |||
| batch_size (:obj:`int`): | |||
| Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel. | |||
| max_length (:obj:`int`): | |||
| The maximum length of the sequence to be generated. | |||
| num_beams (:obj:`int`): | |||
| Number of beams for beam search. | |||
| device (:obj:`torch.device`): | |||
| Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of | |||
| :obj:`BeamSearchScorer` will be allocated. | |||
| length_penalty (:obj:`float`, `optional`, defaults to 1.0): | |||
| Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the | |||
| model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer | |||
| sequences. | |||
| do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. | |||
| num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1): | |||
| The number of beam hypotheses that shall be returned upon calling | |||
| :meth:`~transformers.BeamSearchScorer.finalize`. | |||
| num_beam_groups (:obj:`int`): | |||
| Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of | |||
| beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. | |||
| """ | |||
| def __init__( | |||
| self, | |||
| batch_size: int, | |||
| num_beams: int, | |||
| device: "torch.device", | |||
| length_penalty: Optional[float] = 1.0, | |||
| do_early_stopping: Optional[bool] = False, | |||
| num_beam_hyps_to_keep: Optional[int] = 1, | |||
| num_beam_groups: Optional[int] = 1, | |||
| **kwargs, | |||
| ): | |||
| self.num_beams = num_beams | |||
| self.device = device | |||
| self.length_penalty = length_penalty | |||
| self.do_early_stopping = do_early_stopping | |||
| self.num_beam_hyps_to_keep = num_beam_hyps_to_keep | |||
| self.num_beam_groups = num_beam_groups | |||
| self.group_size = self.num_beams // self.num_beam_groups | |||
| self._is_init = False | |||
| self._beam_hyps = [ | |||
| BeamHypotheses( | |||
| num_beams=self.num_beams, | |||
| length_penalty=self.length_penalty, | |||
| early_stopping=self.do_early_stopping, | |||
| ) | |||
| for _ in range(batch_size) | |||
| ] | |||
| self._done = torch.tensor([False for _ in range(batch_size)], dtype=torch.bool, device=self.device) | |||
| if not isinstance(num_beams, int) or num_beams <= 1: | |||
| raise ValueError( | |||
| f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}. For `num_beams` == 1, one should make use of `greedy_search` instead." | |||
| ) | |||
| if not isinstance(num_beam_groups, int) or (num_beam_groups > num_beams) or (num_beams % num_beam_groups != 0): | |||
| raise ValueError( | |||
| f"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and `num_beams` " | |||
| f"has to be divisible by `num_beam_groups`, but is {num_beam_groups} with `num_beams` being {num_beams}." | |||
| ) | |||
| if "max_length" in kwargs: | |||
| logger.warning( | |||
| "Passing `max_length` to BeamSearchScorer is deprecated and has no effect. " | |||
| "`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`, " | |||
| "or `group_beam_search(...)`." | |||
| ) | |||
| @property | |||
| def is_done(self) -> bool: | |||
| return self._done.all() | |||
| def process( | |||
| self, | |||
| input_ids: "torch.LongTensor", | |||
| next_scores: "torch.FloatTensor", | |||
| next_tokens: "torch.LongTensor", | |||
| next_indices: "torch.LongTensor", | |||
| pad_token_id: Optional[int] = None, | |||
| eos_token_id: Optional[int] = None, | |||
| ) -> Tuple["torch.Tensor"]: | |||
| cur_len = input_ids.shape[-1] | |||
| batch_size = len(self._beam_hyps) | |||
| assert batch_size == (input_ids.shape[0] // self.group_size) | |||
| device = input_ids.device | |||
| next_beam_scores = torch.zeros((batch_size, self.group_size), dtype=next_scores.dtype, device=device) | |||
| next_beam_tokens = torch.zeros((batch_size, self.group_size), dtype=next_tokens.dtype, device=device) | |||
| next_beam_indices = torch.zeros((batch_size, self.group_size), dtype=next_indices.dtype, device=device) | |||
| for batch_idx, beam_hyp in enumerate(self._beam_hyps): | |||
| if self._done[batch_idx]: | |||
| assert ( | |||
| len(beam_hyp) >= self.num_beams | |||
| ), f"Batch can only be done if at least {self.num_beams} beams have been generated" | |||
| assert ( | |||
| eos_token_id is not None and pad_token_id is not None | |||
| ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" | |||
| # pad the batch | |||
| next_beam_scores[batch_idx, :] = 0 | |||
| next_beam_tokens[batch_idx, :] = pad_token_id | |||
| next_beam_indices[batch_idx, :] = 0 | |||
| continue | |||
| # next tokens for this sentence | |||
| beam_idx = 0 | |||
| for beam_token_rank, (next_token, next_score, next_index) in enumerate( | |||
| zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx]) | |||
| ): | |||
| batch_beam_idx = batch_idx * self.group_size + next_index | |||
| # add to generated hypotheses if end of sentence | |||
| if (eos_token_id is not None) and (next_token.item() == eos_token_id): | |||
| # if beam_token does not belong to top num_beams tokens, it should not be added | |||
| is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size | |||
| if is_beam_token_worse_than_top_num_beams: | |||
| continue | |||
| beam_hyp.add( | |||
| input_ids[batch_beam_idx].clone(), | |||
| next_score.item(), | |||
| ) | |||
| else: | |||
| # add next predicted token since it is not eos_token | |||
| next_beam_scores[batch_idx, beam_idx] = next_score | |||
| next_beam_tokens[batch_idx, beam_idx] = next_token | |||
| next_beam_indices[batch_idx, beam_idx] = batch_beam_idx | |||
| beam_idx += 1 | |||
| # once the beam for next step is full, don't add more tokens to it. | |||
| if beam_idx == self.group_size: | |||
| break | |||
| if beam_idx < self.group_size: | |||
| raise ValueError( | |||
| f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} are corrected." | |||
| ) | |||
| # Check if we are done so that we can save a pad step if all(done) | |||
| self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done( | |||
| next_scores[batch_idx].max().item(), cur_len | |||
| ) | |||
| return UserDict( | |||
| { | |||
| "next_beam_scores": next_beam_scores.view(-1), | |||
| "next_beam_tokens": next_beam_tokens.view(-1), | |||
| "next_beam_indices": next_beam_indices.view(-1), | |||
| } | |||
| ) | |||
| def finalize( | |||
| self, | |||
| input_ids: "torch.LongTensor", | |||
| final_beam_scores: "torch.FloatTensor", | |||
| final_beam_tokens: "torch.LongTensor", | |||
| final_beam_indices: "torch.LongTensor", | |||
| max_length: int, | |||
| pad_token_id: Optional[int] = None, | |||
| eos_token_id: Optional[int] = None, | |||
| ) -> Tuple["torch.LongTensor"]: | |||
| batch_size = len(self._beam_hyps) | |||
| # finalize all open beam hypotheses and add to generated hypotheses | |||
| for batch_idx, beam_hyp in enumerate(self._beam_hyps): | |||
| if self._done[batch_idx]: | |||
| continue | |||
| # all open beam hypotheses are added to the beam hypothesis | |||
| # beam hypothesis class automatically keeps the best beams | |||
| for beam_id in range(self.num_beams): | |||
| batch_beam_idx = batch_idx * self.num_beams + beam_id | |||
| final_score = final_beam_scores[batch_beam_idx].item() | |||
| final_tokens = input_ids[batch_beam_idx] | |||
| beam_hyp.add(final_tokens, final_score) | |||
| # select the best hypotheses | |||
| sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep) | |||
| best = [] | |||
| best_scores = torch.zeros(batch_size * self.num_beam_hyps_to_keep, device=self.device, dtype=torch.float32) | |||
| # retrieve best hypotheses | |||
| for i, beam_hyp in enumerate(self._beam_hyps): | |||
| sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0]) | |||
| for j in range(self.num_beam_hyps_to_keep): | |||
| best_hyp_tuple = sorted_hyps.pop() | |||
| best_score = best_hyp_tuple[0] | |||
| best_hyp = best_hyp_tuple[1] | |||
| sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp) | |||
| # append to lists | |||
| best.append(best_hyp) | |||
| best_scores[i * self.num_beam_hyps_to_keep + j] = best_score | |||
| # prepare for adding eos | |||
| sent_max_len = min(sent_lengths.max().item() + 1, max_length) | |||
| decoded: "torch.LongTensor" = input_ids.new(batch_size * self.num_beam_hyps_to_keep, sent_max_len) | |||
| # shorter batches are padded if needed | |||
| if sent_lengths.min().item() != sent_lengths.max().item(): | |||
| assert pad_token_id is not None, "`pad_token_id` has to be defined" | |||
| decoded.fill_(pad_token_id) | |||
| # fill with hypotheses and eos_token_id if the latter fits in | |||
| for i, hypo in enumerate(best): | |||
| decoded[i, : sent_lengths[i]] = hypo | |||
| if sent_lengths[i] < max_length: | |||
| decoded[i, sent_lengths[i]] = eos_token_id | |||
| return UserDict( | |||
| { | |||
| "sequences": decoded, | |||
| "sequence_scores": best_scores, | |||
| } | |||
| ) | |||
| class BeamHypotheses: | |||
| def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool): | |||
| """ | |||
| Initialize n-best list of hypotheses. | |||
| """ | |||
| self.length_penalty = length_penalty | |||
| self.early_stopping = early_stopping | |||
| self.num_beams = num_beams | |||
| self.beams = [] | |||
| self.worst_score = 1e9 | |||
| def __len__(self): | |||
| """ | |||
| Number of hypotheses in the list. | |||
| """ | |||
| return len(self.beams) | |||
| def add(self, hyp: "torch.LongTensor", sum_logprobs: float): | |||
| """ | |||
| Add a new hypothesis to the list. | |||
| """ | |||
| score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty) | |||
| if len(self) < self.num_beams or score > self.worst_score: | |||
| self.beams.append((score, hyp)) | |||
| if len(self) > self.num_beams: | |||
| sorted_next_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)]) | |||
| del self.beams[sorted_next_scores[0][1]] | |||
| self.worst_score = sorted_next_scores[1][0] | |||
| else: | |||
| self.worst_score = min(score, self.worst_score) | |||
| def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool: | |||
| """ | |||
| If there are enough hypotheses and none of the hypotheses being generated can become better than the worst | |||
| one in the heap, then we are done with this sentence. | |||
| """ | |||
| if len(self) < self.num_beams: | |||
| return False | |||
| elif self.early_stopping: | |||
| return True | |||
| else: | |||
| cur_score = best_sum_logprobs / cur_len ** self.length_penalty | |||
| ret = self.worst_score >= cur_score | |||
| return ret | |||
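| # A worked example of the bookkeeping above (a hedged sketch, assumes torch): scores are | |||
| # length-normalized as sum_logprobs / len(hyp) ** length_penalty and only the best | |||
| # `num_beams` hypotheses survive. | |||
| import torch | |||
| hyps = BeamHypotheses(num_beams=2, length_penalty=1.0, early_stopping=False) | |||
| hyps.add(torch.tensor([1, 2, 3]), sum_logprobs=-3.0)  # score -1.0 | |||
| hyps.add(torch.tensor([1, 2]), sum_logprobs=-4.0)  # score -2.0 | |||
| hyps.add(torch.tensor([1, 2, 3, 4]), sum_logprobs=-2.0)  # score -0.5 evicts the -2.0 beam | |||
| assert len(hyps) == 2 and hyps.worst_score == -1.0 | |||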
| @@ -0,0 +1,618 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2020 The HuggingFace Inc. team | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| import inspect | |||
| import math | |||
| from abc import ABC | |||
| from typing import Callable, Iterable, List, Optional | |||
| import numpy as np | |||
| from .file_utils import add_start_docstrings | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| LOGITS_PROCESSOR_INPUTS_DOCSTRING = r""" | |||
| Args: | |||
| input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Indices of input sequence tokens in the vocabulary. | |||
| Indices can be obtained using :class:`~transformers.BertTokenizer`. See | |||
| :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for | |||
| details. | |||
| `What are input IDs? <../glossary.html#input-ids>`__ | |||
| scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): | |||
| Prediction scores of a language modeling head. These can be logits for each vocabulary token when not using | |||
| beam search, or log softmax for each vocabulary token when using beam search. | |||
| kwargs: | |||
| Additional logits processor specific kwargs. | |||
| Return: | |||
| :obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores. | |||
| """ | |||
| class LogitsProcessor(ABC): | |||
| """Abstract base class for all logit processors that can be applied during generation.""" | |||
| @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| """Torch method for processing logits.""" | |||
| raise NotImplementedError( | |||
| f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." | |||
| ) | |||
| class LogitsWarper(ABC): | |||
| """Abstract base class for all logit warpers that can be applied during generation with multinomial sampling.""" | |||
| @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| """Torch method for warping logits.""" | |||
| raise NotImplementedError( | |||
| f"{self.__class__} is an abstract class. Only classes inheriting this class can be called." | |||
| ) | |||
| class LogitsProcessorList(list): | |||
| """ | |||
| This class can be used to create a list of :class:`~transformers.LogitsProcessor` or | |||
| :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from | |||
| list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or | |||
| :class:`~transformers.LogitsWarper` to the inputs. | |||
| """ | |||
| @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> "torch.FloatTensor": | |||
| for processor in self: | |||
| function_args = inspect.signature(processor.__call__).parameters | |||
| if len(function_args) > 2: | |||
| assert all( | |||
| arg in kwargs for arg in list(function_args.keys())[2:] | |||
| ), f"Make sure that all the required parameters: {list(function_args.keys())} for {processor.__class__} are passed to the logits processor." | |||
| scores = processor(input_ids, scores, **kwargs) | |||
| else: | |||
| scores = processor(input_ids, scores) | |||
| return scores | |||
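| # A hedged usage sketch of `LogitsProcessorList`: processors compose left-to-right over the | |||
| # next-token scores. `_ShiftDown` is a hypothetical toy processor used purely for illustration. | |||
| import torch | |||
| class _ShiftDown(LogitsProcessor): | |||
|     def __call__(self, input_ids, scores): | |||
|         return scores - 1.0 | |||
| processors = LogitsProcessorList([_ShiftDown(), _ShiftDown()]) | |||
| shifted = processors(torch.tensor([[1, 2]]), torch.zeros(1, 4)) | |||
| assert (shifted == -2.0).all()  # both processors applied, in order | |||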
| class MinLengthLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0. | |||
| Args: | |||
| min_length (:obj:`int`): | |||
| The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`. | |||
| eos_token_id (:obj:`int`): | |||
| The id of the `end-of-sequence` token. | |||
| """ | |||
| def __init__(self, min_length: int, eos_token_id: int): | |||
| if not isinstance(min_length, int) or min_length < 0: | |||
| raise ValueError(f"`min_length` has to be a non-negative integer, but is {min_length}") | |||
| if not isinstance(eos_token_id, int) or eos_token_id < 0: | |||
| raise ValueError(f"`eos_token_id` has to be a non-negative integer, but is {eos_token_id}") | |||
| self.min_length = min_length | |||
| self.eos_token_id = eos_token_id | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| cur_len = input_ids.shape[-1] | |||
| if cur_len < self.min_length: | |||
| scores[:, self.eos_token_id] = -float("inf") | |||
| return scores | |||
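| # A hedged sketch (assumes torch): while the sequence is shorter than `min_length`, the EOS | |||
| # column (id 0 here) is forced to -inf so it cannot be sampled. | |||
| import torch | |||
| proc = MinLengthLogitsProcessor(min_length=5, eos_token_id=0) | |||
| out = proc(torch.tensor([[3, 4, 3]]), torch.zeros(1, 10)) | |||
| assert out[0, 0] == -float("inf") | |||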
| class TemperatureLogitsWarper(LogitsWarper): | |||
| r""" | |||
| :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution). | |||
| Args: | |||
| temperature (:obj:`float`): | |||
| The value used to modulate the logits distribution. | |||
| """ | |||
| def __init__(self, temperature: float): | |||
| if not isinstance(temperature, float) or not (temperature > 0): | |||
| raise ValueError(f"`temperature` has to be a strictly positive float, but is {temperature}") | |||
| self.temperature = temperature | |||
| def __call__(self, input_ids: "torch.Tensor", scores: "torch.Tensor") -> "torch.FloatTensor": | |||
| scores = scores / self.temperature | |||
| return scores | |||
| class RepetitionPenaltyLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences. | |||
| Args: | |||
| repetition_penalty (:obj:`float`): | |||
| The parameter for repetition penalty. 1.0 means no penalty. See `this paper | |||
| <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details. | |||
| """ | |||
| def __init__(self, penalty: float): | |||
| if not isinstance(penalty, float) or not (penalty > 0): | |||
| raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}") | |||
| self.penalty = penalty | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| score = torch.gather(scores, 1, input_ids) | |||
| # if score < 0 then the repetition penalty has to be multiplied to reduce the token's probability further | |||
| score = torch.where(score < 0, score * self.penalty, score / self.penalty) | |||
| scores.scatter_(1, input_ids, score) | |||
| return scores | |||
| class TopPLogitsWarper(LogitsWarper): | |||
| """ | |||
| :class:`transformers.LogitsWarper` that performs top-p (nucleus) filtering, i.e. restricting generation to the | |||
| smallest set of most probable tokens whose cumulative probability reaches :obj:`top_p`. | |||
| Args: | |||
| top_p (:obj:`float`): | |||
| If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are | |||
| kept for generation. | |||
| filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): | |||
| All filtered values will be set to this float value. | |||
| min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): | |||
| Minimum number of tokens that cannot be filtered. | |||
| """ | |||
| def __init__(self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): | |||
| top_p = float(top_p) | |||
| if top_p < 0 or top_p > 1.0: | |||
| raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}") | |||
| self.top_p = top_p | |||
| self.filter_value = filter_value | |||
| self.min_tokens_to_keep = min_tokens_to_keep | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| sorted_logits, sorted_indices = torch.sort(scores, descending=True) | |||
| cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1) | |||
| # Remove tokens whose cumulative probability exceeds the threshold (the shift below keeps the first token over it) | |||
| sorted_indices_to_remove = cumulative_probs > self.top_p | |||
| if self.min_tokens_to_keep > 1: | |||
| # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) | |||
| sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0 | |||
| # Shift the indices to the right to keep also the first token above the threshold | |||
| sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() | |||
| sorted_indices_to_remove[..., 0] = 0 | |||
| # scatter sorted tensors to original indexing | |||
| indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) | |||
| scores = scores.masked_fill(indices_to_remove, self.filter_value) | |||
| return scores | |||
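| # A worked example (hedged sketch, assumes torch): with probabilities [0.5, 0.3, 0.1, 0.1] and | |||
| # top_p=0.7, tokens are kept until the cumulative mass first exceeds 0.7 (0.5 + 0.3 = 0.8, the | |||
| # shift above keeps that first token over the threshold), so the last two are filtered. | |||
| import torch | |||
| warper = TopPLogitsWarper(top_p=0.7) | |||
| filtered = warper(None, torch.log(torch.tensor([[0.5, 0.3, 0.1, 0.1]]))) | |||
| assert filtered[0, 2] == -float("inf") and filtered[0, 3] == -float("inf") | |||
| assert torch.isfinite(filtered[0, :2]).all() | |||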
| class TopKLogitsWarper(LogitsWarper): | |||
| r""" | |||
| :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements. | |||
| Args: | |||
| top_k (:obj:`int`): | |||
| The number of highest probability vocabulary tokens to keep for top-k-filtering. | |||
| filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`): | |||
| All filtered values will be set to this float value. | |||
| min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1): | |||
| Minimum number of tokens that cannot be filtered. | |||
| """ | |||
| def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): | |||
| if not isinstance(top_k, int) or top_k <= 0: | |||
| raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}") | |||
| self.top_k = top_k | |||
| self.filter_value = filter_value | |||
| self.min_tokens_to_keep = min_tokens_to_keep | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check | |||
| # Remove all tokens with a probability less than the last token of the top-k | |||
| indices_to_remove = scores < torch.topk(scores, top_k)[0][..., -1, None] | |||
| scores = scores.masked_fill(indices_to_remove, self.filter_value) | |||
| return scores | |||
| def _get_ngrams(ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int): | |||
| generated_ngrams = [{} for _ in range(num_hypos)] | |||
| for idx in range(num_hypos): | |||
| gen_tokens = prev_input_ids[idx].tolist() | |||
| generated_ngram = generated_ngrams[idx] | |||
| for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]): | |||
| prev_ngram_tuple = tuple(ngram[:-1]) | |||
| generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] | |||
| return generated_ngrams | |||
| def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len): | |||
| # Before decoding the next token, prevent decoding of ngrams that have already appeared | |||
| start_idx = cur_len + 1 - ngram_size | |||
| ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist()) | |||
| return banned_ngrams.get(ngram_idx, []) | |||
| def _calc_banned_ngram_tokens( | |||
| ngram_size: int, prev_input_ids: "torch.Tensor", num_hypos: int, cur_len: int | |||
| ) -> List[Iterable[int]]: | |||
| """Copied from fairseq for no_repeat_ngram in beam_search""" | |||
| if cur_len + 1 < ngram_size: | |||
| # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet | |||
| return [[] for _ in range(num_hypos)] | |||
| generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos) | |||
| banned_tokens = [ | |||
| _get_generated_ngrams(generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len) | |||
| for hypo_idx in range(num_hypos) | |||
| ] | |||
| return banned_tokens | |||
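| # A hedged sketch of the n-gram bookkeeping above (assumes torch): with ngram_size=2 and the | |||
| # prefix [7, 8, 7], the bigram (7, 8) has already occurred, so 8 is banned after the final 7. | |||
| import torch | |||
| banned = _calc_banned_ngram_tokens(2, torch.tensor([[7, 8, 7]]), num_hypos=1, cur_len=3) | |||
| assert banned == [[8]] | |||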
| class NoRepeatNGramLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq | |||
| <https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__. | |||
| Args: | |||
| ngram_size (:obj:`int`): | |||
| All ngrams of size :obj:`ngram_size` can only occur once. | |||
| """ | |||
| def __init__(self, ngram_size: int): | |||
| if not isinstance(ngram_size, int) or ngram_size <= 0: | |||
| raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}") | |||
| self.ngram_size = ngram_size | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| num_batch_hypotheses = scores.shape[0] | |||
| cur_len = input_ids.shape[-1] | |||
| banned_batch_tokens = _calc_banned_ngram_tokens(self.ngram_size, input_ids, num_batch_hypotheses, cur_len) | |||
| for i, banned_tokens in enumerate(banned_batch_tokens): | |||
| scores[i, banned_tokens] = -float("inf") | |||
| return scores | |||
| class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids. | |||
| See `ParlAI <https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350>`__. | |||
| Args: | |||
| encoder_ngram_size (:obj:`int`): | |||
| All ngrams of size :obj:`encoder_ngram_size` that occur in the encoder input ids cannot be repeated in the decoder ids. | |||
| encoder_input_ids (:obj:`int`): | |||
| The encoder_input_ids that should not be repeated within the decoder ids. | |||
| """ | |||
| def __init__(self, encoder_ngram_size: int, encoder_input_ids: "torch.LongTensor"): | |||
| if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0: | |||
| raise ValueError( | |||
| f"`encoder_ngram_size` has to be a strictly positive integer, but is {encoder_ngram_size}" | |||
| ) | |||
| self.ngram_size = encoder_ngram_size | |||
| if len(encoder_input_ids.shape) == 1: | |||
| encoder_input_ids = encoder_input_ids.unsqueeze(0) | |||
| self.batch_size = encoder_input_ids.shape[0] | |||
| self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| # B x num_beams | |||
| num_hypos = scores.shape[0] | |||
| num_beams = num_hypos // self.batch_size | |||
| cur_len = input_ids.shape[-1] | |||
| banned_batch_tokens = [ | |||
| _get_generated_ngrams( | |||
| self.generated_ngrams[hypo_idx // num_beams], input_ids[hypo_idx], self.ngram_size, cur_len | |||
| ) | |||
| for hypo_idx in range(num_hypos) | |||
| ] | |||
| for i, banned_tokens in enumerate(banned_batch_tokens): | |||
| scores[i, banned_tokens] = -float("inf") | |||
| return scores | |||
| class NoBadWordsLogitsProcessor(LogitsProcessor): | |||
| """ | |||
| :class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled. | |||
| Args: | |||
| bad_words_ids (:obj:`List[List[int]]`): | |||
| List of list of token ids that are not allowed to be generated. In order to get the tokens of the words | |||
| that should not appear in the generated text, use :obj:`tokenizer(bad_word, | |||
| add_prefix_space=True).input_ids`. | |||
| eos_token_id (:obj:`int`): | |||
| The id of the `end-of-sequence` token. | |||
| """ | |||
| def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int): | |||
| if not isinstance(bad_words_ids, List) or len(bad_words_ids) == 0: | |||
| raise ValueError(f"`bad_words_ids` has to be a non-emtpy list, but is {bad_words_ids}.") | |||
| if any(not isinstance(bad_word_ids, list) for bad_word_ids in bad_words_ids): | |||
| raise ValueError(f"`bad_words_ids` has to be a list of lists, but is {bad_words_ids}.") | |||
| if any( | |||
| any((not isinstance(token_id, (int, np.integer)) or token_id < 0) for token_id in bad_word_ids) | |||
| for bad_word_ids in bad_words_ids | |||
| ): | |||
| raise ValueError( | |||
| f"Each list in `bad_words_ids` has to be a list of positive integers, but is {bad_words_ids}." | |||
| ) | |||
| bad_words_ids = list(filter(lambda bad_token_seq: bad_token_seq != [eos_token_id], bad_words_ids)) | |||
| self.bad_words_id_length_1 = [] | |||
| self.bad_words_id_length_greater_than_1 = [] | |||
| for word in bad_words_ids: | |||
| if len(word) == 1: | |||
| self.bad_words_id_length_1.append(word[0]) | |||
| else: | |||
| self.bad_words_id_length_greater_than_1.append(word) | |||
| self.static_bad_words_mask: Optional[torch.LongTensor] = None | |||
| for banned_token_seq in self.bad_words_id_length_greater_than_1: | |||
| assert len(banned_token_seq) > 0, f"Banned words token sequences {bad_words_ids} cannot have an empty list" | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| if self.static_bad_words_mask is None and len(self.bad_words_id_length_1) > 0: | |||
| self.static_bad_words_mask = self._calc_static_bad_word_mask(scores) | |||
| dynamic_banned_tokens = self._calc_banned_bad_words_ids(input_ids.tolist()) | |||
| scores = self._set_scores_to_inf_for_banned_tokens(scores, dynamic_banned_tokens) | |||
| return scores | |||
| def _calc_static_bad_word_mask(self, scores: "torch.FloatTensor") -> "torch.BoolTensor": | |||
| static_bad_words_mask = torch.zeros(scores.shape[1]) | |||
| static_bad_words_mask[self.bad_words_id_length_1] = 1 | |||
| return static_bad_words_mask.unsqueeze(0).to(scores.device).bool() | |||
| def _tokens_match(self, prev_tokens: List[int], tokens: List[int]) -> bool: | |||
| if len(tokens) == 0: | |||
| # if bad word tokens is just one token always ban it | |||
| return True | |||
| elif len(tokens) > len(prev_tokens): | |||
| # if bad word tokens are longer than prev input_ids they can't be equal | |||
| return False | |||
| else: | |||
| return prev_tokens[-len(tokens) :] == tokens | |||
| def _calc_banned_bad_words_ids(self, prev_input_ids: List[List[int]]) -> Iterable[int]: | |||
| banned_tokens = [] | |||
| for prev_input_ids_slice in prev_input_ids: | |||
| banned_tokens_slice = [] | |||
| for banned_token_seq in self.bad_words_id_length_greater_than_1: | |||
| if self._tokens_match(prev_input_ids_slice, banned_token_seq[:-1]): | |||
| banned_tokens_slice.append(banned_token_seq[-1]) | |||
| banned_tokens.append(banned_tokens_slice) | |||
| return banned_tokens | |||
| def _set_scores_to_inf_for_banned_tokens( | |||
| self, scores: "torch.Tensor", banned_tokens: List[List[int]] | |||
| ) -> "torch.Tensor": | |||
| """ | |||
| Modifies the scores in place by setting the banned token positions to `-inf`. ``banned_tokens`` is expected | |||
| to be a list with one entry per batch item, each itself a list of token ids to ban for that item. | |||
| Args: | |||
| scores: logits distribution of shape (batch size, vocabulary size) | |||
| banned_tokens: list of list of tokens to ban of length (batch_size) | |||
| """ | |||
| banned_mask_list = [] | |||
| for idx, batch_banned_tokens in enumerate(banned_tokens): | |||
| for token in batch_banned_tokens: | |||
| # Eliminates invalid bad word IDs that fall outside the vocabulary (valid indices are 0 .. vocab_size - 1). | |||
| if token < scores.shape[1]: | |||
| banned_mask_list.append([idx, token]) | |||
| else: | |||
| logger.error( | |||
| f"An invalid bad word ID is defined: {token}. This ID is not contained in the" | |||
| f"vocabulary, and is therefore ignored." | |||
| ) | |||
| if not banned_mask_list and self.static_bad_words_mask is None: | |||
| return scores | |||
| else: | |||
| if banned_mask_list: | |||
| banned_mask = torch.LongTensor(banned_mask_list) | |||
| indices = torch.ones(len(banned_mask)) | |||
| # A sparse tensor is generated from a list of coordinates: [[0, 1], [0, 2], [2, 0]]. A conversion to dense tensor generates: | |||
| # [ 0 1 1 ] | |||
| # [ 0 0 0 ] | |||
| # [ 1 0 0 ] | |||
| banned_mask = ( | |||
| torch.sparse.LongTensor(banned_mask.t(), indices, scores.size()) | |||
| .to(scores.device) | |||
| .to_dense() | |||
| .bool() | |||
| ) | |||
| if self.static_bad_words_mask is not None: | |||
| banned_mask = torch.bitwise_or(banned_mask, self.static_bad_words_mask) | |||
| else: | |||
| banned_mask = self.static_bad_words_mask | |||
| scores = scores.masked_fill(banned_mask, -float("inf")) | |||
| return scores | |||
| class PrefixConstrainedLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned | |||
| constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more | |||
| information. | |||
| Args: | |||
| prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`): | |||
| This function constrains the beam search to allowed tokens only at each step. This function takes 2 | |||
| arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed | |||
| tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and | |||
| the batch ID :obj:`batch_id`. | |||
| """ | |||
| def __init__(self, prefix_allowed_tokens_fn: Callable[[int, "torch.Tensor"], List[int]], num_beams: int): | |||
| self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn | |||
| self._num_beams = num_beams | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| mask = torch.full_like(scores, -math.inf) | |||
| for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])): | |||
| for beam_id, sent in enumerate(beam_sent): | |||
| mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0 | |||
| return scores + mask | |||
| class HammingDiversityLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only | |||
| effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse | |||
| Solutions from Neural Sequence Models <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. | |||
| Args: | |||
| diversity_penalty (:obj:`float`): | |||
| This value is subtracted from a beam's score if it generates the same token as any beam from another group at | |||
| a particular time step. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled. | |||
| num_beams (:obj:`int`): | |||
| Number of beams used for group beam search. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for | |||
| more details. | |||
| num_beam_groups (:obj:`int`): | |||
| Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of | |||
| beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details. | |||
| """ | |||
| def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int): | |||
| if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0): | |||
| raise ValueError("`diversity_penalty` should be a float strictly larger than 0.") | |||
| self._diversity_penalty = diversity_penalty | |||
| if not isinstance(num_beams, int) or num_beams < 2: | |||
| raise ValueError("`num_beams` should be an integer strictly larger than 1.") | |||
| self._num_beams = num_beams | |||
| if not isinstance(num_beam_groups, int) or num_beam_groups < 2: | |||
| raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.") | |||
| if num_beam_groups > num_beams: | |||
| raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.") | |||
| self._num_sub_beams = num_beams // num_beam_groups | |||
| def __call__( | |||
| self, | |||
| input_ids: "torch.LongTensor", | |||
| scores: "torch.FloatTensor", | |||
| current_tokens: "torch.LongTensor", | |||
| beam_group_idx: int, | |||
| ) -> "torch.FloatTensor": | |||
| # hamming diversity: penalise using same token in current group which was used in previous groups at | |||
| # the same time step | |||
| batch_size = current_tokens.shape[0] // self._num_beams | |||
| group_start_idx = beam_group_idx * self._num_sub_beams | |||
| group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams) | |||
| group_size = group_end_idx - group_start_idx | |||
| vocab_size = scores.shape[-1] | |||
| if group_start_idx == 0: | |||
| return scores | |||
| for batch_idx in range(batch_size): | |||
| # predicted tokens of last time step of previous groups | |||
| previous_group_tokens = current_tokens[ | |||
| batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx | |||
| ] | |||
| token_frequency = torch.bincount(previous_group_tokens, minlength=vocab_size).to(scores.device) | |||
| scores[batch_idx * group_size : (batch_idx + 1) * group_size] -= self._diversity_penalty * token_frequency | |||
| return scores | |||
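| # A hedged sketch of the diversity penalty (assumes torch): tokens already chosen by earlier | |||
| # beam groups at this step are down-weighted for the current group by diversity_penalty * frequency. | |||
| import torch | |||
| proc = HammingDiversityLogitsProcessor(diversity_penalty=1.5, num_beams=4, num_beam_groups=2) | |||
| group_scores = torch.zeros(2, 6)  # the current group has 2 beams, vocab_size=6 | |||
| current = torch.tensor([5, 5, 0, 0])  # the first group picked token 5 on both of its beams | |||
| out = proc(None, group_scores, current_tokens=current, beam_group_idx=1) | |||
| assert out[0, 5] == -3.0  # 1.5 * frequency of 2 | |||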
| class ForcedBOSTokenLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token. | |||
| Args: | |||
| bos_token_id (:obj:`int`): | |||
| The id of the token to force as the first generated token. | |||
| """ | |||
| def __init__(self, bos_token_id: int): | |||
| self.bos_token_id = bos_token_id | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| cur_len = input_ids.shape[-1] | |||
| if cur_len == 1: | |||
| num_tokens = scores.shape[1] | |||
| scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf") | |||
| scores[:, self.bos_token_id] = 0 | |||
| return scores | |||
| class ForcedEOSTokenLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when | |||
| :obj:`max_length` is reached. | |||
| Args: | |||
| max_length (:obj:`int`): | |||
| The maximum length of the sequence to be generated. | |||
| eos_token_id (:obj:`int`): | |||
| The id of the token to force as the last generated token when :obj:`max_length` is reached. | |||
| """ | |||
| def __init__(self, max_length: int, eos_token_id: int): | |||
| self.max_length = max_length | |||
| self.eos_token_id = eos_token_id | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| cur_len = input_ids.shape[-1] | |||
| if cur_len == self.max_length - 1: | |||
| num_tokens = scores.shape[1] | |||
| scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf") | |||
| scores[:, self.eos_token_id] = 0 | |||
| return scores | |||
| class InfNanRemoveLogitsProcessor(LogitsProcessor): | |||
| r""" | |||
| :class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to prevent the | |||
| generation method from failing. Note that this logits processor should only be used if necessary since it | |||
| can slow down generation. | |||
| """ | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor") -> "torch.FloatTensor": | |||
| # set all nan values to 0.0 | |||
| scores[scores != scores] = 0.0 | |||
| # set all inf values to max possible value | |||
| scores[scores == float("inf")] = torch.finfo(scores.dtype).max | |||
| return scores | |||
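| # A hedged sketch (assumes torch): NaN scores become 0.0 and +inf is clamped to the dtype max. | |||
| import torch | |||
| cleaned = InfNanRemoveLogitsProcessor()(None, torch.tensor([[float("nan"), float("inf"), 1.0]])) | |||
| assert cleaned[0, 0] == 0.0 and cleaned[0, 1] == torch.finfo(cleaned.dtype).max | |||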
| @@ -0,0 +1,128 @@ | |||
| import time | |||
| from abc import ABC | |||
| from copy import deepcopy | |||
| from typing import Optional | |||
| from .file_utils import add_start_docstrings | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| from fastNLP.core.log import logger | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| STOPPING_CRITERIA_INPUTS_DOCSTRING = r""" | |||
| Args: | |||
| input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Indices of input sequence tokens in the vocabulary. | |||
| Indices can be obtained using :class:`~transformers.BertTokenizer`. See | |||
| :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for | |||
| details. | |||
| `What are input IDs? <../glossary.html#input-ids>`__ | |||
| scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`): | |||
| Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax | |||
| or scores for each vocabulary token after SoftMax. | |||
| kwargs: | |||
| Additional stopping criteria specific kwargs. | |||
| Return: | |||
| :obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop. | |||
| """ | |||
| class StoppingCriteria(ABC): | |||
| """Abstract base class for all stopping criteria that can be applied during generation.""" | |||
| @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: | |||
| raise NotImplementedError("StoppingCriteria needs to be subclassed") | |||
| class MaxLengthCriteria(StoppingCriteria): | |||
| """ | |||
| This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`. | |||
| Keep in mind that for decoder-only transformers this count includes the initial prompt tokens. | |||
| Args: | |||
| max_length (:obj:`int`): | |||
| The maximum length that the output sequence can have in number of tokens. | |||
| """ | |||
| def __init__(self, max_length: int): | |||
| self.max_length = max_length | |||
| @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: | |||
| return input_ids.shape[-1] >= self.max_length | |||
| class MaxNewTokensCriteria(StoppingCriteria): | |||
| """ | |||
| This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`. | |||
| Keep in mind that for decoder-only transformers this count does **not** include the initial prompt tokens. This is | |||
| very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens. | |||
| Args: | |||
| start_length (:obj:`int`): | |||
| The number of initial tokens. | |||
| max_new_tokens (:obj:`int`): | |||
| The maximum number of tokens to generate. | |||
| """ | |||
| def __init__(self, start_length: int, max_new_tokens: int): | |||
| self.start_length = start_length | |||
| self.max_new_tokens = max_new_tokens | |||
| self.max_length = start_length + max_new_tokens | |||
| @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: | |||
| return input_ids.shape[-1] >= self.max_length | |||
| class MaxTimeCriteria(StoppingCriteria): | |||
| """ | |||
| This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the | |||
| time starts being counted when this object is initialized. You can override this by passing an | |||
| :obj:`initial_timestamp`. | |||
| Args: | |||
| max_time (:obj:`float`): | |||
| The maximum allowed time in seconds for the generation. | |||
| initial_timestamp (:obj:`float`, `optional`, defaults to :obj:`time.time()`): | |||
| The start of the generation allowed time. | |||
| """ | |||
| def __init__(self, max_time: float, initial_timestamp: Optional[float] = None): | |||
| self.max_time = max_time | |||
| self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp | |||
| @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: | |||
| return time.time() - self.initial_timestamp > self.max_time | |||
| class StoppingCriteriaList(list): | |||
| @add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING) | |||
| def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool: | |||
| return any(criteria(input_ids, scores) for criteria in self) | |||
| @property | |||
| def max_length(self) -> Optional[int]: | |||
| for stopping_criterium in self: | |||
| if isinstance(stopping_criterium, MaxLengthCriteria): | |||
| return stopping_criterium.max_length | |||
| elif isinstance(stopping_criterium, MaxNewTokensCriteria): | |||
| return stopping_criterium.max_length | |||
| return None | |||
| def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int) -> StoppingCriteriaList: | |||
| stopping_max_length = stopping_criteria.max_length | |||
| new_stopping_criteria = deepcopy(stopping_criteria) | |||
| if stopping_max_length is not None and stopping_max_length != max_length: | |||
| logger.warning("You set different `max_length` for stopping criteria and `max_length` parameter") | |||
| elif stopping_max_length is None: | |||
| new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length)) | |||
| return new_stopping_criteria | |||
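| # A minimal usage sketch of the classes above: any single criterion returning | |||
| # True stops generation. `_example_stopping_criteria` is a hypothetical helper | |||
| # for illustration only; it assumes torch is installed. | |||
| def _example_stopping_criteria(): | |||
|     import torch  # assumption: torch is available | |||
|     criteria = StoppingCriteriaList([ | |||
|         MaxLengthCriteria(max_length=20), | |||
|         MaxTimeCriteria(max_time=5.0), | |||
|     ]) | |||
|     input_ids = torch.zeros((1, 20), dtype=torch.long) | |||
|     # length 20 >= max_length 20, so the list reports "stop" | |||
|     assert criteria(input_ids, scores=None) | |||
|     # validate_stopping_criteria returns an unchanged copy here: lengths agree | |||
|     criteria = validate_stopping_criteria(criteria, max_length=20) | |||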
| @@ -0,0 +1,816 @@ | |||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| from dataclasses import dataclass | |||
| from typing import Optional, Tuple | |||
| from .file_utils import ModelOutput | |||
| from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
| if _NEED_IMPORT_TORCH: | |||
| import torch | |||
| @dataclass | |||
| class BaseModelOutput(ModelOutput): | |||
| """ | |||
| Base class for model's outputs, with potential hidden states and attentions. | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
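| # A minimal sketch, assuming the ported ModelOutput keeps HuggingFace's access | |||
| # semantics: fields reachable by attribute, by key, and by position, with None | |||
| # fields dropped from to_tuple(). `_example_base_model_output` is a | |||
| # hypothetical helper for illustration only. | |||
| def _example_base_model_output(): | |||
|     import torch  # assumption: torch is available | |||
|     out = BaseModelOutput(last_hidden_state=torch.randn(2, 5, 8)) | |||
|     assert out.last_hidden_state is out["last_hidden_state"] is out[0] | |||
|     assert len(out.to_tuple()) == 1  # hidden_states/attentions were not set | |||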
| @dataclass | |||
| class BaseModelOutputWithPooling(ModelOutput): | |||
| """ | |||
| Base class for model's outputs that also contains a pooling of the last hidden states. | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): | |||
| Last layer hidden-state of the first token of the sequence (classification token) after further processing | |||
| through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns | |||
| the classification token after processing through a linear layer and a tanh activation function. The linear | |||
| layer weights are trained from the next sentence prediction (classification) objective during pretraining. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| pooler_output: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class BaseModelOutputWithPast(ModelOutput): | |||
| """ | |||
| Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, | |||
| 1, hidden_size)` is output. | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if | |||
| ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, | |||
| encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if | |||
| ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class BaseModelOutputWithCrossAttentions(ModelOutput): | |||
| """ | |||
| Base class for model's outputs, with potential hidden states and attentions. | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): | |||
| """ | |||
| Base class for model's outputs that also contains a pooling of the last hidden states. | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| pooler_output (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, hidden_size)`): | |||
| Last layer hidden-state of the first token of the sequence (classification token) after further processing | |||
| through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns | |||
| the classification token after processing through a linear layer and a tanh activation function. The linear | |||
| layer weights are trained from the next sentence prediction (classification) objective during pretraining. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if | |||
| ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, | |||
| encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if | |||
| ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| pooler_output: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): | |||
| """ | |||
| Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the model. | |||
| If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, | |||
| 1, hidden_size)` is output. | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and optionally if | |||
| ``config.is_encoder_decoder=True`` 2 additional tensors of shape :obj:`(batch_size, num_heads, | |||
| encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if | |||
| ``config.is_encoder_decoder=True`` in the cross-attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` and ``config.add_cross_attention=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class Seq2SeqModelOutput(ModelOutput): | |||
| """ | |||
| Base class for model encoder outputs that also contain pre-computed hidden states that can speed up sequential | |||
| decoding. | |||
| Args: | |||
| last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): | |||
| Sequence of hidden-states at the output of the last layer of the decoder of the model. | |||
| If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size, | |||
| 1, hidden_size)` is output. | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of | |||
| shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention | |||
| blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. | |||
| decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. | |||
| decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): | |||
| Sequence of hidden-states at the output of the last layer of the encoder of the model. | |||
| encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. | |||
| encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| """ | |||
| last_hidden_state: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_last_hidden_state: Optional["torch.FloatTensor"] = None | |||
| encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class CausalLMOutput(ModelOutput): | |||
| """ | |||
| Base class for causal language model (or autoregressive) outputs. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Language modeling loss (for next-token prediction). | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): | |||
| Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class CausalLMOutputWithPast(ModelOutput): | |||
| """ | |||
| Base class for causal language model (or autoregressive) outputs. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Language modeling loss (for next-token prediction). | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): | |||
| Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
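| # A minimal sketch of the cache contract documented above, using dummy tensors | |||
| # in place of a real model: each layer caches a (key, value) pair of shape | |||
| # (batch_size, num_heads, sequence_length, embed_size_per_head), which lets the | |||
| # next forward pass feed only the newest token. `_example_past_key_values` is a | |||
| # hypothetical helper for illustration only. | |||
| def _example_past_key_values(n_layers=2, bsz=1, heads=4, seq=7, head_dim=16): | |||
|     import torch  # assumption: torch is available | |||
|     past = tuple( | |||
|         (torch.zeros(bsz, heads, seq, head_dim), torch.zeros(bsz, heads, seq, head_dim)) | |||
|         for _ in range(n_layers) | |||
|     ) | |||
|     out = CausalLMOutputWithPast(logits=torch.randn(bsz, 1, 100), past_key_values=past) | |||
|     assert out.past_key_values[0][0].shape == (bsz, heads, seq, head_dim) | |||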
| @dataclass | |||
| class CausalLMOutputWithCrossAttentions(ModelOutput): | |||
| """ | |||
| Base class for causal language model (or autoregressive) outputs. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Language modeling loss (for next-token prediction). | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): | |||
| Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Cross-attention weights after the attention softmax, used to compute the weighted average in the | |||
| cross-attention heads. | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`torch.FloatTensor` tuples of length :obj:`config.n_layers`, with each tuple containing the | |||
| cached key, value states of the self-attention and the cross-attention layers if the model is used in an | |||
| encoder-decoder setting. Only relevant if ``config.is_decoder = True``. | |||
| Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class SequenceClassifierOutputWithPast(ModelOutput): | |||
| """ | |||
| Base class for outputs of sentence classification models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Classification (or regression if config.num_labels==1) loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): | |||
| Classification (or regression if config.num_labels==1) scores (before SoftMax). | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see | |||
| :obj:`past_key_values` input) to speed up sequential decoding. | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class MaskedLMOutput(ModelOutput): | |||
| """ | |||
| Base class for masked language model outputs. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Masked language modeling (MLM) loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): | |||
| Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class Seq2SeqLMOutput(ModelOutput): | |||
| """ | |||
| Base class for sequence-to-sequence language model outputs. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Language modeling loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): | |||
| Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of | |||
| shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention | |||
| blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. | |||
| decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. | |||
| decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): | |||
| Sequence of hidden-states at the output of the last layer of the encoder of the model. | |||
| encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. | |||
| encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_last_hidden_state: Optional["torch.FloatTensor"] = None | |||
| encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class NextSentencePredictorOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of models predicting if two sentences are consecutive or not. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided): | |||
| Next sequence prediction (classification) loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): | |||
| Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation | |||
| before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class SequenceClassifierOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of sentence classification models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Classification (or regression if config.num_labels==1) loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): | |||
| Classification (or regression if config.num_labels==1) scores (before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
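| # A minimal sketch: the logits above are pre-softmax scores, so consumers | |||
| # derive probabilities and predicted labels explicitly. | |||
| # `_example_sequence_classifier_output` is a hypothetical helper for | |||
| # illustration only. | |||
| def _example_sequence_classifier_output(): | |||
|     import torch  # assumption: torch is available | |||
|     out = SequenceClassifierOutput(logits=torch.tensor([[1.0, 3.0], [2.0, 0.5]])) | |||
|     probs = torch.softmax(out.logits, dim=-1)  # shape (batch_size, num_labels) | |||
|     preds = out.logits.argmax(dim=-1)          # predicted label ids | |||
|     assert preds.tolist() == [1, 0] | |||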
| @dataclass | |||
| class Seq2SeqSequenceClassifierOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of sequence-to-sequence sentence classification models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): | |||
| Classification (or regression if config.num_labels==1) loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): | |||
| Classification (or regression if config.num_labels==1) scores (before SoftMax). | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of | |||
| shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention | |||
| blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. | |||
| decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. | |||
| decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): | |||
| Sequence of hidden-states at the output of the last layer of the encoder of the model. | |||
| encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. | |||
| encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_last_hidden_state: Optional["torch.FloatTensor"] = None | |||
| encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class MultipleChoiceModelOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of multiple choice models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Classification loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): | |||
| :obj:`num_choices` is the second dimension of the input tensors (see :obj:`input_ids` above). | |||
| Classification scores (before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class TokenClassifierOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of token classification models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Classification loss. | |||
| logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): | |||
| Classification scores (before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @dataclass | |||
| class QuestionAnsweringModelOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of question answering models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Total span extraction loss is the sum of the cross-entropy losses for the start and end positions. | |||
| start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Span-start scores (before SoftMax). | |||
| end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Span-end scores (before SoftMax). | |||
| hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the model at the output of each layer plus the initial embedding outputs. | |||
| attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights after the attention softmax, used to compute the weighted average in the self-attention | |||
| heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| start_logits: "torch.FloatTensor" = None | |||
| end_logits: "torch.FloatTensor" = None | |||
| hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
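| # A minimal sketch: decoding an answer span from the start/end logits above | |||
| # via greedy argmax, the usual consumer of this output type. | |||
| # `_example_question_answering_output` is a hypothetical helper for | |||
| # illustration only. | |||
| def _example_question_answering_output(): | |||
|     import torch  # assumption: torch is available | |||
|     out = QuestionAnsweringModelOutput( | |||
|         start_logits=torch.tensor([[0.1, 2.0, 0.3, 0.2]]), | |||
|         end_logits=torch.tensor([[0.0, 0.1, 1.5, 0.4]]), | |||
|     ) | |||
|     start = out.start_logits.argmax(dim=-1).item()  # token index 1 | |||
|     end = out.end_logits.argmax(dim=-1).item()      # token index 2 | |||
|     assert (start, end) == (1, 2)  # answer spans tokens 1..2 inclusive | |||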
| @dataclass | |||
| class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): | |||
| """ | |||
| Base class for outputs of sequence-to-sequence question answering models. | |||
| Args: | |||
| loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): | |||
| Total span extraction loss is the sum of the cross-entropy losses for the start and end positions. | |||
| start_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Span-start scores (before SoftMax). | |||
| end_logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`): | |||
| Span-end scores (before SoftMax). | |||
| past_key_values (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``): | |||
| Tuple of :obj:`tuple(torch.FloatTensor)` of length :obj:`config.n_layers`, with each tuple having 2 tensors | |||
| of shape :obj:`(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of | |||
| shape :obj:`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. | |||
| Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention | |||
| blocks) that can be used (see :obj:`past_key_values` input) to speed up sequential decoding. | |||
| decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. | |||
| decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the decoder's cross-attention layer, after the attention softmax, used to compute the | |||
| weighted average in the cross-attention heads. | |||
| encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): | |||
| Sequence of hidden-states at the output of the last layer of the encoder of the model. | |||
| encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) | |||
| of shape :obj:`(batch_size, sequence_length, hidden_size)`. | |||
| Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. | |||
| encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): | |||
| Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, | |||
| sequence_length, sequence_length)`. | |||
| Attention weights of the encoder, after the attention softmax, used to compute the weighted average in the | |||
| self-attention heads. | |||
| """ | |||
| loss: Optional["torch.FloatTensor"] = None | |||
| start_logits: "torch.FloatTensor" = None | |||
| end_logits: "torch.FloatTensor" = None | |||
| past_key_values: Optional[Tuple[Tuple["torch.FloatTensor"]]] = None | |||
| decoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| decoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| cross_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_last_hidden_state: Optional["torch.FloatTensor"] = None | |||
| encoder_hidden_states: Optional[Tuple["torch.FloatTensor"]] = None | |||
| encoder_attentions: Optional[Tuple["torch.FloatTensor"]] = None | |||
| @@ -0,0 +1,5 @@ | |||
| from .bart import * | |||
| from .bert import * | |||
| from .cpt import * | |||
| from .gpt2 import * | |||
| from .roberta import * | |||
| @@ -0,0 +1,541 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ Auto Config class. """ | |||
| import importlib | |||
| import re | |||
| from collections import OrderedDict | |||
| from typing import List, Union | |||
| from fastNLP.transformers.torch.configuration_utils import PretrainedConfig | |||
| from fastNLP.transformers.torch.file_utils import CONFIG_NAME | |||
| from fastNLP.core.log import logger | |||
| CONFIG_MAPPING_NAMES = OrderedDict( | |||
| [ | |||
| # Add configs here | |||
| ("fnet", "FNetConfig"), | |||
| ("gptj", "GPTJConfig"), | |||
| ("layoutlmv2", "LayoutLMv2Config"), | |||
| ("beit", "BeitConfig"), | |||
| ("rembert", "RemBertConfig"), | |||
| ("visual_bert", "VisualBertConfig"), | |||
| ("canine", "CanineConfig"), | |||
| ("roformer", "RoFormerConfig"), | |||
| ("clip", "CLIPConfig"), | |||
| ("bigbird_pegasus", "BigBirdPegasusConfig"), | |||
| ("deit", "DeiTConfig"), | |||
| ("luke", "LukeConfig"), | |||
| ("detr", "DetrConfig"), | |||
| ("gpt_neo", "GPTNeoConfig"), | |||
| ("big_bird", "BigBirdConfig"), | |||
| ("speech_to_text_2", "Speech2Text2Config"), | |||
| ("speech_to_text", "Speech2TextConfig"), | |||
| ("vit", "ViTConfig"), | |||
| ("wav2vec2", "Wav2Vec2Config"), | |||
| ("m2m_100", "M2M100Config"), | |||
| ("convbert", "ConvBertConfig"), | |||
| ("led", "LEDConfig"), | |||
| ("blenderbot-small", "BlenderbotSmallConfig"), | |||
| ("retribert", "RetriBertConfig"), | |||
| ("ibert", "IBertConfig"), | |||
| ("mt5", "MT5Config"), | |||
| ("t5", "T5Config"), | |||
| ("mobilebert", "MobileBertConfig"), | |||
| ("distilbert", "DistilBertConfig"), | |||
| ("albert", "AlbertConfig"), | |||
| ("bert-generation", "BertGenerationConfig"), | |||
| ("camembert", "CamembertConfig"), | |||
| ("xlm-roberta", "XLMRobertaConfig"), | |||
| ("pegasus", "PegasusConfig"), | |||
| ("marian", "MarianConfig"), | |||
| ("mbart", "MBartConfig"), | |||
| ("megatron-bert", "MegatronBertConfig"), | |||
| ("mpnet", "MPNetConfig"), | |||
| ("bart", "BartConfig"), | |||
| ("blenderbot", "BlenderbotConfig"), | |||
| ("reformer", "ReformerConfig"), | |||
| ("longformer", "LongformerConfig"), | |||
| ("roberta", "RobertaConfig"), | |||
| ("deberta-v2", "DebertaV2Config"), | |||
| ("deberta", "DebertaConfig"), | |||
| ("flaubert", "FlaubertConfig"), | |||
| ("fsmt", "FSMTConfig"), | |||
| ("squeezebert", "SqueezeBertConfig"), | |||
| ("hubert", "HubertConfig"), | |||
| ("bert", "BertConfig"), | |||
| ("openai-gpt", "OpenAIGPTConfig"), | |||
| ("gpt2", "GPT2Config"), | |||
| ("transfo-xl", "TransfoXLConfig"), | |||
| ("xlnet", "XLNetConfig"), | |||
| ("xlm-prophetnet", "XLMProphetNetConfig"), | |||
| ("prophetnet", "ProphetNetConfig"), | |||
| ("xlm", "XLMConfig"), | |||
| ("ctrl", "CTRLConfig"), | |||
| ("electra", "ElectraConfig"), | |||
| ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"), | |||
| ("encoder-decoder", "EncoderDecoderConfig"), | |||
| ("funnel", "FunnelConfig"), | |||
| ("lxmert", "LxmertConfig"), | |||
| ("dpr", "DPRConfig"), | |||
| ("layoutlm", "LayoutLMConfig"), | |||
| ("rag", "RagConfig"), | |||
| ("tapas", "TapasConfig"), | |||
| ("splinter", "SplinterConfig"), | |||
| ] | |||
| ) | |||
| CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict( | |||
| [ | |||
| # Add archive maps here | |||
| ("fnet", "FNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("gptj", "GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("roformer", "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("clip", "CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("bigbird_pegasus", "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("deit", "DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("detr", "DETR_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("gpt_neo", "GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("big_bird", "BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("megatron-bert", "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("speech_to_text", "SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("speech_to_text_2", "SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("wav2vec2", "WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("m2m_100", "M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("convbert", "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("led", "LED_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("blenderbot-small", "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("bert", "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("bart", "BART_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("blenderbot", "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("mbart", "MBART_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("openai-gpt", "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("ctrl", "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("xlnet", "XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("xlm", "XLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("roberta", "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("distilbert", "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("albert", "ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("camembert", "CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("t5", "T5_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("xlm-roberta", "XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("flaubert", "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("electra", "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("retribert", "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("funnel", "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("lxmert", "LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("layoutlm", "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("dpr", "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("deberta", "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("deberta-v2", "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("squeezebert", "SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("xlm-prophetnet", "XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("prophetnet", "PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("mpnet", "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("tapas", "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("ibert", "IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("hubert", "HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ("splinter", "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP"), | |||
| ] | |||
| ) | |||
| MODEL_NAMES_MAPPING = OrderedDict( | |||
| [ | |||
| # Add full (and cased) model names here | |||
| ("fnet", "FNet"), | |||
| ("gptj", "GPT-J"), | |||
| ("beit", "BeiT"), | |||
| ("rembert", "RemBERT"), | |||
| ("layoutlmv2", "LayoutLMv2"), | |||
| ("visual_bert", "VisualBert"), | |||
| ("canine", "Canine"), | |||
| ("roformer", "RoFormer"), | |||
| ("clip", "CLIP"), | |||
| ("bigbird_pegasus", "BigBirdPegasus"), | |||
| ("deit", "DeiT"), | |||
| ("luke", "LUKE"), | |||
| ("detr", "DETR"), | |||
| ("gpt_neo", "GPT Neo"), | |||
| ("big_bird", "BigBird"), | |||
| ("speech_to_text_2", "Speech2Text2"), | |||
| ("speech_to_text", "Speech2Text"), | |||
| ("vit", "ViT"), | |||
| ("wav2vec2", "Wav2Vec2"), | |||
| ("m2m_100", "M2M100"), | |||
| ("convbert", "ConvBERT"), | |||
| ("led", "LED"), | |||
| ("blenderbot-small", "BlenderbotSmall"), | |||
| ("retribert", "RetriBERT"), | |||
| ("ibert", "I-BERT"), | |||
| ("t5", "T5"), | |||
| ("mobilebert", "MobileBERT"), | |||
| ("distilbert", "DistilBERT"), | |||
| ("albert", "ALBERT"), | |||
| ("bert-generation", "Bert Generation"), | |||
| ("camembert", "CamemBERT"), | |||
| ("xlm-roberta", "XLM-RoBERTa"), | |||
| ("pegasus", "Pegasus"), | |||
| ("blenderbot", "Blenderbot"), | |||
| ("marian", "Marian"), | |||
| ("mbart", "mBART"), | |||
| ("megatron-bert", "MegatronBert"), | |||
| ("bart", "BART"), | |||
| ("reformer", "Reformer"), | |||
| ("longformer", "Longformer"), | |||
| ("roberta", "RoBERTa"), | |||
| ("flaubert", "FlauBERT"), | |||
| ("fsmt", "FairSeq Machine-Translation"), | |||
| ("squeezebert", "SqueezeBERT"), | |||
| ("bert", "BERT"), | |||
| ("openai-gpt", "OpenAI GPT"), | |||
| ("gpt2", "OpenAI GPT-2"), | |||
| ("transfo-xl", "Transformer-XL"), | |||
| ("xlnet", "XLNet"), | |||
| ("xlm", "XLM"), | |||
| ("ctrl", "CTRL"), | |||
| ("electra", "ELECTRA"), | |||
| ("encoder-decoder", "Encoder decoder"), | |||
| ("speech-encoder-decoder", "Speech Encoder decoder"), | |||
| ("funnel", "Funnel Transformer"), | |||
| ("lxmert", "LXMERT"), | |||
| ("deberta-v2", "DeBERTa-v2"), | |||
| ("deberta", "DeBERTa"), | |||
| ("layoutlm", "LayoutLM"), | |||
| ("dpr", "DPR"), | |||
| ("rag", "RAG"), | |||
| ("xlm-prophetnet", "XLMProphetNet"), | |||
| ("prophetnet", "ProphetNet"), | |||
| ("mt5", "mT5"), | |||
| ("mpnet", "MPNet"), | |||
| ("tapas", "TAPAS"), | |||
| ("hubert", "Hubert"), | |||
| ("barthez", "BARThez"), | |||
| ("phobert", "PhoBERT"), | |||
| ("cpm", "CPM"), | |||
| ("bertweet", "Bertweet"), | |||
| ("bert-japanese", "BertJapanese"), | |||
| ("byt5", "ByT5"), | |||
| ("mbart50", "mBART-50"), | |||
| ("splinter", "Splinter"), | |||
| ] | |||
| ) | |||
| SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict([("openai-gpt", "openai")]) | |||
| def model_type_to_module_name(key): | |||
| """Converts a config key to the corresponding module.""" | |||
| # Special treatment | |||
| if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: | |||
| return SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] | |||
| return key.replace("-", "_") | |||
| def config_class_to_model_type(config): | |||
| """Converts a config class name to the corresponding model type""" | |||
| for key, cls in CONFIG_MAPPING_NAMES.items(): | |||
| if cls == config: | |||
| return key | |||
| return None | |||
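| # Illustrative sanity checks (added for exposition; not part of the original diff). | |||
| # The expected values follow directly from the two mappings defined above. | |||
| assert model_type_to_module_name("openai-gpt") == "openai"        # special-cased module name | |||
| assert model_type_to_module_name("xlm-roberta") == "xlm_roberta"  # dashes become underscores | |||
| assert config_class_to_model_type("BertConfig") == "bert" | |||
| assert config_class_to_model_type("UnknownConfig") is None        # unmapped classes yield None | |||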
| class _LazyConfigMapping(OrderedDict): | |||
| """ | |||
| A dictionary that lazily loads its values when they are requested. | |||
| """ | |||
| def __init__(self, mapping): | |||
| self._mapping = mapping | |||
| self._modules = {} | |||
| def __getitem__(self, key): | |||
| if key not in self._mapping: | |||
| raise KeyError(key) | |||
| value = self._mapping[key] | |||
| module_name = model_type_to_module_name(key) | |||
| if module_name not in self._modules: | |||
| self._modules[module_name] = importlib.import_module(f".{module_name}", "fastNLP.transformers.torch.models") | |||
| return getattr(self._modules[module_name], value) | |||
| def keys(self): | |||
| return self._mapping.keys() | |||
| def values(self): | |||
| return [self[k] for k in self._mapping.keys()] | |||
| def items(self): | |||
| return [(k, self[k]) for k in self._mapping.keys()] | |||
| def __iter__(self): | |||
| return iter(self._mapping.keys()) | |||
| def __contains__(self, item): | |||
| return item in self._mapping | |||
| CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) | |||
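| # Hedged usage sketch: the lazy mapping imports a model's config module only on | |||
| # first access and caches it, so repeated lookups return the identical class | |||
| # object (assumes the per-model config modules are importable in this package). | |||
| bert_config_cls = CONFIG_MAPPING["bert"]          # first access triggers the import | |||
| assert bert_config_cls is CONFIG_MAPPING["bert"]  # second access is served from the cache | |||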
| class _LazyLoadAllMappings(OrderedDict): | |||
| """ | |||
| A mapping that loads all of its key/value pairs at the first access (whether by indexing, requesting keys, values, | |||
| etc.). | |||
| Args: | |||
| mapping: The mapping to load. | |||
| """ | |||
| def __init__(self, mapping): | |||
| self._mapping = mapping | |||
| self._initialized = False | |||
| self._data = {} | |||
| def _initialize(self): | |||
| if self._initialized: | |||
| return | |||
| logger.warning( | |||
| "ALL_PRETRAINED_CONFIG_ARCHIVE_MAP is deprecated and will be removed in v5 of Transformers. " | |||
| "It does not contain all available model checkpoints, far from it. Check out hf.co/models for that." | |||
| ) | |||
| for model_type, map_name in self._mapping.items(): | |||
| module_name = model_type_to_module_name(model_type) | |||
| module = importlib.import_module(f".{module_name}", "fastNLP.transformers.torch.models") | |||
| mapping = getattr(module, map_name) | |||
| self._data.update(mapping) | |||
| self._initialized = True | |||
| def __getitem__(self, key): | |||
| self._initialize() | |||
| return self._data[key] | |||
| def keys(self): | |||
| self._initialize() | |||
| return self._data.keys() | |||
| def values(self): | |||
| self._initialize() | |||
| return self._data.values() | |||
| def items(self): | |||
| self._initialize() | |||
| return self._data.items() | |||
| def __iter__(self): | |||
| self._initialize() | |||
| return iter(self._data) | |||
| def __contains__(self, item): | |||
| self._initialize() | |||
| return item in self._data | |||
| ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = _LazyLoadAllMappings(CONFIG_ARCHIVE_MAP_MAPPING_NAMES) | |||
| def _get_class_name(model_class: Union[str, List[str]]): | |||
| if isinstance(model_class, (list, tuple)): | |||
| return " or ".join([f":class:`~transformers.{c}`" for c in model_class if c is not None]) | |||
| return f":class:`~transformers.{model_class}`" | |||
| def _list_model_options(indent, config_to_class=None, use_model_types=True): | |||
| if config_to_class is None and not use_model_types: | |||
| raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") | |||
| if use_model_types: | |||
| if config_to_class is None: | |||
| model_type_to_name = { | |||
| model_type: f":class:`~transformers.{config}`" for model_type, config in CONFIG_MAPPING_NAMES.items() | |||
| } | |||
| else: | |||
| model_type_to_name = { | |||
| model_type: _get_class_name(model_class) | |||
| for model_type, model_class in config_to_class.items() | |||
| if model_type in MODEL_NAMES_MAPPING | |||
| } | |||
| lines = [ | |||
| f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" | |||
| for model_type in sorted(model_type_to_name.keys()) | |||
| ] | |||
| else: | |||
| config_to_name = { | |||
| CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) | |||
| for config, clas in config_to_class.items() | |||
| if config in CONFIG_MAPPING_NAMES | |||
| } | |||
| config_to_model_name = { | |||
| config: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING_NAMES.items() | |||
| } | |||
| lines = [ | |||
| f"{indent}- :class:`~transformers.{config_name}` configuration class: {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" | |||
| for config_name in sorted(config_to_name.keys()) | |||
| ] | |||
| return "\n".join(lines) | |||
| def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): | |||
| def docstring_decorator(fn): | |||
| docstrings = fn.__doc__ | |||
| lines = docstrings.split("\n") | |||
| i = 0 | |||
| while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: | |||
| i += 1 | |||
| if i < len(lines): | |||
| indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] | |||
| if use_model_types: | |||
| indent = f"{indent} " | |||
| lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) | |||
| docstrings = "\n".join(lines) | |||
| else: | |||
| raise ValueError( | |||
| f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current docstring is:\n{docstrings}" | |||
| ) | |||
| fn.__doc__ = docstrings | |||
| return fn | |||
| return docstring_decorator | |||
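| # Hedged sketch of the decorator contract; `_describe_configs` is a hypothetical | |||
| # function added only for illustration. The literal placeholder line "List options" | |||
| # in its docstring is replaced in place by the generated per-model bullet list. | |||
| @replace_list_option_in_docstrings() | |||
| def _describe_configs(): | |||
| """Pick a configuration class. | |||
| List options | |||
| """ | |||
| assert "- **bert**" in _describe_configs.__doc__ | |||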
| class AutoConfig: | |||
| r""" | |||
| This is a generic configuration class that will be instantiated as one of the configuration classes of the library | |||
| when created with the :meth:`~transformers.AutoConfig.from_pretrained` class method. | |||
| This class cannot be instantiated directly using ``__init__()`` (throws an error). | |||
| """ | |||
| def __init__(self): | |||
| raise EnvironmentError( | |||
| "AutoConfig is designed to be instantiated " | |||
| "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." | |||
| ) | |||
| @classmethod | |||
| def for_model(cls, model_type: str, *args, **kwargs): | |||
| if model_type in CONFIG_MAPPING: | |||
| config_class = CONFIG_MAPPING[model_type] | |||
| return config_class(*args, **kwargs) | |||
| raise ValueError( | |||
| f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" | |||
| ) | |||
| @classmethod | |||
| @replace_list_option_in_docstrings() | |||
| def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): | |||
| r""" | |||
| Instantiate one of the configuration classes of the library from a pretrained model configuration. | |||
| The configuration class to instantiate is selected based on the :obj:`model_type` property of the config object | |||
| that is loaded, or when it's missing, by falling back to using pattern matching on | |||
| :obj:`pretrained_model_name_or_path`: | |||
| List options | |||
| Args: | |||
| pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): | |||
| Can be either: | |||
| - A string, the `model id` of a pretrained model configuration hosted inside a model repo on | |||
| huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or | |||
| namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``. | |||
| - A path to a `directory` containing a configuration file saved using the | |||
| :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the | |||
| :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``. | |||
| - A path or url to a saved configuration JSON `file`, e.g., | |||
| ``./my_model_directory/configuration.json``. | |||
| cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`): | |||
| Path to a directory in which a downloaded pretrained model configuration should be cached if the | |||
| standard cache should not be used. | |||
| force_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to force the (re-)download of the model weights and configuration files, overriding the | |||
| cached versions if they exist. | |||
| resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to delete incompletely received files. Will attempt to resume the download if such a | |||
| file exists. | |||
| proxies (:obj:`Dict[str, str]`, `optional`): | |||
| A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128', | |||
| 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. | |||
| revision (:obj:`str`, `optional`, defaults to :obj:`"main"`): | |||
| The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a | |||
| git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any | |||
| identifier allowed by git. | |||
| return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| If :obj:`False`, then this function returns just the final configuration object. | |||
| If :obj:`True`, then this function returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` | |||
| is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., | |||
| the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored. | |||
| kwargs (additional keyword arguments, `optional`): | |||
| The values in kwargs of any keys which are configuration attributes will be used to override the loaded | |||
| values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled | |||
| by the ``return_unused_kwargs`` keyword parameter. | |||
| Examples:: | |||
| >>> from transformers import AutoConfig | |||
| >>> # Download configuration from huggingface.co and cache. | |||
| >>> config = AutoConfig.from_pretrained('bert-base-uncased') | |||
| >>> # Download configuration from huggingface.co (user-uploaded) and cache. | |||
| >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased') | |||
| >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`). | |||
| >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/') | |||
| >>> # Load a specific configuration file. | |||
| >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json') | |||
| >>> # Change some config attributes when loading a pretrained config. | |||
| >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False) | |||
| >>> config.output_attentions | |||
| True | |||
| >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True) | |||
| >>> config.output_attentions | |||
| True | |||
| >>> unused_kwargs | |||
| {'foo': False} | |||
| """ | |||
| kwargs["_from_auto"] = True | |||
| config_dict, _ = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) | |||
| if "model_type" in config_dict: | |||
| config_class = CONFIG_MAPPING[config_dict["model_type"]] | |||
| return config_class.from_dict(config_dict, **kwargs) | |||
| else: | |||
| # Fallback: use pattern matching on the string. | |||
| for pattern, config_class in CONFIG_MAPPING.items(): | |||
| if pattern in str(pretrained_model_name_or_path): | |||
| return config_class.from_dict(config_dict, **kwargs) | |||
| raise ValueError( | |||
| f"Unrecognized model in {pretrained_model_name_or_path}. " | |||
| f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings " | |||
| f"in its name: {', '.join(CONFIG_MAPPING.keys())}" | |||
| ) | |||
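| # Hedged usage sketch for the two entry points above. `for_model` builds a fresh, | |||
| # untrained config by model type; `from_pretrained` (commented out: it fetches the | |||
| # checkpoint's config over the network) resolves the class via its `model_type` key. | |||
| config = AutoConfig.for_model("bert", num_hidden_layers=6) | |||
| # config = AutoConfig.from_pretrained("bert-base-uncased") | |||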
| @@ -0,0 +1,199 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ Auto Tokenizer class. """ | |||
| from collections import OrderedDict | |||
| from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union | |||
| from ...file_utils import ( | |||
| is_sentencepiece_available, | |||
| is_tokenizers_available, | |||
| ) | |||
| if TYPE_CHECKING: | |||
| # This significantly improves completion suggestion performance when | |||
| # the transformers package is used with Microsoft's Pylance language server. | |||
| TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() | |||
| else: | |||
| TOKENIZER_MAPPING_NAMES = OrderedDict( | |||
| [ | |||
| ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), | |||
| ( | |||
| "t5", | |||
| ( | |||
| "T5Tokenizer" if is_sentencepiece_available() else None, | |||
| "T5TokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "mt5", | |||
| ( | |||
| "MT5Tokenizer" if is_sentencepiece_available() else None, | |||
| "MT5TokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ( | |||
| "albert", | |||
| ( | |||
| "AlbertTokenizer" if is_sentencepiece_available() else None, | |||
| "AlbertTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "camembert", | |||
| ( | |||
| "CamembertTokenizer" if is_sentencepiece_available() else None, | |||
| "CamembertTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "pegasus", | |||
| ( | |||
| "PegasusTokenizer" if is_sentencepiece_available() else None, | |||
| "PegasusTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "mbart", | |||
| ( | |||
| "MBartTokenizer" if is_sentencepiece_available() else None, | |||
| "MBartTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "xlm-roberta", | |||
| ( | |||
| "XLMRobertaTokenizer" if is_sentencepiece_available() else None, | |||
| "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), | |||
| ("blenderbot", ("BlenderbotTokenizer", None)), | |||
| ("bart", ("BartTokenizer", "BartTokenizerFast")), | |||
| ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), | |||
| ( | |||
| "reformer", | |||
| ( | |||
| "ReformerTokenizer" if is_sentencepiece_available() else None, | |||
| "ReformerTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), | |||
| ( | |||
| "dpr", | |||
| ( | |||
| "DPRQuestionEncoderTokenizer", | |||
| "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "squeezebert", | |||
| ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), | |||
| ), | |||
| ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("openai-gpt", ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), | |||
| ("transfo-xl", ("TransfoXLTokenizer", None)), | |||
| ( | |||
| "xlnet", | |||
| ( | |||
| "XLNetTokenizer" if is_sentencepiece_available() else None, | |||
| "XLNetTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("flaubert", ("FlaubertTokenizer", None)), | |||
| ("xlm", ("XLMTokenizer", None)), | |||
| ("ctrl", ("CTRLTokenizer", None)), | |||
| ("fsmt", ("FSMTTokenizer", None)), | |||
| ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("deberta-v2", ("DebertaV2Tokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("rag", ("RagTokenizer", None)), | |||
| ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), | |||
| ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), | |||
| ("prophetnet", ("ProphetNetTokenizer", None)), | |||
| ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("tapas", ("TapasTokenizer", None)), | |||
| ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ( | |||
| "big_bird", | |||
| ( | |||
| "BigBirdTokenizer" if is_sentencepiece_available() else None, | |||
| "BigBirdTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), | |||
| ("hubert", ("Wav2Vec2CTCTokenizer", None)), | |||
| ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), | |||
| ("luke", ("LukeTokenizer", None)), | |||
| ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("canine", ("CanineTokenizer", None)), | |||
| ("bertweet", ("BertweetTokenizer", None)), | |||
| ("bert-japanese", ("BertJapaneseTokenizer", None)), | |||
| ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), | |||
| ("byt5", ("ByT5Tokenizer", None)), | |||
| ( | |||
| "cpm", | |||
| ( | |||
| "CpmTokenizer" if is_sentencepiece_available() else None, | |||
| "CpmTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), | |||
| ("phobert", ("PhobertTokenizer", None)), | |||
| ( | |||
| "barthez", | |||
| ( | |||
| "BarthezTokenizer" if is_sentencepiece_available() else None, | |||
| "BarthezTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "mbart50", | |||
| ( | |||
| "MBart50Tokenizer" if is_sentencepiece_available() else None, | |||
| "MBart50TokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "rembert", | |||
| ( | |||
| "RemBertTokenizer" if is_sentencepiece_available() else None, | |||
| "RemBertTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ( | |||
| "clip", | |||
| ( | |||
| "CLIPTokenizer", | |||
| "CLIPTokenizerFast" if is_tokenizers_available() else None, | |||
| ), | |||
| ), | |||
| ] | |||
| ) | |||
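| # Hedged illustration of the tuple convention above: the first slot names the slow | |||
| # (pure Python) tokenizer, the second the fast (Rust-backed) one; a slot is None | |||
| # when its optional dependency (sentencepiece / tokenizers) is unavailable. | |||
| slow_name, fast_name = TOKENIZER_MAPPING_NAMES["bert"] | |||
| assert slow_name == "BertTokenizer" | |||
| assert fast_name in ("BertTokenizerFast", None)  # depends on is_tokenizers_available() | |||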
| @@ -0,0 +1,20 @@ | |||
| __all__ = [ | |||
| "BartConfig", | |||
| "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "BART_PRETRAINED_MODEL_ARCHIVE_LIST", | |||
| "BartForCausalLM", | |||
| "BartForConditionalGeneration", | |||
| "BartForQuestionAnswering", | |||
| "BartForSequenceClassification", | |||
| "BartModel", | |||
| "BartPretrainedModel", | |||
| "PretrainedBartModel", | |||
| "BartTokenizer", | |||
| ] | |||
| from .configuration_bart import BartConfig, BART_PRETRAINED_CONFIG_ARCHIVE_MAP | |||
| from .tokenization_bart import BartTokenizer | |||
| from .modeling_bart import BartForCausalLM, BartForConditionalGeneration, BartModel, BartForQuestionAnswering, \ | |||
| BartForSequenceClassification, BartPretrainedModel, PretrainedBartModel, BART_PRETRAINED_MODEL_ARCHIVE_LIST | |||
| @@ -0,0 +1,177 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ BART model configuration """ | |||
| from fastNLP.transformers.torch.configuration_utils import PretrainedConfig | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "BartConfig", | |||
| "BART_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| ] | |||
| BART_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |||
| "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json", | |||
| # See all BART models at https://huggingface.co/models?filter=bart | |||
| } | |||
| class BartConfig(PretrainedConfig): | |||
| r""" | |||
| This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to | |||
| instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a | |||
| configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large | |||
| <https://huggingface.co/facebook/bart-large>`__ architecture. | |||
| Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model | |||
| outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. | |||
| Args: | |||
| vocab_size (:obj:`int`, `optional`, defaults to 50265): | |||
| Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the | |||
| :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or | |||
| :class:`~transformers.TFBartModel`. | |||
| d_model (:obj:`int`, `optional`, defaults to 1024): | |||
| Dimensionality of the layers and the pooler layer. | |||
| encoder_layers (:obj:`int`, `optional`, defaults to 12): | |||
| Number of encoder layers. | |||
| decoder_layers (:obj:`int`, `optional`, defaults to 12): | |||
| Number of decoder layers. | |||
| encoder_attention_heads (:obj:`int`, `optional`, defaults to 16): | |||
| Number of attention heads for each attention layer in the Transformer encoder. | |||
| decoder_attention_heads (:obj:`int`, `optional`, defaults to 16): | |||
| Number of attention heads for each attention layer in the Transformer decoder. | |||
| decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): | |||
| Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. | |||
| encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096): | |||
| Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. | |||
| activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): | |||
| The non-linear activation function (function or string) in the encoder and pooler. If string, | |||
| :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. | |||
| dropout (:obj:`float`, `optional`, defaults to 0.1): | |||
| The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. | |||
| attention_dropout (:obj:`float`, `optional`, defaults to 0.0): | |||
| The dropout ratio for the attention probabilities. | |||
| activation_dropout (:obj:`float`, `optional`, defaults to 0.0): | |||
| The dropout ratio for activations inside the fully connected layer. | |||
| classifier_dropout (:obj:`float`, `optional`, defaults to 0.0): | |||
| The dropout ratio for classifier. | |||
| max_position_embeddings (:obj:`int`, `optional`, defaults to 1024): | |||
| The maximum sequence length that this model might ever be used with. Typically set this to something large | |||
| just in case (e.g., 512 or 1024 or 2048). | |||
| init_std (:obj:`float`, `optional`, defaults to 0.02): | |||
| The standard deviation of the truncated_normal_initializer for initializing all weight matrices. | |||
| encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0): | |||
| The LayerDrop probability for the encoder. See the `LayerDrop paper | |||
| <https://arxiv.org/abs/1909.11556>`__ for more details. | |||
| decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0): | |||
| The LayerDrop probability for the decoder. See the `LayerDrop paper | |||
| <https://arxiv.org/abs/1909.11556>`__ for more details. | |||
| scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Scale embeddings by dividing by sqrt(d_model). | |||
| use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not the model should return the last key/values attentions (not used by all models). | |||
| num_labels (:obj:`int`, `optional`, defaults to 3): | |||
| The number of labels to use in :class:`~transformers.BartForSequenceClassification`. | |||
| forced_eos_token_id (:obj:`int`, `optional`, defaults to 2): | |||
| The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to | |||
| :obj:`eos_token_id`. | |||
| Example:: | |||
| >>> from transformers import BartModel, BartConfig | |||
| >>> # Initializing a BART facebook/bart-large style configuration | |||
| >>> configuration = BartConfig() | |||
| >>> # Initializing a model from the facebook/bart-large style configuration | |||
| >>> model = BartModel(configuration) | |||
| >>> # Accessing the model configuration | |||
| >>> configuration = model.config | |||
| """ | |||
| model_type = "bart" | |||
| keys_to_ignore_at_inference = ["past_key_values"] | |||
| attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} | |||
| def __init__( | |||
| self, | |||
| vocab_size=50265, | |||
| max_position_embeddings=1024, | |||
| encoder_layers=12, | |||
| encoder_ffn_dim=4096, | |||
| encoder_attention_heads=16, | |||
| decoder_layers=12, | |||
| decoder_ffn_dim=4096, | |||
| decoder_attention_heads=16, | |||
| encoder_layerdrop=0.0, | |||
| decoder_layerdrop=0.0, | |||
| activation_function="gelu", | |||
| d_model=1024, | |||
| dropout=0.1, | |||
| attention_dropout=0.0, | |||
| activation_dropout=0.0, | |||
| init_std=0.02, | |||
| classifier_dropout=0.0, | |||
| scale_embedding=False, | |||
| use_cache=True, | |||
| num_labels=3, | |||
| pad_token_id=1, | |||
| bos_token_id=0, | |||
| eos_token_id=2, | |||
| is_encoder_decoder=True, | |||
| decoder_start_token_id=2, | |||
| forced_eos_token_id=2, | |||
| **kwargs | |||
| ): | |||
| self.vocab_size = vocab_size | |||
| self.max_position_embeddings = max_position_embeddings | |||
| self.d_model = d_model | |||
| self.encoder_ffn_dim = encoder_ffn_dim | |||
| self.encoder_layers = encoder_layers | |||
| self.encoder_attention_heads = encoder_attention_heads | |||
| self.decoder_ffn_dim = decoder_ffn_dim | |||
| self.decoder_layers = decoder_layers | |||
| self.decoder_attention_heads = decoder_attention_heads | |||
| self.dropout = dropout | |||
| self.attention_dropout = attention_dropout | |||
| self.activation_dropout = activation_dropout | |||
| self.activation_function = activation_function | |||
| self.init_std = init_std | |||
| self.encoder_layerdrop = encoder_layerdrop | |||
| self.decoder_layerdrop = decoder_layerdrop | |||
| self.classifier_dropout = classifier_dropout | |||
| self.use_cache = use_cache | |||
| self.num_hidden_layers = encoder_layers | |||
| self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True | |||
| super().__init__( | |||
| num_labels=num_labels, | |||
| pad_token_id=pad_token_id, | |||
| bos_token_id=bos_token_id, | |||
| eos_token_id=eos_token_id, | |||
| is_encoder_decoder=is_encoder_decoder, | |||
| decoder_start_token_id=decoder_start_token_id, | |||
| forced_eos_token_id=forced_eos_token_id, | |||
| **kwargs, | |||
| ) | |||
| # ensure backward compatibility for BART CNN models | |||
| if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): | |||
| self.forced_bos_token_id = self.bos_token_id | |||
| logger.warning( | |||
| f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. " | |||
| "The config can simply be saved and uploaded again to be fixed." | |||
| ) | |||
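| # Hedged sketch of the `attribute_map` declared above: PretrainedConfig aliases the | |||
| # generic attribute names onto the BART-specific ones, so both spellings stay in sync. | |||
| cfg = BartConfig(encoder_layers=6, decoder_layers=6) | |||
| assert cfg.hidden_size == cfg.d_model | |||
| assert cfg.num_attention_heads == cfg.encoder_attention_heads | |||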
| @@ -0,0 +1,65 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| from ..roberta.tokenization_roberta import RobertaTokenizer | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "BartTokenizer", | |||
| ] | |||
| VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} | |||
| # See all BART models at https://huggingface.co/models?filter=bart | |||
| PRETRAINED_VOCAB_FILES_MAP = { | |||
| "vocab_file": { | |||
| "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json", | |||
| "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json", | |||
| "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json", | |||
| "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json", | |||
| "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json", | |||
| "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json", | |||
| }, | |||
| "merges_file": { | |||
| "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt", | |||
| "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt", | |||
| "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt", | |||
| "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt", | |||
| "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt", | |||
| "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt", | |||
| }, | |||
| } | |||
| PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| "facebook/bart-base": 1024, | |||
| "facebook/bart-large": 1024, | |||
| "facebook/bart-large-mnli": 1024, | |||
| "facebook/bart-large-cnn": 1024, | |||
| "facebook/bart-large-xsum": 1024, | |||
| "yjernite/bart_eli5": 1024, | |||
| } | |||
| class BartTokenizer(RobertaTokenizer): | |||
| r""" | |||
| Construct a BART tokenizer. | |||
| :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass | |||
| :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization | |||
| parameters and other methods. | |||
| """ | |||
| vocab_files_names = VOCAB_FILES_NAMES | |||
| pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP | |||
| max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES | |||
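| # Hedged usage sketch (commented out: it downloads vocab.json/merges.txt on first | |||
| # call). Since BartTokenizer subclasses RobertaTokenizer, it applies the same | |||
| # byte-level BPE and special-token handling. | |||
| # tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") | |||
| # tokenizer("Hello world")["input_ids"]  # ids framed by <s> ... </s> | |||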
| @@ -0,0 +1,27 @@ | |||
| __all__ = [ | |||
| "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "BertConfig", | |||
| "BERT_PRETRAINED_MODEL_ARCHIVE_LIST", | |||
| "BertForMaskedLM", | |||
| "BertForMultipleChoice", | |||
| "BertForNextSentencePrediction", | |||
| "BertForPreTraining", | |||
| "BertForQuestionAnswering", | |||
| "BertForSequenceClassification", | |||
| "BertForTokenClassification", | |||
| "BertLayer", | |||
| "BertLMHeadModel", | |||
| "BertModel", | |||
| "BertPreTrainedModel", | |||
| "BasicTokenizer", | |||
| "BertTokenizer", | |||
| "WordpieceTokenizer", | |||
| ] | |||
| from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP | |||
| from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer | |||
| from .modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_LIST, BertForMaskedLM, BertForMultipleChoice, BertForPreTraining, \ | |||
| BertForNextSentencePrediction, BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, \ | |||
| BertLayer, BertLMHeadModel, BertModel, BertPreTrainedModel | |||
| @@ -0,0 +1,158 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ BERT model configuration """ | |||
| from fastNLP.transformers.torch.configuration_utils import PretrainedConfig | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "BertConfig", | |||
| ] | |||
| BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |||
| "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/config.json", | |||
| "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/config.json", | |||
| "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/config.json", | |||
| "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/config.json", | |||
| "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/config.json", | |||
| "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json", | |||
| "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/config.json", | |||
| "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/config.json", | |||
| "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/config.json", | |||
| "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/config.json", | |||
| "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json", | |||
| "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/config.json", | |||
| "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/config.json", | |||
| "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/config.json", | |||
| "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/config.json", | |||
| "cl-tohoku/bert-base-japanese": "https://huggingface.co/cl-tohoku/bert-base-japanese/resolve/main/config.json", | |||
| "cl-tohoku/bert-base-japanese-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-whole-word-masking/resolve/main/config.json", | |||
| "cl-tohoku/bert-base-japanese-char": "https://huggingface.co/cl-tohoku/bert-base-japanese-char/resolve/main/config.json", | |||
| "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://huggingface.co/cl-tohoku/bert-base-japanese-char-whole-word-masking/resolve/main/config.json", | |||
| "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/config.json", | |||
| "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/config.json", | |||
| "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/config.json", | |||
| # See all BERT models at https://huggingface.co/models?filter=bert | |||
| } | |||
| class BertConfig(PretrainedConfig): | |||
| r""" | |||
| This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a | |||
| :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments, | |||
| defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration | |||
| to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture. | |||
| Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model | |||
| outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. | |||
| Args: | |||
| vocab_size (:obj:`int`, `optional`, defaults to 30522): | |||
| Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the | |||
| :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or | |||
| :class:`~transformers.TFBertModel`. | |||
| hidden_size (:obj:`int`, `optional`, defaults to 768): | |||
| Dimensionality of the encoder layers and the pooler layer. | |||
| num_hidden_layers (:obj:`int`, `optional`, defaults to 12): | |||
| Number of hidden layers in the Transformer encoder. | |||
| num_attention_heads (:obj:`int`, `optional`, defaults to 12): | |||
| Number of attention heads for each attention layer in the Transformer encoder. | |||
| intermediate_size (:obj:`int`, `optional`, defaults to 3072): | |||
| Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. | |||
| hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): | |||
| The non-linear activation function (function or string) in the encoder and pooler. If string, | |||
| :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. | |||
| hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): | |||
| The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. | |||
| attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): | |||
| The dropout ratio for the attention probabilities. | |||
| max_position_embeddings (:obj:`int`, `optional`, defaults to 512): | |||
| The maximum sequence length that this model might ever be used with. Typically set this to something large | |||
| just in case (e.g., 512 or 1024 or 2048). | |||
| type_vocab_size (:obj:`int`, `optional`, defaults to 2): | |||
| The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or | |||
| :class:`~transformers.TFBertModel`. | |||
| initializer_range (:obj:`float`, `optional`, defaults to 0.02): | |||
| The standard deviation of the truncated_normal_initializer for initializing all weight matrices. | |||
| layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12): | |||
| The epsilon used by the layer normalization layers. | |||
| position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`): | |||
| Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`, | |||
| :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on | |||
| :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.) | |||
| <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to | |||
| `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.) | |||
| <https://arxiv.org/abs/2009.13658>`__. | |||
| use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not the model should return the last key/values attentions (not used by all models). Only | |||
| relevant if ``config.is_decoder=True``. | |||
| classifier_dropout (:obj:`float`, `optional`): | |||
| The dropout ratio for the classification head. | |||
| Examples:: | |||
| >>> from transformers import BertModel, BertConfig | |||
| >>> # Initializing a BERT bert-base-uncased style configuration | |||
| >>> configuration = BertConfig() | |||
| >>> # Initializing a model from the bert-base-uncased style configuration | |||
| >>> model = BertModel(configuration) | |||
| >>> # Accessing the model configuration | |||
| >>> configuration = model.config | |||
| """ | |||
| model_type = "bert" | |||
| def __init__( | |||
| self, | |||
| vocab_size=30522, | |||
| hidden_size=768, | |||
| num_hidden_layers=12, | |||
| num_attention_heads=12, | |||
| intermediate_size=3072, | |||
| hidden_act="gelu", | |||
| hidden_dropout_prob=0.1, | |||
| attention_probs_dropout_prob=0.1, | |||
| max_position_embeddings=512, | |||
| type_vocab_size=2, | |||
| initializer_range=0.02, | |||
| layer_norm_eps=1e-12, | |||
| pad_token_id=0, | |||
| position_embedding_type="absolute", | |||
| use_cache=True, | |||
| classifier_dropout=None, | |||
| **kwargs | |||
| ): | |||
| super().__init__(pad_token_id=pad_token_id, **kwargs) | |||
| self.vocab_size = vocab_size | |||
| self.hidden_size = hidden_size | |||
| self.num_hidden_layers = num_hidden_layers | |||
| self.num_attention_heads = num_attention_heads | |||
| self.hidden_act = hidden_act | |||
| self.intermediate_size = intermediate_size | |||
| self.hidden_dropout_prob = hidden_dropout_prob | |||
| self.attention_probs_dropout_prob = attention_probs_dropout_prob | |||
| self.max_position_embeddings = max_position_embeddings | |||
| self.type_vocab_size = type_vocab_size | |||
| self.initializer_range = initializer_range | |||
| self.layer_norm_eps = layer_norm_eps | |||
| self.position_embedding_type = position_embedding_type | |||
| self.use_cache = use_cache | |||
| self.classifier_dropout = classifier_dropout | |||
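| # Hedged sketch: arguments are stored as plain attributes, and any unknown kwargs | |||
| # fall through to PretrainedConfig via the super().__init__ call above. | |||
| cfg = BertConfig(num_hidden_layers=6) | |||
| assert cfg.num_hidden_layers == 6 | |||
| assert cfg.classifier_dropout is None  # default declared above | |||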
| @@ -0,0 +1,558 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Tokenization classes for Bert.""" | |||
| import collections | |||
| import os | |||
| import unicodedata | |||
| from typing import List, Optional, Tuple | |||
| from fastNLP.transformers.torch.tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "BasicTokenizer", | |||
| "BertTokenizer", | |||
| "WordpieceTokenizer", | |||
| ] | |||
| VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} | |||
| PRETRAINED_VOCAB_FILES_MAP = { | |||
| "vocab_file": { | |||
| "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", | |||
| "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", | |||
| "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", | |||
| "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", | |||
| "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt", | |||
| "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", | |||
| "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", | |||
| "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", | |||
| "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt", | |||
| "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt", | |||
| "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", | |||
| "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", | |||
| "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt", | |||
| "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", | |||
| "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt", | |||
| "TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt", | |||
| "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt", | |||
| "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt", | |||
| } | |||
| } | |||
| PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| "bert-base-uncased": 512, | |||
| "bert-large-uncased": 512, | |||
| "bert-base-cased": 512, | |||
| "bert-large-cased": 512, | |||
| "bert-base-multilingual-uncased": 512, | |||
| "bert-base-multilingual-cased": 512, | |||
| "bert-base-chinese": 512, | |||
| "bert-base-german-cased": 512, | |||
| "bert-large-uncased-whole-word-masking": 512, | |||
| "bert-large-cased-whole-word-masking": 512, | |||
| "bert-large-uncased-whole-word-masking-finetuned-squad": 512, | |||
| "bert-large-cased-whole-word-masking-finetuned-squad": 512, | |||
| "bert-base-cased-finetuned-mrpc": 512, | |||
| "bert-base-german-dbmdz-cased": 512, | |||
| "bert-base-german-dbmdz-uncased": 512, | |||
| "TurkuNLP/bert-base-finnish-cased-v1": 512, | |||
| "TurkuNLP/bert-base-finnish-uncased-v1": 512, | |||
| "wietsedv/bert-base-dutch-cased": 512, | |||
| } | |||
| PRETRAINED_INIT_CONFIGURATION = { | |||
| "bert-base-uncased": {"do_lower_case": True}, | |||
| "bert-large-uncased": {"do_lower_case": True}, | |||
| "bert-base-cased": {"do_lower_case": False}, | |||
| "bert-large-cased": {"do_lower_case": False}, | |||
| "bert-base-multilingual-uncased": {"do_lower_case": True}, | |||
| "bert-base-multilingual-cased": {"do_lower_case": False}, | |||
| "bert-base-chinese": {"do_lower_case": False}, | |||
| "bert-base-german-cased": {"do_lower_case": False}, | |||
| "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, | |||
| "bert-large-cased-whole-word-masking": {"do_lower_case": False}, | |||
| "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, | |||
| "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, | |||
| "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, | |||
| "bert-base-german-dbmdz-cased": {"do_lower_case": False}, | |||
| "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, | |||
| "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, | |||
| "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, | |||
| "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, | |||
| } | |||
| def load_vocab(vocab_file): | |||
| """Loads a vocabulary file into a dictionary.""" | |||
| vocab = collections.OrderedDict() | |||
| with open(vocab_file, "r", encoding="utf-8") as reader: | |||
| tokens = reader.readlines() | |||
| for index, token in enumerate(tokens): | |||
| token = token.rstrip("\n") | |||
| vocab[token] = index | |||
| return vocab | |||
| def whitespace_tokenize(text): | |||
| """Runs basic whitespace cleaning and splitting on a piece of text.""" | |||
| text = text.strip() | |||
| if not text: | |||
| return [] | |||
| tokens = text.split() | |||
| return tokens | |||
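| # A minimal usage sketch (file name hypothetical): given a vocab.txt containing one | |||
| # token per line, e.g. "[PAD]", "[UNK]", "hello", load_vocab maps each token to its | |||
| # line index, while whitespace_tokenize only strips and splits on whitespace: | |||
| # | |||
| #     load_vocab("vocab.txt")                 # OrderedDict([("[PAD]", 0), ("[UNK]", 1), ("hello", 2)]) | |||
| #     whitespace_tokenize("  hello  world ")  # ["hello", "world"] | |||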
| class BertTokenizer(PreTrainedTokenizer): | |||
| r""" | |||
| Construct a BERT tokenizer. Based on WordPiece. | |||
| This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. | |||
| Users should refer to this superclass for more information regarding those methods. | |||
| Args: | |||
| vocab_file (:obj:`str`): | |||
| File containing the vocabulary. | |||
| do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not to lowercase the input when tokenizing. | |||
| do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not to do basic tokenization before WordPiece. | |||
| never_split (:obj:`Iterable`, `optional`): | |||
| Collection of tokens which will never be split during tokenization. Only has an effect when | |||
| :obj:`do_basic_tokenize=True` | |||
| unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): | |||
| The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this | |||
| token instead. | |||
| sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): | |||
| The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for | |||
| sequence classification or for a text and a question for question answering. It is also used as the last | |||
| token of a sequence built with special tokens. | |||
| pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): | |||
| The token used for padding, for example when batching sequences of different lengths. | |||
| cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): | |||
| The classifier token which is used when doing sequence classification (classification of the whole sequence | |||
| instead of per-token classification). It is the first token of the sequence when built with special tokens. | |||
| mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): | |||
| The token used for masking values. This is the token used when training this model with masked language | |||
| modeling. This is the token which the model will try to predict. | |||
| tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not to tokenize Chinese characters. | |||
| This should likely be deactivated for Japanese (see this `issue | |||
| <https://github.com/huggingface/transformers/issues/328>`__). | |||
| strip_accents: (:obj:`bool`, `optional`): | |||
| Whether or not to strip all accents. If this option is not specified, then it will be determined by the | |||
| value for :obj:`lowercase` (as in the original BERT). | |||
| """ | |||
| vocab_files_names = VOCAB_FILES_NAMES | |||
| pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP | |||
| pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION | |||
| max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES | |||
| def __init__( | |||
| self, | |||
| vocab_file, | |||
| do_lower_case=True, | |||
| do_basic_tokenize=True, | |||
| never_split=None, | |||
| unk_token="[UNK]", | |||
| sep_token="[SEP]", | |||
| pad_token="[PAD]", | |||
| cls_token="[CLS]", | |||
| mask_token="[MASK]", | |||
| tokenize_chinese_chars=True, | |||
| strip_accents=None, | |||
| **kwargs | |||
| ): | |||
| super().__init__( | |||
| do_lower_case=do_lower_case, | |||
| do_basic_tokenize=do_basic_tokenize, | |||
| never_split=never_split, | |||
| unk_token=unk_token, | |||
| sep_token=sep_token, | |||
| pad_token=pad_token, | |||
| cls_token=cls_token, | |||
| mask_token=mask_token, | |||
| tokenize_chinese_chars=tokenize_chinese_chars, | |||
| strip_accents=strip_accents, | |||
| **kwargs, | |||
| ) | |||
| if not os.path.isfile(vocab_file): | |||
| raise ValueError( | |||
| f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained " | |||
| "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" | |||
| ) | |||
| self.vocab = load_vocab(vocab_file) | |||
| self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) | |||
| self.do_basic_tokenize = do_basic_tokenize | |||
| if do_basic_tokenize: | |||
| self.basic_tokenizer = BasicTokenizer( | |||
| do_lower_case=do_lower_case, | |||
| never_split=never_split, | |||
| tokenize_chinese_chars=tokenize_chinese_chars, | |||
| strip_accents=strip_accents, | |||
| ) | |||
| self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) | |||
| @property | |||
| def do_lower_case(self): | |||
| return self.basic_tokenizer.do_lower_case | |||
| @property | |||
| def vocab_size(self): | |||
| return len(self.vocab) | |||
| def get_vocab(self): | |||
| return dict(self.vocab, **self.added_tokens_encoder) | |||
| def _tokenize(self, text): | |||
| split_tokens = [] | |||
| if self.do_basic_tokenize: | |||
| for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): | |||
| # If the token is part of the never_split set | |||
| if token in self.basic_tokenizer.never_split: | |||
| split_tokens.append(token) | |||
| else: | |||
| split_tokens += self.wordpiece_tokenizer.tokenize(token) | |||
| else: | |||
| split_tokens = self.wordpiece_tokenizer.tokenize(text) | |||
| return split_tokens | |||
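| # An illustrative sketch (assuming "un", "##aff" and "##able" are in the vocab): with | |||
| # do_basic_tokenize=True, "HeLLo unaffable" is first lower-cased and split by the | |||
| # BasicTokenizer into ["hello", "unaffable"], and each piece is then run through the | |||
| # WordpieceTokenizer, e.g. "unaffable" -> ["un", "##aff", "##able"]. | |||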
| def _convert_token_to_id(self, token): | |||
| """Converts a token (str) in an id using the vocab.""" | |||
| return self.vocab.get(token, self.vocab.get(self.unk_token)) | |||
| def _convert_id_to_token(self, index): | |||
| """Converts an index (integer) in a token (str) using the vocab.""" | |||
| return self.ids_to_tokens.get(index, self.unk_token) | |||
| def convert_tokens_to_string(self, tokens): | |||
| """Converts a sequence of tokens (string) in a single string.""" | |||
| out_string = " ".join(tokens).replace(" ##", "").strip() | |||
| return out_string | |||
| def build_inputs_with_special_tokens( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None | |||
| ) -> List[int]: | |||
| """ | |||
| Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and | |||
| adding special tokens. A BERT sequence has the following format: | |||
| - single sequence: ``[CLS] X [SEP]`` | |||
| - pair of sequences: ``[CLS] A [SEP] B [SEP]`` | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs to which the special tokens will be added. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| Returns: | |||
| :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. | |||
| """ | |||
| if token_ids_1 is None: | |||
| return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] | |||
| cls = [self.cls_token_id] | |||
| sep = [self.sep_token_id] | |||
| return cls + token_ids_0 + sep + token_ids_1 + sep | |||
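| # For example, token_ids_0=[7, 8] and token_ids_1=[9] yield | |||
| # [cls_token_id, 7, 8, sep_token_id, 9, sep_token_id], i.e. "[CLS] A [SEP] B [SEP]". | |||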
| def get_special_tokens_mask( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False | |||
| ) -> List[int]: | |||
| """ | |||
| Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding | |||
| special tokens using the tokenizer ``prepare_for_model`` method. | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the token list is already formatted with special tokens for the model. | |||
| Returns: | |||
| :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. | |||
| """ | |||
| if already_has_special_tokens: | |||
| return super().get_special_tokens_mask( | |||
| token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True | |||
| ) | |||
| if token_ids_1 is not None: | |||
| return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] | |||
| return [1] + ([0] * len(token_ids_0)) + [1] | |||
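| # For example, token_ids_0=[7, 8] and token_ids_1=[9] give the mask | |||
| # [1, 0, 0, 1, 0, 1]: 1 for [CLS] and both [SEP]s, 0 for the sequence tokens. | |||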
| def create_token_type_ids_from_sequences( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None | |||
| ) -> List[int]: | |||
| """ | |||
| Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence | |||
| pair mask has the following format: | |||
| :: | |||
| 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | |||
| | first sequence | second sequence | | |||
| If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| Returns: | |||
| :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given | |||
| sequence(s). | |||
| """ | |||
| sep = [self.sep_token_id] | |||
| cls = [self.cls_token_id] | |||
| if token_ids_1 is None: | |||
| return len(cls + token_ids_0 + sep) * [0] | |||
| return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] | |||
| def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: | |||
| index = 0 | |||
| if os.path.isdir(save_directory): | |||
| vocab_file = os.path.join( | |||
| save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] | |||
| ) | |||
| else: | |||
| vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory | |||
| with open(vocab_file, "w", encoding="utf-8") as writer: | |||
| for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | |||
| if index != token_index: | |||
| logger.warning( | |||
| f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." | |||
| " Please check that the vocabulary is not corrupted!" | |||
| ) | |||
| index = token_index | |||
| writer.write(token + "\n") | |||
| index += 1 | |||
| return (vocab_file,) | |||
| class BasicTokenizer(object): | |||
| """ | |||
| Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). | |||
| Args: | |||
| do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not to lowercase the input when tokenizing. | |||
| never_split (:obj:`Iterable`, `optional`): | |||
| Collection of tokens which will never be split during tokenization. Only has an effect when | |||
| :obj:`do_basic_tokenize=True` | |||
| tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not to tokenize Chinese characters. | |||
| This should likely be deactivated for Japanese (see this `issue | |||
| <https://github.com/huggingface/transformers/issues/328>`__). | |||
| strip_accents: (:obj:`bool`, `optional`): | |||
| Whether or not to strip all accents. If this option is not specified, then it will be determined by the | |||
| value for :obj:`lowercase` (as in the original BERT). | |||
| """ | |||
| def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): | |||
| if never_split is None: | |||
| never_split = [] | |||
| self.do_lower_case = do_lower_case | |||
| self.never_split = set(never_split) | |||
| self.tokenize_chinese_chars = tokenize_chinese_chars | |||
| self.strip_accents = strip_accents | |||
| def tokenize(self, text, never_split=None): | |||
| """ | |||
| Basic Tokenization of a piece of text. Split on "white spaces" only; for sub-word tokenization, see | |||
| WordpieceTokenizer. | |||
| Args: | |||
| **never_split**: (`optional`) list of str | |||
| Kept for backward compatibility purposes. Now implemented directly at the base class level (see | |||
| :func:`PreTrainedTokenizer.tokenize`). List of tokens not to split. | |||
| """ | |||
| # union() returns a new set by concatenating the two sets. | |||
| never_split = self.never_split.union(set(never_split)) if never_split else self.never_split | |||
| text = self._clean_text(text) | |||
| # This was added on November 1st, 2018 for the multilingual and Chinese | |||
| # models. This is also applied to the English models now, but it doesn't | |||
| # matter since the English models were not trained on any Chinese data | |||
| # and generally don't have any Chinese data in them (there are Chinese | |||
| # characters in the vocabulary because Wikipedia does have some Chinese | |||
| # words in the English Wikipedia.). | |||
| if self.tokenize_chinese_chars: | |||
| text = self._tokenize_chinese_chars(text) | |||
| orig_tokens = whitespace_tokenize(text) | |||
| split_tokens = [] | |||
| for token in orig_tokens: | |||
| if token not in never_split: | |||
| if self.do_lower_case: | |||
| token = token.lower() | |||
| if self.strip_accents is not False: | |||
| token = self._run_strip_accents(token) | |||
| elif self.strip_accents: | |||
| token = self._run_strip_accents(token) | |||
| split_tokens.extend(self._run_split_on_punc(token, never_split)) | |||
| output_tokens = whitespace_tokenize(" ".join(split_tokens)) | |||
| return output_tokens | |||
| def _run_strip_accents(self, text): | |||
| """Strips accents from a piece of text.""" | |||
| text = unicodedata.normalize("NFD", text) | |||
| output = [] | |||
| for char in text: | |||
| cat = unicodedata.category(char) | |||
| if cat == "Mn": | |||
| continue | |||
| output.append(char) | |||
| return "".join(output) | |||
| def _run_split_on_punc(self, text, never_split=None): | |||
| """Splits punctuation on a piece of text.""" | |||
| if never_split is not None and text in never_split: | |||
| return [text] | |||
| chars = list(text) | |||
| i = 0 | |||
| start_new_word = True | |||
| output = [] | |||
| while i < len(chars): | |||
| char = chars[i] | |||
| if _is_punctuation(char): | |||
| output.append([char]) | |||
| start_new_word = True | |||
| else: | |||
| if start_new_word: | |||
| output.append([]) | |||
| start_new_word = False | |||
| output[-1].append(char) | |||
| i += 1 | |||
| return ["".join(x) for x in output] | |||
| def _tokenize_chinese_chars(self, text): | |||
| """Adds whitespace around any CJK character.""" | |||
| output = [] | |||
| for char in text: | |||
| cp = ord(char) | |||
| if self._is_chinese_char(cp): | |||
| output.append(" ") | |||
| output.append(char) | |||
| output.append(" ") | |||
| else: | |||
| output.append(char) | |||
| return "".join(output) | |||
| def _is_chinese_char(self, cp): | |||
| """Checks whether CP is the codepoint of a CJK character.""" | |||
| # This defines a "chinese character" as anything in the CJK Unicode block: | |||
| # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) | |||
| # | |||
| # Note that the CJK Unicode block is NOT all Japanese and Korean characters, | |||
| # despite its name. The modern Korean Hangul alphabet is a different block, | |||
| # as is Japanese Hiragana and Katakana. Those alphabets are used to write | |||
| # space-separated words, so they are not treated specially and handled | |||
| # like all of the other languages. | |||
| if ( | |||
| (cp >= 0x4E00 and cp <= 0x9FFF) | |||
| or (cp >= 0x3400 and cp <= 0x4DBF) # | |||
| or (cp >= 0x20000 and cp <= 0x2A6DF) # | |||
| or (cp >= 0x2A700 and cp <= 0x2B73F) # | |||
| or (cp >= 0x2B740 and cp <= 0x2B81F) # | |||
| or (cp >= 0x2B820 and cp <= 0x2CEAF) # | |||
| or (cp >= 0xF900 and cp <= 0xFAFF) | |||
| or (cp >= 0x2F800 and cp <= 0x2FA1F) # | |||
| ): # | |||
| return True | |||
| return False | |||
| def _clean_text(self, text): | |||
| """Performs invalid character removal and whitespace cleanup on text.""" | |||
| output = [] | |||
| for char in text: | |||
| cp = ord(char) | |||
| if cp == 0 or cp == 0xFFFD or _is_control(char): | |||
| continue | |||
| if _is_whitespace(char): | |||
| output.append(" ") | |||
| else: | |||
| output.append(char) | |||
| return "".join(output) | |||
| class WordpieceTokenizer(object): | |||
| """Runs WordPiece tokenization.""" | |||
| def __init__(self, vocab, unk_token, max_input_chars_per_word=100): | |||
| self.vocab = vocab | |||
| self.unk_token = unk_token | |||
| self.max_input_chars_per_word = max_input_chars_per_word | |||
| def tokenize(self, text): | |||
| """ | |||
| Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform | |||
| tokenization using the given vocabulary. | |||
| For example, :obj:`input = "unaffable"` will return as output :obj:`["un", "##aff", "##able"]`. | |||
| Args: | |||
| text: A single token or whitespace separated tokens. This should have | |||
| already been passed through `BasicTokenizer`. | |||
| Returns: | |||
| A list of wordpiece tokens. | |||
| """ | |||
| output_tokens = [] | |||
| for token in whitespace_tokenize(text): | |||
| chars = list(token) | |||
| if len(chars) > self.max_input_chars_per_word: | |||
| output_tokens.append(self.unk_token) | |||
| continue | |||
| is_bad = False | |||
| start = 0 | |||
| sub_tokens = [] | |||
| while start < len(chars): | |||
| end = len(chars) | |||
| cur_substr = None | |||
| while start < end: | |||
| substr = "".join(chars[start:end]) | |||
| if start > 0: | |||
| substr = "##" + substr | |||
| if substr in self.vocab: | |||
| cur_substr = substr | |||
| break | |||
| end -= 1 | |||
| if cur_substr is None: | |||
| is_bad = True | |||
| break | |||
| sub_tokens.append(cur_substr) | |||
| start = end | |||
| if is_bad: | |||
| output_tokens.append(self.unk_token) | |||
| else: | |||
| output_tokens.extend(sub_tokens) | |||
| return output_tokens | |||
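| # A worked sketch of the greedy longest-match-first loop above, assuming a toy vocab | |||
| # {"un", "##aff", "##able"}: for "unaffable" the first pass shrinks "unaffable", | |||
| # "unaffabl", ... until "un" matches; later passes prefix "##" and match "##aff" and | |||
| # then "##able", yielding ["un", "##aff", "##able"]. If at some position no prefix is | |||
| # in the vocab, the whole token is emitted as the unk_token instead. | |||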
| @@ -0,0 +1,12 @@ | |||
| __all__ = [ | |||
| "CPT_PRETRAINED_MODEL_ARCHIVE_LIST", | |||
| "CPTForConditionalGeneration", | |||
| "CPTForSequenceClassification", | |||
| "CPTForMaskedLM", | |||
| "CPTForQuestionAnswering", | |||
| "CPTModel", | |||
| "CPTPretrainedModel", | |||
| ] | |||
| from .modeling_cpt import CPT_PRETRAINED_MODEL_ARCHIVE_LIST, CPTForConditionalGeneration, CPTForSequenceClassification, \ | |||
| CPTForMaskedLM, CPTForQuestionAnswering, CPTModel, CPTPretrainedModel | |||
| @@ -0,0 +1,19 @@ | |||
| __all__ = [ | |||
| "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "GPT2Config", | |||
| "GPT2_PRETRAINED_MODEL_ARCHIVE_LIST", | |||
| "GPT2DoubleHeadsModel", | |||
| "GPT2ForSequenceClassification", | |||
| "GPT2ForTokenClassification", | |||
| "GPT2LMHeadModel", | |||
| "GPT2Model", | |||
| "GPT2PreTrainedModel", | |||
| "GPT2Tokenizer", | |||
| ] | |||
| from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config | |||
| from .tokenization_gpt2 import GPT2Tokenizer | |||
| from .modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_LIST, GPT2DoubleHeadsModel, GPT2ForSequenceClassification, \ | |||
| GPT2ForTokenClassification, GPT2LMHeadModel, GPT2Model, GPT2PreTrainedModel | |||
| @@ -0,0 +1,184 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ OpenAI GPT-2 configuration """ | |||
| from fastNLP.transformers.torch.configuration_utils import PretrainedConfig | |||
| __all__ = [ | |||
| "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "GPT2Config", | |||
| ] | |||
| GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |||
| "gpt2": "https://huggingface.co/gpt2/resolve/main/config.json", | |||
| "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/config.json", | |||
| "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/config.json", | |||
| "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/config.json", | |||
| "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/config.json", | |||
| } | |||
| class GPT2Config(PretrainedConfig): | |||
| """ | |||
| This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a | |||
| :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments, | |||
| defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration | |||
| to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture. | |||
| Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model | |||
| outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. | |||
| Args: | |||
| vocab_size (:obj:`int`, `optional`, defaults to 50257): | |||
| Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the | |||
| :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or | |||
| :class:`~transformers.TFGPT2Model`. | |||
| n_positions (:obj:`int`, `optional`, defaults to 1024): | |||
| The maximum sequence length that this model might ever be used with. Typically set this to something large | |||
| just in case (e.g., 512 or 1024 or 2048). | |||
| n_ctx (:obj:`int`, `optional`, defaults to 1024): | |||
| Dimensionality of the causal mask (usually same as n_positions). | |||
| n_embd (:obj:`int`, `optional`, defaults to 768): | |||
| Dimensionality of the embeddings and hidden states. | |||
| n_layer (:obj:`int`, `optional`, defaults to 12): | |||
| Number of hidden layers in the Transformer encoder. | |||
| n_head (:obj:`int`, `optional`, defaults to 12): | |||
| Number of attention heads for each attention layer in the Transformer encoder. | |||
| n_inner (:obj:`int`, `optional`, defaults to None): | |||
| Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd. | |||
| activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`): | |||
| Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`. | |||
| resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): | |||
| The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. | |||
| embd_pdrop (:obj:`int`, `optional`, defaults to 0.1): | |||
| The dropout ratio for the embeddings. | |||
| attn_pdrop (:obj:`float`, `optional`, defaults to 0.1): | |||
| The dropout ratio for the attention. | |||
| layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5): | |||
| The epsilon to use in the layer normalization layers. | |||
| initializer_range (:obj:`float`, `optional`, defaults to 0.02): | |||
| The standard deviation of the truncated_normal_initializer for initializing all weight matrices. | |||
| summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`): | |||
| Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` | |||
| and :class:`~transformers.TFGPT2DoubleHeadsModel`. | |||
| Has to be one of the following options: | |||
| - :obj:`"last"`: Take the last token hidden state (like XLNet). | |||
| - :obj:`"first"`: Take the first token hidden state (like BERT). | |||
| - :obj:`"mean"`: Take the mean of all tokens hidden states. | |||
| - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2). | |||
| - :obj:`"attn"`: Not implemented now, use multi-head attention. | |||
| summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` | |||
| and :class:`~transformers.TFGPT2DoubleHeadsModel`. | |||
| Whether or not to add a projection after the vector extraction. | |||
| summary_activation (:obj:`str`, `optional`): | |||
| Argument used when doing sequence summary. Used for the multiple choice head in | |||
| :class:`~transformers.GPT2DoubleHeadsModel`. | |||
| Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation. | |||
| summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` | |||
| and :class:`~transformers.TFGPT2DoubleHeadsModel`. | |||
| Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes. | |||
| summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1): | |||
| Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel` | |||
| and :class:`~transformers.TFGPT2DoubleHeadsModel`. | |||
| The dropout ratio to be used after the projection and activation. | |||
| scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Scale attention weights by dividing by sqrt(hidden_size). | |||
| use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`): | |||
| Whether or not the model should return the last key/values attentions (not used by all models). | |||
| Example:: | |||
| >>> from transformers import GPT2Model, GPT2Config | |||
| >>> # Initializing a GPT2 configuration | |||
| >>> configuration = GPT2Config() | |||
| >>> # Initializing a model from the configuration | |||
| >>> model = GPT2Model(configuration) | |||
| >>> # Accessing the model configuration | |||
| >>> configuration = model.config | |||
| """ | |||
| model_type = "gpt2" | |||
| keys_to_ignore_at_inference = ["past_key_values"] | |||
| attribute_map = { | |||
| "hidden_size": "n_embd", | |||
| "max_position_embeddings": "n_positions", | |||
| "num_attention_heads": "n_head", | |||
| "num_hidden_layers": "n_layer", | |||
| } | |||
| def __init__( | |||
| self, | |||
| vocab_size=50257, | |||
| n_positions=1024, | |||
| n_ctx=1024, | |||
| n_embd=768, | |||
| n_layer=12, | |||
| n_head=12, | |||
| n_inner=None, | |||
| activation_function="gelu_new", | |||
| resid_pdrop=0.1, | |||
| embd_pdrop=0.1, | |||
| attn_pdrop=0.1, | |||
| layer_norm_epsilon=1e-5, | |||
| initializer_range=0.02, | |||
| summary_type="cls_index", | |||
| summary_use_proj=True, | |||
| summary_activation=None, | |||
| summary_proj_to_labels=True, | |||
| summary_first_dropout=0.1, | |||
| scale_attn_weights=True, | |||
| use_cache=True, | |||
| bos_token_id=50256, | |||
| eos_token_id=50256, | |||
| **kwargs | |||
| ): | |||
| self.vocab_size = vocab_size | |||
| self.n_ctx = n_ctx | |||
| self.n_positions = n_positions | |||
| self.n_embd = n_embd | |||
| self.n_layer = n_layer | |||
| self.n_head = n_head | |||
| self.n_inner = n_inner | |||
| self.activation_function = activation_function | |||
| self.resid_pdrop = resid_pdrop | |||
| self.embd_pdrop = embd_pdrop | |||
| self.attn_pdrop = attn_pdrop | |||
| self.layer_norm_epsilon = layer_norm_epsilon | |||
| self.initializer_range = initializer_range | |||
| self.summary_type = summary_type | |||
| self.summary_use_proj = summary_use_proj | |||
| self.summary_activation = summary_activation | |||
| self.summary_first_dropout = summary_first_dropout | |||
| self.summary_proj_to_labels = summary_proj_to_labels | |||
| self.scale_attn_weights = scale_attn_weights | |||
| self.use_cache = use_cache | |||
| self.bos_token_id = bos_token_id | |||
| self.eos_token_id = eos_token_id | |||
| super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) | |||
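| # A small usage sketch (assuming the vendored PretrainedConfig implements the same | |||
| # attribute_map aliasing as upstream transformers): reading a canonical name such as | |||
| # config.hidden_size transparently resolves to the GPT-2-specific attribute n_embd. | |||
| # | |||
| #     config = GPT2Config(n_embd=768, n_layer=12) | |||
| #     assert config.hidden_size == 768 and config.num_hidden_layers == 12 | |||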
| @@ -0,0 +1,308 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Tokenization classes for OpenAI GPT.""" | |||
| import json | |||
| import os | |||
| from functools import lru_cache | |||
| from typing import TYPE_CHECKING, List, Optional, Tuple | |||
| import regex as re | |||
| from fastNLP.transformers.torch.tokenization_utils import AddedToken, PreTrainedTokenizer | |||
| # if TYPE_CHECKING: | |||
| # from transformers.pipelines.conversational import Conversation | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "GPT2Tokenizer", | |||
| ] | |||
| VOCAB_FILES_NAMES = { | |||
| "vocab_file": "vocab.json", | |||
| "merges_file": "merges.txt", | |||
| } | |||
| PRETRAINED_VOCAB_FILES_MAP = { | |||
| "vocab_file": { | |||
| "gpt2": "https://huggingface.co/gpt2/resolve/main/vocab.json", | |||
| "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/vocab.json", | |||
| "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/vocab.json", | |||
| "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/vocab.json", | |||
| "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/vocab.json", | |||
| }, | |||
| "merges_file": { | |||
| "gpt2": "https://huggingface.co/gpt2/resolve/main/merges.txt", | |||
| "gpt2-medium": "https://huggingface.co/gpt2-medium/resolve/main/merges.txt", | |||
| "gpt2-large": "https://huggingface.co/gpt2-large/resolve/main/merges.txt", | |||
| "gpt2-xl": "https://huggingface.co/gpt2-xl/resolve/main/merges.txt", | |||
| "distilgpt2": "https://huggingface.co/distilgpt2/resolve/main/merges.txt", | |||
| }, | |||
| } | |||
| PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| "gpt2": 1024, | |||
| "gpt2-medium": 1024, | |||
| "gpt2-large": 1024, | |||
| "gpt2-xl": 1024, | |||
| "distilgpt2": 1024, | |||
| } | |||
| @lru_cache() | |||
| def bytes_to_unicode(): | |||
| """ | |||
| Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control | |||
| characters that the bpe code barfs on. | |||
| The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab | |||
| if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for | |||
| decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup | |||
| tables between utf-8 bytes and unicode strings. | |||
| """ | |||
| bs = ( | |||
| list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) | |||
| ) | |||
| cs = bs[:] | |||
| n = 0 | |||
| for b in range(2 ** 8): | |||
| if b not in bs: | |||
| bs.append(b) | |||
| cs.append(2 ** 8 + n) | |||
| n += 1 | |||
| cs = [chr(n) for n in cs] | |||
| return dict(zip(bs, cs)) | |||
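| # For example, printable ASCII maps to itself (ord("!") == 33 stays "!"), while bytes | |||
| # outside the three kept ranges are shifted past 0xFF: the space byte 0x20 becomes | |||
| # chr(256 + 32) == "Ġ", which is why GPT-2 vocabulary entries for space-prefixed | |||
| # words start with "Ġ". | |||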
| def get_pairs(word): | |||
| """ | |||
| Return set of symbol pairs in a word. | |||
| Word is represented as tuple of symbols (symbols being variable-length strings). | |||
| """ | |||
| pairs = set() | |||
| prev_char = word[0] | |||
| for char in word[1:]: | |||
| pairs.add((prev_char, char)) | |||
| prev_char = char | |||
| return pairs | |||
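| # For example, get_pairs(("h", "u", "g")) == {("h", "u"), ("u", "g")}. | |||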
| class GPT2Tokenizer(PreTrainedTokenizer): | |||
| """ | |||
| Construct a GPT-2 tokenizer. Based on byte-level Byte-Pair-Encoding. | |||
| This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will | |||
| be encoded differently whether it is at the beginning of the sentence (without space) or not: | |||
| :: | |||
| >>> from transformers import GPT2Tokenizer | |||
| >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2") | |||
| >>> tokenizer("Hello world")['input_ids'] | |||
| [15496, 995] | |||
| >>> tokenizer(" Hello world")['input_ids'] | |||
| [18435, 995] | |||
| You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you | |||
| call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. | |||
| .. note:: | |||
| When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first | |||
| one). | |||
| This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. | |||
| Users should refer to this superclass for more information regarding those methods. | |||
| Args: | |||
| vocab_file (:obj:`str`): | |||
| Path to the vocabulary file. | |||
| merges_file (:obj:`str`): | |||
| Path to the merges file. | |||
| errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): | |||
| Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode | |||
| <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information. | |||
| unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): | |||
| The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this | |||
| token instead. | |||
| bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): | |||
| The beginning of sequence token. | |||
| eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`): | |||
| The end of sequence token. | |||
| add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to add an initial space to the input. This allows treating the leading word just as any | |||
| other word (the GPT-2 tokenizer detects the beginning of words by the preceding space). | |||
| """ | |||
| vocab_files_names = VOCAB_FILES_NAMES | |||
| pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP | |||
| max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES | |||
| model_input_names = ["input_ids", "attention_mask"] | |||
| def __init__( | |||
| self, | |||
| vocab_file, | |||
| merges_file, | |||
| errors="replace", | |||
| unk_token="<|endoftext|>", | |||
| bos_token="<|endoftext|>", | |||
| eos_token="<|endoftext|>", | |||
| add_prefix_space=False, | |||
| **kwargs | |||
| ): | |||
| bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token | |||
| eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token | |||
| unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token | |||
| super().__init__( | |||
| errors=errors, | |||
| unk_token=unk_token, | |||
| bos_token=bos_token, | |||
| eos_token=eos_token, | |||
| add_prefix_space=add_prefix_space, | |||
| **kwargs, | |||
| ) | |||
| with open(vocab_file, encoding="utf-8") as vocab_handle: | |||
| self.encoder = json.load(vocab_handle) | |||
| self.decoder = {v: k for k, v in self.encoder.items()} | |||
| self.errors = errors # how to handle errors in decoding | |||
| self.byte_encoder = bytes_to_unicode() | |||
| self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} | |||
| with open(merges_file, encoding="utf-8") as merges_handle: | |||
| bpe_merges = merges_handle.read().split("\n")[1:-1] | |||
| bpe_merges = [tuple(merge.split()) for merge in bpe_merges] | |||
| self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) | |||
| self.cache = {} | |||
| self.add_prefix_space = add_prefix_space | |||
| # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions | |||
| self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") | |||
| @property | |||
| def vocab_size(self): | |||
| return len(self.encoder) | |||
| def get_vocab(self): | |||
| return dict(self.encoder, **self.added_tokens_encoder) | |||
| def bpe(self, token): | |||
| if token in self.cache: | |||
| return self.cache[token] | |||
| word = tuple(token) | |||
| pairs = get_pairs(word) | |||
| if not pairs: | |||
| return token | |||
| while True: | |||
| bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) | |||
| if bigram not in self.bpe_ranks: | |||
| break | |||
| first, second = bigram | |||
| new_word = [] | |||
| i = 0 | |||
| while i < len(word): | |||
| try: | |||
| j = word.index(first, i) | |||
| except ValueError: | |||
| new_word.extend(word[i:]) | |||
| break | |||
| else: | |||
| new_word.extend(word[i:j]) | |||
| i = j | |||
| if word[i] == first and i < len(word) - 1 and word[i + 1] == second: | |||
| new_word.append(first + second) | |||
| i += 2 | |||
| else: | |||
| new_word.append(word[i]) | |||
| i += 1 | |||
| new_word = tuple(new_word) | |||
| word = new_word | |||
| if len(word) == 1: | |||
| break | |||
| else: | |||
| pairs = get_pairs(word) | |||
| word = " ".join(word) | |||
| self.cache[token] = word | |||
| return word | |||
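| # A worked sketch with a hypothetical merge table (not the real merges.txt): with | |||
| # self.bpe_ranks == {("h", "u"): 0, ("hu", "g"): 1}, bpe("hug") first merges the | |||
| # lowest-ranked pair ("h", "u") into ("hu", "g"), then ("hu", "g") into ("hug",), | |||
| # stops once a single symbol remains, and caches and returns "hug". | |||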
| def _tokenize(self, text): | |||
| """Tokenize a string.""" | |||
| bpe_tokens = [] | |||
| for token in re.findall(self.pat, text): | |||
| token = "".join( | |||
| self.byte_encoder[b] for b in token.encode("utf-8") | |||
| ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) | |||
| bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) | |||
| return bpe_tokens | |||
| def _convert_token_to_id(self, token): | |||
| """Converts a token (str) in an id using the vocab.""" | |||
| return self.encoder.get(token, self.encoder.get(self.unk_token)) | |||
| def _convert_id_to_token(self, index): | |||
| """Converts an index (integer) in a token (str) using the vocab.""" | |||
| return self.decoder.get(index) | |||
| def convert_tokens_to_string(self, tokens): | |||
| """Converts a sequence of tokens (string) in a single string.""" | |||
| text = "".join(tokens) | |||
| text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) | |||
| return text | |||
| def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: | |||
| if not os.path.isdir(save_directory): | |||
| logger.error(f"Vocabulary path ({save_directory}) should be a directory") | |||
| return | |||
| vocab_file = os.path.join( | |||
| save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] | |||
| ) | |||
| merge_file = os.path.join( | |||
| save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] | |||
| ) | |||
| with open(vocab_file, "w", encoding="utf-8") as f: | |||
| f.write(json.dumps(self.encoder, ensure_ascii=False)) | |||
| index = 0 | |||
| with open(merge_file, "w", encoding="utf-8") as writer: | |||
| writer.write("#version: 0.2\n") | |||
| for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): | |||
| if index != token_index: | |||
| logger.warning( | |||
| f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." | |||
| " Please check that the tokenizer is not corrupted!" | |||
| ) | |||
| index = token_index | |||
| writer.write(" ".join(bpe_tokens) + "\n") | |||
| index += 1 | |||
| return vocab_file, merge_file | |||
| def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): | |||
| add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) | |||
| if is_split_into_words or add_prefix_space: | |||
| text = " " + text | |||
| return (text, kwargs) | |||
| # def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]: | |||
| # input_ids = [] | |||
| # for is_user, text in conversation.iter_texts(): | |||
| # input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id]) | |||
| # if len(input_ids) > self.model_max_length: | |||
| # input_ids = input_ids[-self.model_max_length :] | |||
| # return input_ids | |||
| @@ -0,0 +1,21 @@ | |||
| __all__ = [ | |||
| "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "RobertaConfig", | |||
| "ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST", | |||
| "RobertaForCausalLM", | |||
| "RobertaForMaskedLM", | |||
| "RobertaForMultipleChoice", | |||
| "RobertaForQuestionAnswering", | |||
| "RobertaForSequenceClassification", | |||
| "RobertaForTokenClassification", | |||
| "RobertaModel", | |||
| "RobertaPreTrainedModel", | |||
| "RobertaTokenizer", | |||
| ] | |||
| from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig | |||
| from .tokenization_roberta import RobertaTokenizer | |||
| from .modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST, RobertaForCausalLM, RobertaForMaskedLM, RobertaForMultipleChoice, \ | |||
| RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaForTokenClassification, RobertaModel, RobertaPreTrainedModel | |||
| @@ -0,0 +1,65 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. | |||
| # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ RoBERTa configuration """ | |||
| from ..bert.configuration_bert import BertConfig | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", | |||
| "RobertaConfig", | |||
| ] | |||
| ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { | |||
| "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json", | |||
| "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json", | |||
| "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json", | |||
| "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json", | |||
| "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json", | |||
| "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json", | |||
| } | |||
| class RobertaConfig(BertConfig): | |||
| r""" | |||
| This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a | |||
| :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified | |||
| arguments, defining the model architecture. | |||
| Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model | |||
| outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information. | |||
| The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the | |||
| same defaults. Please check the parent class for more information. | |||
| Examples:: | |||
| >>> from transformers import RobertaConfig, RobertaModel | |||
| >>> # Initializing a RoBERTa configuration | |||
| >>> configuration = RobertaConfig() | |||
| >>> # Initializing a model from the configuration | |||
| >>> model = RobertaModel(configuration) | |||
| >>> # Accessing the model configuration | |||
| >>> configuration = model.config | |||
| """ | |||
| model_type = "roberta" | |||
| def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs): | |||
| """Constructs RobertaConfig.""" | |||
| super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) | |||
| @@ -0,0 +1,254 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """Tokenization classes for RoBERTa.""" | |||
| from typing import List, Optional | |||
| from fastNLP.transformers.torch.tokenization_utils import AddedToken | |||
| from ..gpt2.tokenization_gpt2 import GPT2Tokenizer | |||
| from fastNLP.core.log import logger | |||
| __all__ = [ | |||
| "RobertaTokenizer", | |||
| ] | |||
| VOCAB_FILES_NAMES = { | |||
| "vocab_file": "vocab.json", | |||
| "merges_file": "merges.txt", | |||
| } | |||
| PRETRAINED_VOCAB_FILES_MAP = { | |||
| "vocab_file": { | |||
| "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json", | |||
| "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json", | |||
| "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json", | |||
| "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json", | |||
| "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/vocab.json", | |||
| "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/vocab.json", | |||
| }, | |||
| "merges_file": { | |||
| "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt", | |||
| "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt", | |||
| "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt", | |||
| "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt", | |||
| "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/merges.txt", | |||
| "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/merges.txt", | |||
| }, | |||
| } | |||
| PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { | |||
| "roberta-base": 512, | |||
| "roberta-large": 512, | |||
| "roberta-large-mnli": 512, | |||
| "distilroberta-base": 512, | |||
| "roberta-base-openai-detector": 512, | |||
| "roberta-large-openai-detector": 512, | |||
| } | |||
| class RobertaTokenizer(GPT2Tokenizer): | |||
| """ | |||
| Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. | |||
| This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will | |||
| be encoded differently whether it is at the beginning of the sentence (without space) or not: | |||
| :: | |||
| >>> from transformers import RobertaTokenizer | |||
| >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base") | |||
| >>> tokenizer("Hello world")['input_ids'] | |||
| [0, 31414, 232, 2] | |||
| >>> tokenizer(" Hello world")['input_ids'] | |||
| [0, 20920, 232, 2] | |||
| You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you | |||
| call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. | |||
| .. note:: | |||
| When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first | |||
| one). | |||
| This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. | |||
| Users should refer to this superclass for more information regarding those methods. | |||
| Args: | |||
| vocab_file (:obj:`str`): | |||
| Path to the vocabulary file. | |||
| merges_file (:obj:`str`): | |||
| Path to the merges file. | |||
| errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`): | |||
| Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode | |||
| <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information. | |||
| bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): | |||
| The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. | |||
| .. note:: | |||
| When building a sequence using special tokens, this is not the token that is used for the beginning of | |||
| sequence. The token used is the :obj:`cls_token`. | |||
| eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): | |||
| The end of sequence token. | |||
| .. note:: | |||
| When building a sequence using special tokens, this is not the token that is used for the end of | |||
| sequence. The token used is the :obj:`sep_token`. | |||
| sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`): | |||
| The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for | |||
| sequence classification or for a text and a question for question answering. It is also used as the last | |||
| token of a sequence built with special tokens. | |||
| cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`): | |||
| The classifier token which is used when doing sequence classification (classification of the whole sequence | |||
| instead of per-token classification). It is the first token of the sequence when built with special tokens. | |||
| unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`): | |||
| The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this | |||
| token instead. | |||
| pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): | |||
| The token used for padding, for example when batching sequences of different lengths. | |||
| mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`): | |||
| The token used for masking values. This is the token used when training this model with masked language | |||
| modeling. This is the token which the model will try to predict. | |||
| add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to add an initial space to the input. This allows treating the leading word just as any | |||
| other word (the RoBERTa tokenizer detects the beginning of words by the preceding space). | |||
| """ | |||
| vocab_files_names = VOCAB_FILES_NAMES | |||
| pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP | |||
| max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES | |||
| model_input_names = ["input_ids", "attention_mask"] | |||
| def __init__( | |||
| self, | |||
| vocab_file, | |||
| merges_file, | |||
| errors="replace", | |||
| bos_token="<s>", | |||
| eos_token="</s>", | |||
| sep_token="</s>", | |||
| cls_token="<s>", | |||
| unk_token="<unk>", | |||
| pad_token="<pad>", | |||
| mask_token="<mask>", | |||
| add_prefix_space=False, | |||
| **kwargs | |||
| ): | |||
| bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token | |||
| eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token | |||
| sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token | |||
| cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token | |||
| unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token | |||
| pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token | |||
| # Mask token behave like a normal word, i.e. include the space before it | |||
| mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token | |||
| super().__init__( | |||
| vocab_file=vocab_file, | |||
| merges_file=merges_file, | |||
| errors=errors, | |||
| bos_token=bos_token, | |||
| eos_token=eos_token, | |||
| unk_token=unk_token, | |||
| sep_token=sep_token, | |||
| cls_token=cls_token, | |||
| pad_token=pad_token, | |||
| mask_token=mask_token, | |||
| add_prefix_space=add_prefix_space, | |||
| **kwargs, | |||
| ) | |||
| def build_inputs_with_special_tokens( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None | |||
| ) -> List[int]: | |||
| """ | |||
| Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and | |||
| adding special tokens. A RoBERTa sequence has the following format: | |||
| - single sequence: ``<s> X </s>`` | |||
| - pair of sequences: ``<s> A </s></s> B </s>`` | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs to which the special tokens will be added. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| Returns: | |||
| :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. | |||
| """ | |||
| if token_ids_1 is None: | |||
| return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] | |||
| cls = [self.cls_token_id] | |||
| sep = [self.sep_token_id] | |||
| return cls + token_ids_0 + sep + sep + token_ids_1 + sep | |||
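| # For example, token_ids_0=[7, 8] and token_ids_1=[9] yield | |||
| # [cls, 7, 8, sep, sep, 9, sep], i.e. "<s> A </s></s> B </s>" with RoBERTa's doubled | |||
| # separator between the two sequences. | |||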
| def get_special_tokens_mask( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False | |||
| ) -> List[int]: | |||
| """ | |||
| Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding | |||
| special tokens using the tokenizer ``prepare_for_model`` method. | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the token list is already formatted with special tokens for the model. | |||
| Returns: | |||
| :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. | |||
| """ | |||
| if already_has_special_tokens: | |||
| return super().get_special_tokens_mask( | |||
| token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True | |||
| ) | |||
| if token_ids_1 is None: | |||
| return [1] + ([0] * len(token_ids_0)) + [1] | |||
| return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] | |||
| def create_token_type_ids_from_sequences( | |||
| self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None | |||
| ) -> List[int]: | |||
| """ | |||
| Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not | |||
| make use of token type ids, therefore a list of zeros is returned. | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of IDs. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| Optional second list of IDs for sequence pairs. | |||
| Returns: | |||
| :obj:`List[int]`: List of zeros. | |||
| """ | |||
| sep = [self.sep_token_id] | |||
| cls = [self.cls_token_id] | |||
| if token_ids_1 is None: | |||
| return len(cls + token_ids_0 + sep) * [0] | |||
| return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] | |||
| def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): | |||
| add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) | |||
| if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): | |||
| text = " " + text | |||
| return (text, kwargs) | |||
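| # A minimal sketch (not part of the original files): the RoBERTa layout | |||
| # above can be checked with plain list arithmetic, using placeholder ids | |||
| # 0 for <s> and 2 for </s> (the real ids depend on the loaded vocabulary). | |||
| cls, sep = [0], [2] | |||
| a, b = [100, 101], [200] | |||
| assert cls + a + sep == [0, 100, 101, 2]  # <s> A </s> | |||
| assert cls + a + sep + sep + b + sep == [0, 100, 101, 2, 2, 200, 2]  # <s> A </s></s> B </s> | |||
| # get_special_tokens_mask flags exactly the positions inserted above: | |||
| assert [1] + [0] * len(a) + [1, 1] + [0] * len(b) + [1] == [1, 0, 0, 1, 1, 0, 1] | |||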
| @@ -0,0 +1,915 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2020 The HuggingFace Inc. team. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ | |||
| Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see | |||
| tokenization_utils_fast.py | |||
| """ | |||
| import bisect | |||
| import itertools | |||
| import re | |||
| import unicodedata | |||
| from collections import OrderedDict | |||
| from typing import Any, Dict, List, Optional, Tuple, Union, overload | |||
| from .file_utils import PaddingStrategy, TensorType, add_end_docstrings | |||
| from .tokenization_utils_base import ( | |||
| ENCODE_KWARGS_DOCSTRING, | |||
| ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, | |||
| INIT_TOKENIZER_DOCSTRING, | |||
| AddedToken, | |||
| BatchEncoding, | |||
| EncodedInput, | |||
| EncodedInputPair, | |||
| PreTokenizedInput, | |||
| PreTokenizedInputPair, | |||
| PreTrainedTokenizerBase, | |||
| TextInput, | |||
| TextInputPair, | |||
| TruncationStrategy, | |||
| ) | |||
| from fastNLP.core.log import logger | |||
| # Slow tokenizers are saved in a vocabulary file plus three separate files | |||
| SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" | |||
| ADDED_TOKENS_FILE = "added_tokens.json" | |||
| TOKENIZER_CONFIG_FILE = "tokenizer_config.json" | |||
| class Trie: | |||
| """ | |||
| Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass | |||
| Loose reference https://en.wikipedia.org/wiki/Trie | |||
| """ | |||
| def __init__(self): | |||
| self.data = {} | |||
| def add(self, word: str): | |||
| """ | |||
| Passes over every char (utf-8 char) in word and recursively adds it to the internal `data` trie representation. | |||
| The special key `""` is used to represent termination. | |||
| This function is idempotent: adding the same word twice leaves the trie unchanged. | |||
| Example:: | |||
| >>> trie = Trie() | |||
| >>> trie.add("Hello 友達") | |||
| >>> trie.data | |||
| {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} | |||
| >>> trie.add("Hello") | |||
| >>> trie.data | |||
| {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} | |||
| """ | |||
| if not word: | |||
| # Prevent empty string | |||
| return | |||
| ref = self.data | |||
| for char in word: | |||
| ref = ref.setdefault(char, {}) | |||
| ref[""] = 1 | |||
| def split(self, text: str) -> List[str]: | |||
| """ | |||
| Looks for the words added to the trie within `text`. The output is the original string split along the | |||
| boundaries of the words found. | |||
| This trie will match the longest possible word first! | |||
| Example:: | |||
| >>> trie = Trie() | |||
| >>> trie.split("[CLS] This is a extra_id_100") | |||
| ["[CLS] This is a extra_id_100"] | |||
| >>> trie.add("[CLS]") | |||
| >>> trie.add("extra_id_1") | |||
| >>> trie.add("extra_id_100") | |||
| >>> trie.split("[CLS] This is a extra_id_100") | |||
| ["[CLS]", " This is a ", "extra_id_100"] | |||
| """ | |||
| # Indexes refer to positions between characters: in "hello", | |||
| # index 0 is left of "h", index 1 is between "h" and "e", | |||
| # and index 5 is right of the "o". | |||
| # States are going to capture every possible start (indexes as above) | |||
| # as keys, and have as values, a pointer to the position in the trie | |||
| # where we're at. This is a partial match for now. | |||
| # This enables us to keep track of multiple matches while we're iterating | |||
| # over the string. | |||
| # If the trie contains, "blowing", and "lower" and we encounter the | |||
| # string "blower", we need to split into ["b", "lower"]. | |||
| # This is where we need to keep track of multiple possible starts. | |||
| states = OrderedDict() | |||
| # This will contain every index where we need to cut. | |||
| # We force a cut at offset 0 and at len(text) (added later). | |||
| offsets = [0] | |||
| # This is used by the lookahead, which needs to skip over text | |||
| # when a full match extends past the position of the outer for loop. | |||
| skip = None | |||
| # Main loop, giving this algorithm O(n) complexity | |||
| for current, current_char in enumerate(text): | |||
| if skip and current < skip: | |||
| # Prevents the lookahead from matching twice, | |||
| # e.g. on extra_id_100 and id_100 | |||
| continue | |||
| # This will track every state that stops matching; | |||
| # we need to stop tracking those. | |||
| # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then | |||
| # fail on "b"; at that point we need to remove 0 from the valid states. | |||
| to_remove = set() | |||
| # Whenever we find a match, we need to drop everything: | |||
| # this is a greedy algorithm; it will match on the first token found. | |||
| reset = False | |||
| # In this case, we already have partial (but unfinished) matches | |||
| for start, trie_pointer in states.items(): | |||
| if "" in trie_pointer: | |||
| # This is a final match, we need to reset and | |||
| # store the results in `offsets`. | |||
| # Lookahead to match longest first | |||
| # Important in case of extra_id_1 vs extra_id_100 | |||
| lookahead_index = current | |||
| end = current | |||
| next_char = text[lookahead_index] if lookahead_index < len(text) else None | |||
| while next_char in trie_pointer: | |||
| trie_pointer = trie_pointer[next_char] | |||
| lookahead_index += 1 | |||
| if "" in trie_pointer: | |||
| end = lookahead_index | |||
| skip = lookahead_index | |||
| if lookahead_index == len(text): | |||
| # End of string | |||
| break | |||
| next_char = text[lookahead_index] | |||
| # End lookahead | |||
| # Storing and resetting | |||
| offsets.append(start) | |||
| offsets.append(end) | |||
| reset = True | |||
| elif current_char in trie_pointer: | |||
| # The current character being looked at has a match within the trie | |||
| # update the pointer (it will be stored back into states later). | |||
| trie_pointer = trie_pointer[current_char] | |||
| # Storing back the new pointer into the states. | |||
| # Partial matches got longer by one. | |||
| states[start] = trie_pointer | |||
| else: | |||
| # The new character has no match in the trie, so we need | |||
| # to stop keeping track of this partial match. | |||
| # We can't do it directly within the loop because of how | |||
| # Python iteration works. | |||
| to_remove.add(start) | |||
| # Either clear all partial matches (we found a real match) | |||
| # or clear only the partial matches that didn't work out. | |||
| if reset: | |||
| states = {} | |||
| else: | |||
| for start in to_remove: | |||
| del states[start] | |||
| # If this character is a starting character within the trie | |||
| # start keeping track of this partial match. | |||
| if current_char in self.data: | |||
| states[current] = self.data[current_char] | |||
| # Handle matches that are still open when we reach the end of the text. | |||
| for start, trie_pointer in states.items(): | |||
| if "" in trie_pointer: | |||
| # This is a final match, we need to reset and | |||
| # store the results in `offsets`. | |||
| end = len(text) | |||
| offsets.append(start) | |||
| offsets.append(end) | |||
| # The longest cut is always the one with the lowest start, i.e. the | |||
| # first item, so we can break here. | |||
| break | |||
| # We have all the offsets now; we just need to do the actual splitting. | |||
| # We may still need to add the leading part of the string and the | |||
| # trailing part. | |||
| offsets.append(len(text)) | |||
| tokens = [] | |||
| start = 0 | |||
| for end in offsets: | |||
| if start == end: | |||
| # This might happen if there's a match at index 0 | |||
| # we're also preventing zero-width cuts in case of two | |||
| # consecutive matches | |||
| continue | |||
| tokens.append(text[start:end]) | |||
| start = end | |||
| return tokens | |||
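| # A quick sketch (assuming the Trie class above) of the greedy, | |||
| # longest-match-first behaviour described in the comments: | |||
| trie = Trie() | |||
| trie.add("blowing") | |||
| trie.add("lower") | |||
| assert trie.split("blower") == ["b", "lower"] | |||
| trie.add("extra_id_1") | |||
| trie.add("extra_id_100") | |||
| assert trie.split("a extra_id_100") == ["a ", "extra_id_100"] | |||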
| def _is_whitespace(char): | |||
| """Checks whether `char` is a whitespace character.""" | |||
| # \t, \n, and \r are technically control characters but we treat them | |||
| # as whitespace since they are generally considered as such. | |||
| if char == " " or char == "\t" or char == "\n" or char == "\r": | |||
| return True | |||
| cat = unicodedata.category(char) | |||
| if cat == "Zs": | |||
| return True | |||
| return False | |||
| def _is_control(char): | |||
| """Checks whether `char` is a control character.""" | |||
| # These are technically control characters but we count them as whitespace | |||
| # characters. | |||
| if char == "\t" or char == "\n" or char == "\r": | |||
| return False | |||
| cat = unicodedata.category(char) | |||
| if cat.startswith("C"): | |||
| return True | |||
| return False | |||
| def _is_punctuation(char): | |||
| """Checks whether `char` is a punctuation character.""" | |||
| cp = ord(char) | |||
| # We treat all non-letter/number ASCII as punctuation. | |||
| # Characters such as "^", "$", and "`" are not in the Unicode | |||
| # Punctuation class but we treat them as punctuation anyways, for | |||
| # consistency. | |||
| if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): | |||
| return True | |||
| cat = unicodedata.category(char) | |||
| if cat.startswith("P"): | |||
| return True | |||
| return False | |||
| def _is_end_of_word(text): | |||
| """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" | |||
| last_char = text[-1] | |||
| return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) | |||
| def _is_start_of_word(text): | |||
| """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" | |||
| first_char = text[0] | |||
| return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) | |||
| def _insert_one_token_to_ordered_list(token_list: List[str], new_token: str): | |||
| """ | |||
| Inserts one token into an ordered list if it is not already present. Note: token_list must be sorted. | |||
| """ | |||
| insertion_idx = bisect.bisect_left(token_list, new_token) | |||
| # Checks if new_token is already in the ordered token_list | |||
| if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token: | |||
| # new_token is in token_list, don't add | |||
| return | |||
| else: | |||
| token_list.insert(insertion_idx, new_token) | |||
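| # Small sketch of the helper above: bisect keeps the list sorted and | |||
| # duplicate insertions are silently skipped. | |||
| no_split = ["[CLS]", "[SEP]"] | |||
| _insert_one_token_to_ordered_list(no_split, "[MASK]") | |||
| _insert_one_token_to_ordered_list(no_split, "[MASK]")  # idempotent | |||
| assert no_split == ["[CLS]", "[MASK]", "[SEP]"] | |||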
| @add_end_docstrings(INIT_TOKENIZER_DOCSTRING) | |||
| class PreTrainedTokenizer(PreTrainedTokenizerBase): | |||
| """ | |||
| Base class for all slow tokenizers. | |||
| Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`. | |||
| Handles all the shared methods for tokenization and special tokens, as well as methods for downloading/caching/loading | |||
| pretrained tokenizers and adding tokens to the vocabulary. | |||
| This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the | |||
| specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). | |||
| """ | |||
| def __init__(self, **kwargs): | |||
| super().__init__(**kwargs) | |||
| # Added tokens - We store this for both slow and fast tokenizers | |||
| # until the serialization of Fast tokenizers is updated | |||
| self.added_tokens_encoder: Dict[str, int] = {} | |||
| self.added_tokens_decoder: Dict[int, str] = {} | |||
| self.unique_no_split_tokens: List[str] = [] | |||
| self.tokens_trie = Trie() | |||
| self._decode_use_source_tokenizer = False | |||
| @property | |||
| def is_fast(self) -> bool: | |||
| return False | |||
| @property | |||
| def vocab_size(self) -> int: | |||
| """ | |||
| :obj:`int`: Size of the base vocabulary (without the added tokens). | |||
| """ | |||
| raise NotImplementedError | |||
| def get_added_vocab(self) -> Dict[str, int]: | |||
| """ | |||
| Returns the added tokens in the vocabulary as a dictionary of token to index. | |||
| Returns: | |||
| :obj:`Dict[str, int]`: The added tokens. | |||
| """ | |||
| return self.added_tokens_encoder | |||
| def __len__(self): | |||
| """ | |||
| Size of the full vocabulary with the added tokens. | |||
| """ | |||
| return self.vocab_size + len(self.added_tokens_encoder) | |||
| def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: | |||
| """ | |||
| Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to | |||
| it with indices starting from the length of the current vocabulary. | |||
| Args: | |||
| new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`): | |||
| Token(s) to add to the vocabulary. A token is only added if it's not already in the vocabulary (tested by | |||
| checking whether the tokenizer assigns the index of the ``unk_token`` to it). | |||
| special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the tokens should be added as special tokens. | |||
| Returns: | |||
| :obj:`int`: The number of tokens actually added to the vocabulary. | |||
| Examples:: | |||
| # Let's see how to increase the vocabulary of the Bert model and tokenizer | |||
| tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') | |||
| model = BertModel.from_pretrained('bert-base-uncased') | |||
| num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) | |||
| print('We have added', num_added_toks, 'tokens') | |||
| # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. | |||
| model.resize_token_embeddings(len(tokenizer)) | |||
| """ | |||
| new_tokens = [str(tok) for tok in new_tokens] | |||
| tokens_to_add = [] | |||
| for token in new_tokens: | |||
| if not isinstance(token, str): | |||
| raise TypeError(f"Token {token} is not a string but a {type(token)}.") | |||
| if not special_tokens and hasattr(self, "do_lower_case") and self.do_lower_case: | |||
| token = token.lower() | |||
| if ( | |||
| token != self.unk_token | |||
| and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) | |||
| and token not in tokens_to_add | |||
| ): | |||
| tokens_to_add.append(token) | |||
| if self.verbose: | |||
| logger.info(f"Adding {token} to the vocabulary") | |||
| added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add)) | |||
| added_tok_decoder = {v: k for k, v in added_tok_encoder.items()} | |||
| self.added_tokens_encoder.update(added_tok_encoder) | |||
| self.added_tokens_decoder.update(added_tok_decoder) | |||
| # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert) | |||
| if special_tokens: | |||
| if len(new_tokens) == 1: | |||
| _insert_one_token_to_ordered_list(self.unique_no_split_tokens, new_tokens[0]) | |||
| else: | |||
| self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(new_tokens))) | |||
| else: | |||
| # Or on the newly added tokens | |||
| if len(tokens_to_add) == 1: | |||
| _insert_one_token_to_ordered_list(self.unique_no_split_tokens, tokens_to_add[0]) | |||
| else: | |||
| self.unique_no_split_tokens = sorted(set(self.unique_no_split_tokens).union(set(tokens_to_add))) | |||
| self._create_trie(self.unique_no_split_tokens) | |||
| return len(tokens_to_add) | |||
| def _create_trie(self, unique_no_split_tokens): | |||
| trie = Trie() | |||
| for token in unique_no_split_tokens: | |||
| if hasattr(self, "do_lower_case") and self.do_lower_case and token not in self.all_special_tokens: | |||
| trie.add(token.lower()) | |||
| else: | |||
| trie.add(token) | |||
| self.tokens_trie = trie | |||
| def num_special_tokens_to_add(self, pair: bool = False) -> int: | |||
| """ | |||
| Returns the number of added tokens when encoding a sequence with special tokens. | |||
| .. note:: | |||
| This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not | |||
| put this inside your training loop. | |||
| Args: | |||
| pair (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether the number of added tokens should be computed in the case of a sequence pair or a single | |||
| sequence. | |||
| Returns: | |||
| :obj:`int`: Number of special tokens added to sequences. | |||
| """ | |||
| token_ids_0 = [] | |||
| token_ids_1 = [] | |||
| return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) | |||
| def tokenize(self, text: TextInput, **kwargs) -> List[str]: | |||
| """ | |||
| Converts a string into a sequence of tokens, using the tokenizer. | |||
| Splits into words for word-based vocabularies or sub-words for sub-word-based vocabularies | |||
| (BPE/SentencePiece/WordPiece). Takes care of added tokens. | |||
| Args: | |||
| text (:obj:`str`): | |||
| The sequence to be encoded. | |||
| **kwargs (additional keyword arguments): | |||
| Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method. | |||
| Returns: | |||
| :obj:`List[str]`: The list of tokens. | |||
| """ | |||
| # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors | |||
| all_special_tokens_extended = dict( | |||
| (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken) | |||
| ) | |||
| text, kwargs = self.prepare_for_tokenization(text, **kwargs) | |||
| if kwargs: | |||
| logger.warning(f"Keyword arguments {kwargs} not recognized.") | |||
| # TODO: should this be in the base class? | |||
| if hasattr(self, "do_lower_case") and self.do_lower_case: | |||
| # convert non-special tokens to lowercase | |||
| escaped_special_toks = [ | |||
| re.escape(s_tok) for s_tok in (self.unique_no_split_tokens + self.all_special_tokens) | |||
| ] | |||
| pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" | |||
| text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) | |||
| no_split_token = set(self.unique_no_split_tokens) | |||
| tokens = self.tokens_trie.split(text) | |||
| # ["This is something", "<special_token_1>", " else"] | |||
| for i, token in enumerate(tokens): | |||
| if token in no_split_token: | |||
| tok_extended = all_special_tokens_extended.get(token, None) | |||
| left = tokens[i - 1] if i > 0 else None | |||
| right = tokens[i + 1] if i < len(tokens) - 1 else None | |||
| if isinstance(tok_extended, AddedToken): | |||
| if tok_extended.rstrip and right: | |||
| # A bit counter-intuitive, but we strip the left of the next string | |||
| # since tok_extended.rstrip means the special token eats all whitespace on its right | |||
| tokens[i + 1] = right.lstrip() | |||
| # Strip white spaces on the left | |||
| if tok_extended.lstrip and left: | |||
| tokens[i - 1] = left.rstrip() # Opposite here | |||
| else: | |||
| # We strip left and right by default | |||
| if right: | |||
| tokens[i + 1] = right.lstrip() | |||
| if left: | |||
| tokens[i - 1] = left.rstrip() | |||
| # ["This is something", "<special_token_1>", "else"] | |||
| tokenized_text = [] | |||
| for token in tokens: | |||
| # Skip tokens that ended up empty (fully stripped) | |||
| if not token: | |||
| continue | |||
| if token in no_split_token: | |||
| tokenized_text.append(token) | |||
| else: | |||
| tokenized_text.extend(self._tokenize(token)) | |||
| # ["This", " is", " something", "<special_token_1>", "else"] | |||
| return tokenized_text | |||
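| # A hypothetical minimal subclass (not in this diff) showing how `tokenize` | |||
| # protects no-split tokens: `_tokenize` is a naive whitespace split, yet the | |||
| # added special token survives in one piece and the spaces around it are | |||
| # stripped, as in the inline comments above. | |||
| class _ToyTokenizer(PreTrainedTokenizer): | |||
|     def __init__(self, **kwargs): | |||
|         super().__init__(unk_token="[UNK]", **kwargs) | |||
|     @property | |||
|     def vocab_size(self): | |||
|         return 1 | |||
|     def _tokenize(self, text, **kwargs): | |||
|         return text.split() | |||
|     def _convert_token_to_id(self, token): | |||
|         return 0  # every base token maps to [UNK] in this sketch | |||
|     def _convert_id_to_token(self, index): | |||
|         return "[UNK]" | |||
| toy = _ToyTokenizer() | |||
| toy.add_tokens(["<special_token_1>"], special_tokens=True) | |||
| assert toy.tokenize("This is something <special_token_1> else") == \ | |||
|     ["This", "is", "something", "<special_token_1>", "else"] | |||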
| def _tokenize(self, text, **kwargs): | |||
| """ | |||
| Converts a string into a sequence of tokens (strings), using the tokenizer. Splits into words for word-based | |||
| vocabularies or sub-words for sub-word-based vocabularies (BPE/SentencePiece/WordPiece). | |||
| Does NOT take care of added tokens. | |||
| """ | |||
| raise NotImplementedError | |||
| def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]: | |||
| """ | |||
| Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the | |||
| vocabulary. | |||
| Args: | |||
| tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s). | |||
| Returns: | |||
| :obj:`int` or :obj:`List[int]`: The token id or list of token ids. | |||
| """ | |||
| if tokens is None: | |||
| return None | |||
| if isinstance(tokens, str): | |||
| return self._convert_token_to_id_with_added_voc(tokens) | |||
| ids = [] | |||
| for token in tokens: | |||
| ids.append(self._convert_token_to_id_with_added_voc(token)) | |||
| return ids | |||
| def _convert_token_to_id_with_added_voc(self, token): | |||
| if token is None: | |||
| return None | |||
| if token in self.added_tokens_encoder: | |||
| return self.added_tokens_encoder[token] | |||
| return self._convert_token_to_id(token) | |||
| def _convert_token_to_id(self, token): | |||
| raise NotImplementedError | |||
| def _encode_plus( | |||
| self, | |||
| text: Union[TextInput, PreTokenizedInput, EncodedInput], | |||
| text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, | |||
| add_special_tokens: bool = True, | |||
| padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |||
| truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, | |||
| max_length: Optional[int] = None, | |||
| stride: int = 0, | |||
| is_split_into_words: bool = False, | |||
| pad_to_multiple_of: Optional[int] = None, | |||
| return_tensors: Optional[Union[str, TensorType]] = None, | |||
| return_token_type_ids: Optional[bool] = None, | |||
| return_attention_mask: Optional[bool] = None, | |||
| return_overflowing_tokens: bool = False, | |||
| return_special_tokens_mask: bool = False, | |||
| return_offsets_mapping: bool = False, | |||
| return_length: bool = False, | |||
| verbose: bool = True, | |||
| **kwargs | |||
| ) -> BatchEncoding: | |||
| def get_input_ids(text): | |||
| if isinstance(text, str): | |||
| tokens = self.tokenize(text, **kwargs) | |||
| return self.convert_tokens_to_ids(tokens) | |||
| elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): | |||
| if is_split_into_words: | |||
| tokens = list( | |||
| itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) | |||
| ) | |||
| return self.convert_tokens_to_ids(tokens) | |||
| else: | |||
| return self.convert_tokens_to_ids(text) | |||
| elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): | |||
| return text | |||
| else: | |||
| if is_split_into_words: | |||
| raise ValueError( | |||
| f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`." | |||
| ) | |||
| else: | |||
| raise ValueError( | |||
| f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." | |||
| ) | |||
| if return_offsets_mapping: | |||
| raise NotImplementedError( | |||
| "return_offset_mapping is not available when using Python tokenizers." | |||
| "To use this feature, change your tokenizer to one deriving from " | |||
| "transformers.PreTrainedTokenizerFast." | |||
| "More information on available tokenizers at " | |||
| "https://github.com/huggingface/transformers/pull/2674" | |||
| ) | |||
| first_ids = get_input_ids(text) | |||
| second_ids = get_input_ids(text_pair) if text_pair is not None else None | |||
| return self.prepare_for_model( | |||
| first_ids, | |||
| pair_ids=second_ids, | |||
| add_special_tokens=add_special_tokens, | |||
| padding=padding_strategy.value, | |||
| truncation=truncation_strategy.value, | |||
| max_length=max_length, | |||
| stride=stride, | |||
| pad_to_multiple_of=pad_to_multiple_of, | |||
| return_tensors=return_tensors, | |||
| prepend_batch_axis=True, | |||
| return_attention_mask=return_attention_mask, | |||
| return_token_type_ids=return_token_type_ids, | |||
| return_overflowing_tokens=return_overflowing_tokens, | |||
| return_special_tokens_mask=return_special_tokens_mask, | |||
| return_length=return_length, | |||
| verbose=verbose, | |||
| ) | |||
| def _batch_encode_plus( | |||
| self, | |||
| batch_text_or_text_pairs: Union[ | |||
| List[TextInput], | |||
| List[TextInputPair], | |||
| List[PreTokenizedInput], | |||
| List[PreTokenizedInputPair], | |||
| List[EncodedInput], | |||
| List[EncodedInputPair], | |||
| ], | |||
| add_special_tokens: bool = True, | |||
| padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |||
| truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, | |||
| max_length: Optional[int] = None, | |||
| stride: int = 0, | |||
| is_split_into_words: bool = False, | |||
| pad_to_multiple_of: Optional[int] = None, | |||
| return_tensors: Optional[Union[str, TensorType]] = None, | |||
| return_token_type_ids: Optional[bool] = None, | |||
| return_attention_mask: Optional[bool] = None, | |||
| return_overflowing_tokens: bool = False, | |||
| return_special_tokens_mask: bool = False, | |||
| return_offsets_mapping: bool = False, | |||
| return_length: bool = False, | |||
| verbose: bool = True, | |||
| **kwargs | |||
| ) -> BatchEncoding: | |||
| def get_input_ids(text): | |||
| if isinstance(text, str): | |||
| tokens = self.tokenize(text, **kwargs) | |||
| return self.convert_tokens_to_ids(tokens) | |||
| elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): | |||
| if is_split_into_words: | |||
| tokens = list( | |||
| itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) | |||
| ) | |||
| return self.convert_tokens_to_ids(tokens) | |||
| else: | |||
| return self.convert_tokens_to_ids(text) | |||
| elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): | |||
| return text | |||
| else: | |||
| raise ValueError( | |||
| "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." | |||
| ) | |||
| if return_offsets_mapping: | |||
| raise NotImplementedError( | |||
| "return_offset_mapping is not available when using Python tokenizers." | |||
| "To use this feature, change your tokenizer to one deriving from " | |||
| "transformers.PreTrainedTokenizerFast." | |||
| ) | |||
| input_ids = [] | |||
| for ids_or_pair_ids in batch_text_or_text_pairs: | |||
| if not isinstance(ids_or_pair_ids, (list, tuple)): | |||
| ids, pair_ids = ids_or_pair_ids, None | |||
| elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): | |||
| ids, pair_ids = ids_or_pair_ids, None | |||
| else: | |||
| ids, pair_ids = ids_or_pair_ids | |||
| first_ids = get_input_ids(ids) | |||
| second_ids = get_input_ids(pair_ids) if pair_ids is not None else None | |||
| input_ids.append((first_ids, second_ids)) | |||
| batch_outputs = self._batch_prepare_for_model( | |||
| input_ids, | |||
| add_special_tokens=add_special_tokens, | |||
| padding_strategy=padding_strategy, | |||
| truncation_strategy=truncation_strategy, | |||
| max_length=max_length, | |||
| stride=stride, | |||
| pad_to_multiple_of=pad_to_multiple_of, | |||
| return_attention_mask=return_attention_mask, | |||
| return_token_type_ids=return_token_type_ids, | |||
| return_overflowing_tokens=return_overflowing_tokens, | |||
| return_special_tokens_mask=return_special_tokens_mask, | |||
| return_length=return_length, | |||
| return_tensors=return_tensors, | |||
| verbose=verbose, | |||
| ) | |||
| return BatchEncoding(batch_outputs) | |||
| @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) | |||
| def _batch_prepare_for_model( | |||
| self, | |||
| batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]], | |||
| add_special_tokens: bool = True, | |||
| padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, | |||
| truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, | |||
| max_length: Optional[int] = None, | |||
| stride: int = 0, | |||
| pad_to_multiple_of: Optional[int] = None, | |||
| return_tensors: Optional[str] = None, | |||
| return_token_type_ids: Optional[bool] = None, | |||
| return_attention_mask: Optional[bool] = None, | |||
| return_overflowing_tokens: bool = False, | |||
| return_special_tokens_mask: bool = False, | |||
| return_length: bool = False, | |||
| verbose: bool = True, | |||
| ) -> BatchEncoding: | |||
| """ | |||
| Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model. It | |||
| adds special tokens, truncates sequences if they overflow while taking the special tokens into account, and | |||
| manages a moving window (with a user-defined stride) for overflowing tokens. | |||
| Args: | |||
| batch_ids_pairs: list of tokenized input ids or input ids pairs | |||
| """ | |||
| batch_outputs = {} | |||
| for first_ids, second_ids in batch_ids_pairs: | |||
| outputs = self.prepare_for_model( | |||
| first_ids, | |||
| second_ids, | |||
| add_special_tokens=add_special_tokens, | |||
| padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward | |||
| truncation=truncation_strategy.value, | |||
| max_length=max_length, | |||
| stride=stride, | |||
| pad_to_multiple_of=None, # we pad in batch afterward | |||
| return_attention_mask=False, # we pad in batch afterward | |||
| return_token_type_ids=return_token_type_ids, | |||
| return_overflowing_tokens=return_overflowing_tokens, | |||
| return_special_tokens_mask=return_special_tokens_mask, | |||
| return_length=return_length, | |||
| return_tensors=None, # We convert the whole batch to tensors at the end | |||
| prepend_batch_axis=False, | |||
| verbose=verbose, | |||
| ) | |||
| for key, value in outputs.items(): | |||
| if key not in batch_outputs: | |||
| batch_outputs[key] = [] | |||
| batch_outputs[key].append(value) | |||
| batch_outputs = self.pad( | |||
| batch_outputs, | |||
| padding=padding_strategy.value, | |||
| max_length=max_length, | |||
| pad_to_multiple_of=pad_to_multiple_of, | |||
| return_attention_mask=return_attention_mask, | |||
| ) | |||
| batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) | |||
| return batch_outputs | |||
| def prepare_for_tokenization( | |||
| self, text: str, is_split_into_words: bool = False, **kwargs | |||
| ) -> Tuple[str, Dict[str, Any]]: | |||
| """ | |||
| Performs any necessary transformations before tokenization. | |||
| This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the | |||
| :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used. | |||
| Args: | |||
| text (:obj:`str`): | |||
| The text to prepare. | |||
| is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the | |||
| tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) | |||
| which it will tokenize. This is useful for NER or token classification. | |||
| kwargs: | |||
| Keyword arguments to use for the tokenization. | |||
| Returns: | |||
| :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. | |||
| """ | |||
| return (text, kwargs) | |||
| def get_special_tokens_mask( | |||
| self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False | |||
| ) -> List[int]: | |||
| """ | |||
| Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding | |||
| special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. | |||
| Args: | |||
| token_ids_0 (:obj:`List[int]`): | |||
| List of ids of the first sequence. | |||
| token_ids_1 (:obj:`List[int]`, `optional`): | |||
| List of ids of the second sequence. | |||
| already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not the token list is already formatted with special tokens for the model. | |||
| Returns: | |||
| A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. | |||
| """ | |||
| if already_has_special_tokens: | |||
| if token_ids_1 is not None: | |||
| raise ValueError( | |||
| "You should not supply a second sequence if the provided sequence of " | |||
| "ids is already formatted with special tokens for the model." | |||
| ) | |||
| return super().get_special_tokens_mask( | |||
| token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True | |||
| ) | |||
| return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) | |||
| @overload | |||
| def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: | |||
| ... | |||
| @overload | |||
| def convert_ids_to_tokens(self, ids: List[int], skip_special_tokens: bool = False) -> List[str]: | |||
| ... | |||
| def convert_ids_to_tokens( | |||
| self, ids: Union[int, List[int]], skip_special_tokens: bool = False | |||
| ) -> Union[str, List[str]]: | |||
| """ | |||
| Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary and | |||
| added tokens. | |||
| Args: | |||
| ids (:obj:`int` or :obj:`List[int]`): | |||
| The token id (or token ids) to convert to tokens. | |||
| skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): | |||
| Whether or not to remove special tokens in the decoding. | |||
| Returns: | |||
| :obj:`str` or :obj:`List[str]`: The decoded token(s). | |||
| """ | |||
| if isinstance(ids, int): | |||
| if ids in self.added_tokens_decoder: | |||
| return self.added_tokens_decoder[ids] | |||
| else: | |||
| return self._convert_id_to_token(ids) | |||
| tokens = [] | |||
| for index in ids: | |||
| index = int(index) | |||
| if skip_special_tokens and index in self.all_special_ids: | |||
| continue | |||
| if index in self.added_tokens_decoder: | |||
| tokens.append(self.added_tokens_decoder[index]) | |||
| else: | |||
| tokens.append(self._convert_id_to_token(index)) | |||
| return tokens | |||
| def _convert_id_to_token(self, index: int) -> str: | |||
| raise NotImplementedError | |||
| def convert_tokens_to_string(self, tokens: List[str]) -> str: | |||
| return " ".join(tokens) | |||
| def _decode( | |||
| self, | |||
| token_ids: List[int], | |||
| skip_special_tokens: bool = False, | |||
| clean_up_tokenization_spaces: bool = True, | |||
| spaces_between_special_tokens: bool = True, | |||
| **kwargs | |||
| ) -> str: | |||
| self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) | |||
| filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) | |||
| # To avoid mixing byte-level and unicode for byte-level BPE, | |||
| # we need to build the string separately for added tokens and byte-level tokens | |||
| # cf. https://github.com/huggingface/transformers/issues/1133 | |||
| sub_texts = [] | |||
| current_sub_text = [] | |||
| for token in filtered_tokens: | |||
| if skip_special_tokens and token in self.all_special_tokens:  # tokens here are strings, not ids | |||
| continue | |||
| if token in self.added_tokens_encoder: | |||
| if current_sub_text: | |||
| sub_texts.append(self.convert_tokens_to_string(current_sub_text)) | |||
| current_sub_text = [] | |||
| sub_texts.append(token) | |||
| else: | |||
| current_sub_text.append(token) | |||
| if current_sub_text: | |||
| sub_texts.append(self.convert_tokens_to_string(current_sub_text)) | |||
| if spaces_between_special_tokens: | |||
| text = " ".join(sub_texts) | |||
| else: | |||
| text = "".join(sub_texts) | |||
| if clean_up_tokenization_spaces: | |||
| clean_text = self.clean_up_tokenization(text) | |||
| return clean_text | |||
| else: | |||
| return text | |||
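| # Continuing the hypothetical _ToyTokenizer sketch from above: added | |||
| # tokens are kept out of convert_tokens_to_string and re-joined with | |||
| # single spaces, so the added token is preserved verbatim. | |||
| assert toy.decode([0, 1, 0]) == "[UNK] <special_token_1> [UNK]" | |||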
| @@ -0,0 +1,54 @@ | |||
| # coding=utf-8 | |||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| from math import ceil | |||
| def assert_device_map(device_map, num_blocks): | |||
| blocks = list(range(0, num_blocks)) | |||
| device_map_blocks = [item for sublist in list(device_map.values()) for item in sublist] | |||
| # Duplicate check | |||
| duplicate_blocks = [] | |||
| for i in device_map_blocks: | |||
| if device_map_blocks.count(i) > 1 and i not in duplicate_blocks: | |||
| duplicate_blocks.append(i) | |||
| # Missing blocks | |||
| missing_blocks = [i for i in blocks if i not in device_map_blocks] | |||
| extra_blocks = [i for i in device_map_blocks if i not in blocks] | |||
| assert len(duplicate_blocks) == 0, ( | |||
| "Duplicate attention blocks specified in device_map. Attention blocks must be specified to one device. These " | |||
| "attention blocks were specified more than once: " + str(duplicate_blocks) | |||
| ) | |||
| assert len(missing_blocks) == 0, ( | |||
| "There are attention blocks for this model that are not specified in the device_map. Add these attention " | |||
| "blocks to a device on the device_map: " + str(missing_blocks) | |||
| ) | |||
| assert ( | |||
| len(extra_blocks) == 0 | |||
| ), "The device_map contains more attention blocks than this model has. Remove these from the device_map:" + str( | |||
| extra_blocks | |||
| ) | |||
| def get_device_map(n_layers, devices): | |||
| """Returns a dictionary of layers distributed evenly across all devices.""" | |||
| layers = list(range(n_layers)) | |||
| n_blocks = int(ceil(n_layers / len(devices))) | |||
| layers_list = list(layers[i : i + n_blocks] for i in range(0, n_layers, n_blocks)) | |||
| return dict(zip(devices, layers_list)) | |||
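| # A quick sketch of the helpers above: 10 layers over 3 devices gives | |||
| # ceil(10 / 3) = 4 layers per block, with the remainder on the last device. | |||
| device_map = get_device_map(10, [0, 1, 2]) | |||
| assert device_map == {0: [0, 1, 2, 3], 1: [4, 5, 6, 7], 2: [8, 9]} | |||
| assert_device_map(device_map, 10)  # passes: no duplicate, missing, or extra blocks | |||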
| @@ -0,0 +1,120 @@ | |||
| # Copyright 2020 The HuggingFace Team. All rights reserved. | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| """ | |||
| Utilities for working with package versions | |||
| """ | |||
| import operator | |||
| import re | |||
| import sys | |||
| from typing import Optional | |||
| from packaging import version | |||
| # The package importlib_metadata is in a different place, depending on the python version. | |||
| if sys.version_info < (3, 8): | |||
| import importlib_metadata | |||
| else: | |||
| import importlib.metadata as importlib_metadata | |||
| ops = { | |||
| "<": operator.lt, | |||
| "<=": operator.le, | |||
| "==": operator.eq, | |||
| "!=": operator.ne, | |||
| ">=": operator.ge, | |||
| ">": operator.gt, | |||
| } | |||
| def _compare_versions(op, got_ver, want_ver, requirement, pkg, hint): | |||
| if got_ver is None: | |||
| raise ValueError("got_ver is None") | |||
| if want_ver is None: | |||
| raise ValueError("want_ver is None") | |||
| if not ops[op](version.parse(got_ver), version.parse(want_ver)): | |||
| raise ImportError( | |||
| f"{requirement} is required for a normal functioning of this module, but found {pkg}=={got_ver}.{hint}" | |||
| ) | |||
| def require_version(requirement: str, hint: Optional[str] = None) -> None: | |||
| """ | |||
| Perform a runtime check of the dependency versions, using the exact same syntax used by pip. | |||
| The installed module version comes from the `site-packages` dir via `importlib_metadata`. | |||
| Args: | |||
| requirement (:obj:`str`): pip style definition, e.g., "tokenizers==0.9.4", "tqdm>=4.27", "numpy" | |||
| hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met | |||
| Example:: | |||
| require_version("pandas>1.1.2") | |||
| require_version("numpy>1.18.5", "this is important to have for whatever reason") | |||
| """ | |||
| hint = f"\n{hint}" if hint is not None else "" | |||
| # non-versioned check | |||
| if re.match(r"^[\w_\-\d]+$", requirement): | |||
| pkg, op, want_ver = requirement, None, None | |||
| else: | |||
| match = re.findall(r"^([^!=<>\s]+)([\s!=<>]{1,2}.+)", requirement) | |||
| if not match: | |||
| raise ValueError( | |||
| f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" | |||
| ) | |||
| pkg, want_full = match[0] | |||
| want_range = want_full.split(",") # there could be multiple requirements | |||
| wanted = {} | |||
| for w in want_range: | |||
| match = re.findall(r"^([\s!=<>]{1,2})(.+)", w) | |||
| if not match: | |||
| raise ValueError( | |||
| f"requirement needs to be in the pip package format, .e.g., package_a==1.23, or package_b>=1.23, but got {requirement}" | |||
| ) | |||
| op, want_ver = match[0] | |||
| wanted[op] = want_ver | |||
| if op not in ops: | |||
| raise ValueError(f"{requirement}: need one of {list(ops.keys())}, but got {op}") | |||
| # special case | |||
| if pkg == "python": | |||
| got_ver = ".".join([str(x) for x in sys.version_info[:3]]) | |||
| for op, want_ver in wanted.items(): | |||
| _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) | |||
| return | |||
| # check if any version is installed | |||
| try: | |||
| got_ver = importlib_metadata.version(pkg) | |||
| except importlib_metadata.PackageNotFoundError: | |||
| raise importlib_metadata.PackageNotFoundError( | |||
| f"The '{requirement}' distribution was not found and is required by this application. {hint}" | |||
| ) | |||
| # check that the right version is installed if version number or a range was provided | |||
| if want_ver is not None: | |||
| for op, want_ver in wanted.items(): | |||
| _compare_versions(op, got_ver, want_ver, requirement, pkg, hint) | |||
| def require_version_core(requirement): | |||
| """require_version wrapper which emits a core-specific hint on failure""" | |||
| hint = "Try: pip install transformers -U or pip install -e '.[dev]' if you're working with git master" | |||
| return require_version(requirement, hint) | |||
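| # A hedged usage sketch for require_version (assumes numpy is installed): | |||
| # comma-separated pins are parsed and checked one by one against the | |||
| # version reported by importlib_metadata. | |||
| require_version("numpy>=1.0,<100.0") | |||
| require_version("python>=3.6")  # special-cased: compares sys.version_info | |||
| try: | |||
|     require_version("numpy<0.1", hint="demonstrates the failure path") | |||
| except ImportError as err: | |||
|     print(err) | |||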
| @@ -73,7 +73,7 @@ def model_and_optimizers(request): | |||
| @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)]) # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1) | |||
| @pytest.mark.parametrize("version", [0, 1]) | |||
| @pytest.mark.parametrize("only_state_dict", [True, False]) | |||
| @magic_argv_env_context | |||
| @magic_argv_env_context(timeout=100) | |||
| def test_model_checkpoint_callback_1( | |||
| model_and_optimizers: TrainerParameters, | |||
| driver, | |||
| @@ -193,7 +193,7 @@ def test_model_checkpoint_callback_1( | |||
| trainer.load_model(folder, only_state_dict=only_state_dict) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -203,7 +203,7 @@ def test_model_checkpoint_callback_1( | |||
| @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1)]) # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1) | |||
| @pytest.mark.parametrize("only_state_dict", [True]) | |||
| @magic_argv_env_context | |||
| @magic_argv_env_context(timeout=100) | |||
| def test_model_checkpoint_callback_2( | |||
| model_and_optimizers: TrainerParameters, | |||
| driver, | |||
| @@ -283,6 +283,7 @@ def test_model_checkpoint_callback_2( | |||
| trainer.load_model(folder, only_state_dict=only_state_dict) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -295,7 +296,7 @@ def test_model_checkpoint_callback_2( | |||
| @pytest.mark.parametrize("driver,device", [("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 0)]) # ("torch", "cpu"), ("torch_ddp", [0, 1]), ("torch", 1) | |||
| @pytest.mark.parametrize("version", [0, 1]) | |||
| @pytest.mark.parametrize("only_state_dict", [True, False]) | |||
| @magic_argv_env_context | |||
| @magic_argv_env_context(timeout=100) | |||
| def test_trainer_checkpoint_callback_1( | |||
| model_and_optimizers: TrainerParameters, | |||
| driver, | |||
| @@ -413,6 +414,7 @@ def test_trainer_checkpoint_callback_1( | |||
| trainer.load(folder, only_state_dict=only_state_dict) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -661,6 +663,7 @@ def test_trainer_checkpoint_callback_2( | |||
| trainer.load(folder, model_load_fn=model_load_fn) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -16,7 +16,6 @@ from fastNLP.core.controllers.trainer import Trainer | |||
| from fastNLP.core.metrics.accuracy import Accuracy | |||
| from fastNLP.core.callbacks.load_best_model_callback import LoadBestModelCallback | |||
| from fastNLP.core import Evaluator | |||
| from fastNLP.core.utils.utils import safe_rm | |||
| from fastNLP.core.drivers.torch_driver import TorchSingleDriver | |||
| from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
| from tests.helpers.datasets.torch_data import TorchArgMaxDataset | |||
| @@ -112,7 +111,8 @@ def test_load_best_model_callback( | |||
| results = evaluator.run() | |||
| assert np.allclose(callbacks[0].monitor_value, results['acc#acc#dl1']) | |||
| if save_folder: | |||
| safe_rm(save_folder) | |||
| import shutil | |||
| shutil.rmtree(save_folder, ignore_errors=True) | |||
| if dist.is_initialized(): | |||
| dist.destroy_process_group() | |||
| @@ -171,7 +171,7 @@ def test_model_more_evaluate_callback_1( | |||
| trainer.load_model(folder, only_state_dict=only_state_dict) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -255,6 +255,7 @@ def test_trainer_checkpoint_callback_1( | |||
| trainer.load(folder, only_state_dict=only_state_dict) | |||
| trainer.run() | |||
| trainer.driver.barrier() | |||
| finally: | |||
| rank_zero_rm(path) | |||
| @@ -10,7 +10,7 @@ class TestNumpyNumberPadder: | |||
| def test_run(self): | |||
| padder = NumpyNumberPadder(ele_dtype=int, dtype=int, pad_val=-1) | |||
| a = [1, 2, 3] | |||
| assert isinstance(a, np.ndarray) | |||
| assert isinstance(padder(a), np.ndarray) | |||
| assert (padder(a) == np.array(a)).sum() == 3 | |||
| @@ -158,7 +158,7 @@ class TestCollator: | |||
| # 测试 ignore | |||
| collator = Collator(backend='raw') | |||
| collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') | |||
| collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'a')) | |||
| raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} | |||
| findDictDiff(raw_pad_batch, collator(dict_batch)) | |||
| @@ -171,7 +171,7 @@ class TestCollator: | |||
| # 测试设置 pad 值 | |||
| collator = Collator(backend='raw') | |||
| collator.set_pad('nest_lst_int', pad_val=100) | |||
| collator.set_ignore('str', 'int', 'lst_int', 'nested_dict@@a') | |||
| collator.set_ignore('str', 'int', 'lst_int', ('nested_dict','a')) | |||
| raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 100], [100, 100]], [[1, 100], [1, 2]]], | |||
| 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, 'set': [{'1'}, {'2'}], 'nested_dict': {'b': [[1, 2], [1, 2]]}} | |||
| findDictDiff(raw_pad_batch, collator(dict_batch)) | |||
| @@ -217,6 +217,72 @@ class TestCollator: | |||
| collator.set_pad('_single') | |||
| findListDiff(list_batch, collator(list_batch)) | |||
| def test_nest_ignore(self): | |||
| dict_batch = [{ | |||
| 'str': '1', | |||
| 'lst_str': ['1'], | |||
| 'int': 1, | |||
| 'lst_int': [1], | |||
| 'nest_lst_int': [[1]], | |||
| 'float': 1.1, | |||
| 'lst_float': [1.1], | |||
| 'bool': True, | |||
| 'numpy': np.ones(1), | |||
| 'dict': {'1': '1'}, | |||
| 'set': {'1'}, | |||
| 'nested_dict': {'int': 1, 'lst_int':[1, 2], 'c': {'int': 1}} | |||
| }, | |||
| { | |||
| 'str': '2', | |||
| 'lst_str': ['2', '2'], | |||
| 'int': 2, | |||
| 'lst_int': [1, 2], | |||
| 'nest_lst_int': [[1], [1, 2]], | |||
| 'float': 2.1, | |||
| 'lst_float': [2.1], | |||
| 'bool': False, | |||
| 'numpy': np.zeros(1), | |||
| 'dict': {'1': '2'}, | |||
| 'set': {'2'}, | |||
| 'nested_dict': {'int': 1, 'lst_int': [1, 2], 'c': {'int': 1}} | |||
| } | |||
| ] | |||
| # 测试 ignore | |||
| collator = Collator(backend='raw') | |||
| collator.set_ignore('str', 'int', 'lst_int', ('nested_dict', 'int')) | |||
| raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], | |||
| 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], | |||
| 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, | |||
| 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], | |||
| 'c': {'int':[1, 1]}}} | |||
| findDictDiff(raw_pad_batch, collator(dict_batch)) | |||
| collator = Collator(backend='raw') | |||
| collator.set_pad(('nested_dict', 'c'), pad_val=None) | |||
| collator.set_ignore('str', 'int', 'lst_int') | |||
| raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], | |||
| 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], | |||
| 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, | |||
| 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], | |||
| 'c': [{'int':1}, {'int':1}]}} | |||
| pad_batch = collator(dict_batch) | |||
| findDictDiff(raw_pad_batch, pad_batch) | |||
| collator = Collator(backend='raw') | |||
| collator.set_pad(('nested_dict', 'c'), pad_val=1) | |||
| with pytest.raises(BaseException): | |||
| collator(dict_batch) | |||
| collator = Collator(backend='raw') | |||
| collator.set_ignore('str', 'int', 'lst_int') | |||
| collator.set_pad(('nested_dict', 'c'), pad_fn=lambda x: [d['int'] for d in x]) | |||
| pad_batch = collator(dict_batch) | |||
| raw_pad_batch = {'lst_str': [['1'], ['2', '2']], 'nest_lst_int': [[[1, 0], [0, 0]], [[1, 0], [1, 2]]], | |||
| 'float': [1.1, 2.1], 'lst_float': [[1.1], [2.1]], 'bool': [True, False], | |||
| 'numpy': [np.array([1.]), np.array([0.])], 'dict': {'1': ['1', '2']}, | |||
| 'set': [{'1'}, {'2'}], 'nested_dict': {'lst_int': [[1, 2], [1, 2]], | |||
| 'c': [1, 1]}} | |||
| findDictDiff(raw_pad_batch, pad_batch) | |||
| @@ -4,25 +4,25 @@ from fastNLP.core.collators.utils import * | |||
| def test_unpack_batch_mapping(): | |||
| batch = [{'a': [1, 2], 'b': 1}, {'a': [3], 'b': 2}] | |||
| assert unpack_batch_mapping(batch)=={'a': [[1, 2], [3]], 'b': [1, 2]} | |||
| assert unpack_batch_mapping(batch, {})=={'a': [[1, 2], [3]], 'b': [1, 2]} | |||
| def test_unpack_batch_nested_mapping(): | |||
| batch = [{'a': [1, 2], 'b': 1, 'c': {'c': 1}}, {'a': [3], 'b': 2, 'c': {'c': 2}}] | |||
| assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c': [1, 2]} | |||
| assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c','c'): [1, 2]} | |||
| batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1}}}, {'a': [3], 'b': 2, 'c': {'c': {'c': 2}}}] | |||
| assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2]} | |||
| assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2]} | |||
| batch = [{'a': [1, 2], 'b': 1, 'c': {'c': {'c': 1, 'd':[1, 1]}, 'd': [1]}}, | |||
| {'a': [3], 'b': 2, 'c': {'c': {'c': 2, 'd': [2, 2]}, 'd': [2, 2]}}] | |||
| assert unpack_batch_nested_mapping(batch) == {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], | |||
| 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} | |||
| assert unpack_batch_nested_mapping(batch, {}, {}) == {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], | |||
| ('c','c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} | |||
| def test_pack_batch_nested_mapping(): | |||
| batch = {'a': [[1, 2], [3]], 'b': [1, 2], 'c@@c@@c': [1, 2], | |||
| 'c@@c@@d':[[1, 1], [2, 2]], 'c@@d': [[1], [2, 2]]} | |||
| batch = {'a': [[1, 2], [3]], 'b': [1, 2], ('c', 'c', 'c'): [1, 2], | |||
| ('c', 'c', 'd'):[[1, 1], [2, 2]], ('c', 'd'): [[1], [2, 2]]} | |||
| new_batch = pack_batch_nested_mapping(batch) | |||
| assert new_batch == {'a': [[1, 2], [3]], 'b': [1, 2], | |||
| 'c': {'c':{'c': [1, 2], 'd': [[1, 1], [2, 2]]}, 'd':[[1], [2, 2]]}} | |||
| @@ -30,7 +30,7 @@ def test_pack_batch_nested_mapping(): | |||
| def test_unpack_batch_sequence(): | |||
| batch = [[1, 2, 3], [2, 4, 6]] | |||
| new_batch = unpack_batch_sequence(batch) | |||
| new_batch = unpack_batch_sequence(batch, {}) | |||
| assert new_batch == {'_0': [1, 2], '_1': [2, 4], '_2': [3, 6]} | |||
| @@ -4,7 +4,6 @@ | |||
| python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet.py | |||
| """ | |||
| import os | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| import sys | |||
| sys.path.append("../../../") | |||
| @@ -4,7 +4,6 @@ | |||
| python -m paddle.distributed.launch --gpus=0,2,3 test_trainer_fleet_outside.py | |||
| """ | |||
| import os | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| import sys | |||
| sys.path.append("../../../") | |||
| @@ -1,6 +1,4 @@ | |||
| import pytest | |||
| import os | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| from dataclasses import dataclass | |||
| from fastNLP.core.controllers.trainer import Trainer | |||
| @@ -25,7 +23,7 @@ class TrainPaddleConfig: | |||
| shuffle: bool = True | |||
| evaluate_every = 2 | |||
| @pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1)]) | |||
| @pytest.mark.parametrize("driver,device", [("paddle", "cpu"), ("paddle", 1), ("fleet", [0, 1])]) | |||
| # @pytest.mark.parametrize("driver,device", [("fleet", [0, 1])]) | |||
| @pytest.mark.parametrize("callbacks", [[RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), | |||
| RichCallback(5)]]) | |||
| @@ -3,7 +3,6 @@ import sys | |||
| import signal | |||
| import pytest | |||
| import traceback | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| import numpy as np | |||
| @@ -1,8 +1,6 @@ | |||
| import pytest | |||
| import os | |||
| from pathlib import Path | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| from fastNLP.core.drivers.paddle_driver.fleet import PaddleFleetDriver | |||
| from fastNLP.core.samplers import ( | |||
| RandomSampler, | |||
| @@ -1,8 +1,5 @@ | |||
| import os | |||
| import pytest | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| from fastNLP.core.drivers import PaddleSingleDriver, PaddleFleetDriver | |||
| from fastNLP.core.drivers.paddle_driver.initialize_paddle_driver import initialize_paddle_driver | |||
| from fastNLP.envs import get_gpu_count | |||
| @@ -1,6 +1,3 @@ | |||
| import os | |||
| from re import S | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| import pytest | |||
| from pathlib import Path | |||
| @@ -1,6 +1,4 @@ | |||
| import os | |||
| import pytest | |||
| os.environ["FASTNLP_BACKEND"] = "paddle" | |||
| from fastNLP.core.drivers.paddle_driver.utils import ( | |||
| get_device_from_visible, | |||
| @@ -0,0 +1,31 @@ | |||
| import sys | |||
| sys.path.append("../../../../") | |||
| from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver | |||
| from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
| import torch | |||
| device = [0, 1] | |||
| torch_model = TorchNormalModel_Classification_1(10, 10) | |||
| torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) | |||
| device = [torch.device(i) for i in device] | |||
| driver = TorchDDPDriver( | |||
| model=torch_model, | |||
| parallel_device=device, | |||
| fp16=False | |||
| ) | |||
| driver.set_optimizers(torch_opt) | |||
| driver.setup() | |||
| print("-----------first--------------") | |||
| device = [0, 2] | |||
| torch_model = TorchNormalModel_Classification_1(10, 10) | |||
| torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) | |||
| device = [torch.device(i) for i in device] | |||
| driver = TorchDDPDriver( | |||
| model=torch_model, | |||
| parallel_device=device, | |||
| fp16=False | |||
| ) | |||
| driver.set_optimizers(torch_opt) | |||
| driver.setup() | |||
| @@ -1,8 +1,6 @@ | |||
| import pytest | |||
| import os | |||
| from pathlib import Path | |||
| os.environ["FASTNLP_BACKEND"] = "torch" | |||
| from fastNLP.core.drivers.torch_driver.ddp import TorchDDPDriver | |||
| from fastNLP.core.samplers import ( | |||
| RandomSampler, | |||
| @@ -1,8 +1,5 @@ | |||
| import os | |||
| import pytest | |||
| os.environ["FASTNLP_BACKEND"] = "torch" | |||
| from fastNLP.core.drivers import TorchSingleDriver, TorchDDPDriver | |||
| from fastNLP.core.drivers.torch_driver.initialize_torch_driver import initialize_torch_driver | |||
| from fastNLP.envs import get_gpu_count | |||
| @@ -1,5 +1,3 @@ | |||
| import os | |||
| os.environ["FASTNLP_BACKEND"] = "torch" | |||
| import pytest | |||
| from pathlib import Path | |||
| @@ -1,6 +1,4 @@ | |||
| import os | |||
| import pytest | |||
| os.environ["FASTNLP_BACKEND"] = "torch" | |||
| from fastNLP.core.drivers.torch_driver.utils import ( | |||
| replace_batch_sampler, | |||
| @@ -9,153 +9,153 @@ from fastNLP.core.samplers import RandomBatchSampler, BucketedBatchSampler | |||
| from fastNLP.core.drivers.torch_driver.utils import replace_batch_sampler | |||
| from tests.helpers.datasets.torch_data import TorchNormalDataset | |||
| class TestReproducibleBatchSampler: | |||
| # TODO: split this test; it should only check one thing | |||
| def test_torch_dataloader_1(self): | |||
| import torch | |||
| from torch.utils.data import DataLoader | |||
| # no shuffle | |||
| before_batch_size = 7 | |||
| dataset = TorchNormalDataset(num_of_data=100) | |||
| dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| forward_steps = 3 | |||
| iter_dataloader = iter(dataloader) | |||
| for _ in range(forward_steps): | |||
| next(iter_dataloader) | |||
| # 1. save the sampler state | |||
| _get_re_batchsampler = dataloader.batch_sampler | |||
| assert isinstance(_get_re_batchsampler, RandomBatchSampler) | |||
| state = _get_re_batchsampler.state_dict() | |||
| assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, | |||
| "sampler_type": "RandomBatchSampler"} | |||
| # 2. resume from the breakpoint by building a new dataloader; | |||
| # keep batch_size unchanged; | |||
| dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| re_batchsampler.load_state_dict(state) | |||
| dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| real_res = [] | |||
| supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) | |||
| forward_steps = 2 | |||
| iter_dataloader = iter(dataloader) | |||
| for _ in range(forward_steps): | |||
| real_res.append(next(iter_dataloader)) | |||
| for i in range(forward_steps): | |||
| assert all(real_res[i] == supposed_res[i]) | |||
| # now change the batch_size; | |||
| after_batch_size = 3 | |||
| dataloader = DataLoader(dataset, batch_size=after_batch_size) | |||
| re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| re_batchsampler.load_state_dict(state) | |||
| dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| real_res = [] | |||
| supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) | |||
| forward_steps = 2 | |||
| iter_dataloader = iter(dataloader) | |||
| for _ in range(forward_steps): | |||
| real_res.append(next(iter_dataloader)) | |||
| for i in range(forward_steps): | |||
| assert all(real_res[i] == supposed_res[i]) | |||
| # check that the second epoch after resuming is a full pass over the dataloader; | |||
| # first finish the epoch in which training was resumed; | |||
| begin_idx = 27 | |||
| while True: | |||
| try: | |||
| data = next(iter_dataloader) | |||
| _batch_size = len(data) | |||
| assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) | |||
| begin_idx += _batch_size | |||
| except StopIteration: | |||
| break | |||
| # start a new epoch; | |||
| begin_idx = 0 | |||
| iter_dataloader = iter(dataloader) | |||
| while True: | |||
| try: | |||
| data = next(iter_dataloader) | |||
| _batch_size = len(data) | |||
| assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) | |||
| begin_idx += _batch_size | |||
| except StopIteration: | |||
| break | |||
| def test_torch_dataloader_2(self): | |||
| # check that the index list of a new epoch is regenerated rather than reused from the previous one; | |||
| from torch.utils.data import DataLoader | |||
| # no shuffle | |||
| before_batch_size = 7 | |||
| dataset = TorchNormalDataset(num_of_data=100) | |||
| # enable shuffle, to verify that the index list of the second epoch after resuming is regenerated; | |||
| dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) | |||
| re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # record all data of one epoch, to verify that recovery restores it correctly; | |||
| all_supposed_data = [] | |||
| forward_steps = 3 | |||
| iter_dataloader = iter(dataloader) | |||
| for _ in range(forward_steps): | |||
| all_supposed_data.extend(next(iter_dataloader).tolist()) | |||
| # 1. save the sampler state | |||
| _get_re_batchsampler = dataloader.batch_sampler | |||
| assert isinstance(_get_re_batchsampler, RandomBatchSampler) | |||
| state = _get_re_batchsampler.state_dict() | |||
| # 2. resume from the breakpoint by building a new dataloader; | |||
| # keep batch_size unchanged; | |||
| dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) | |||
| re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| re_batchsampler.load_state_dict(state) | |||
| dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # first run through the rest of this epoch; | |||
| pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] | |||
| while True: | |||
| try: | |||
| all_supposed_data.extend(next(iter_dataloader).tolist()) | |||
| except StopIteration: | |||
| break | |||
| assert all_supposed_data == list(pre_index_list) | |||
| # start fresh epochs again; | |||
| for _ in range(3): | |||
| iter_dataloader = iter(dataloader) | |||
| res = [] | |||
| while True: | |||
| try: | |||
| res.append(next(iter_dataloader)) | |||
| except StopIteration: | |||
| break | |||
| def test_3(self): | |||
| import torch | |||
| from torch.utils.data import DataLoader | |||
| before_batch_size = 7 | |||
| dataset = TorchNormalDataset(num_of_data=100) | |||
| # break out of iteration early, then check that a fresh iterator runs to completion; | |||
| dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| for idx, data in enumerate(dataloader): | |||
| if idx > 3: | |||
| break | |||
| iterator = iter(dataloader) | |||
| for each in iterator: | |||
| pass | |||
| # | |||
| # class TestReproducibleBatchSampler: | |||
| # # TODO: split this test; it should only check one thing | |||
| # def test_torch_dataloader_1(self): | |||
| # import torch | |||
| # from torch.utils.data import DataLoader | |||
| # # no shuffle | |||
| # before_batch_size = 7 | |||
| # dataset = TorchNormalDataset(num_of_data=100) | |||
| # dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| # re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| # dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # | |||
| # forward_steps = 3 | |||
| # iter_dataloader = iter(dataloader) | |||
| # for _ in range(forward_steps): | |||
| # next(iter_dataloader) | |||
| # | |||
| # # 1. save the sampler state | |||
| # _get_re_batchsampler = dataloader.batch_sampler | |||
| # assert isinstance(_get_re_batchsampler, RandomBatchSampler) | |||
| # state = _get_re_batchsampler.state_dict() | |||
| # assert state == {"index_list": array("I", list(range(100))), "num_consumed_samples": forward_steps*before_batch_size, | |||
| # "sampler_type": "RandomBatchSampler"} | |||
| # | |||
| # # 2. resume from the breakpoint by building a new dataloader; | |||
| # # keep batch_size unchanged; | |||
| # dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| # re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| # re_batchsampler.load_state_dict(state) | |||
| # dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # | |||
| # real_res = [] | |||
| # supposed_res = (torch.tensor(list(range(21, 28))), torch.tensor(list(range(28, 35)))) | |||
| # forward_steps = 2 | |||
| # iter_dataloader = iter(dataloader) | |||
| # for _ in range(forward_steps): | |||
| # real_res.append(next(iter_dataloader)) | |||
| # | |||
| # for i in range(forward_steps): | |||
| # assert all(real_res[i] == supposed_res[i]) | |||
| # | |||
| # # now change the batch_size; | |||
| # after_batch_size = 3 | |||
| # dataloader = DataLoader(dataset, batch_size=after_batch_size) | |||
| # re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| # re_batchsampler.load_state_dict(state) | |||
| # dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # | |||
| # real_res = [] | |||
| # supposed_res = (torch.tensor(list(range(21, 24))), torch.tensor(list(range(24, 27)))) | |||
| # forward_steps = 2 | |||
| # iter_dataloader = iter(dataloader) | |||
| # for _ in range(forward_steps): | |||
| # real_res.append(next(iter_dataloader)) | |||
| # | |||
| # for i in range(forward_steps): | |||
| # assert all(real_res[i] == supposed_res[i]) | |||
| # | |||
| # # check that the second epoch after resuming is a full pass over the dataloader; | |||
| # # first finish the epoch in which training was resumed; | |||
| # begin_idx = 27 | |||
| # while True: | |||
| # try: | |||
| # data = next(iter_dataloader) | |||
| # _batch_size = len(data) | |||
| # assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) | |||
| # begin_idx += _batch_size | |||
| # except StopIteration: | |||
| # break | |||
| # | |||
| # # start a new epoch; | |||
| # begin_idx = 0 | |||
| # iter_dataloader = iter(dataloader) | |||
| # while True: | |||
| # try: | |||
| # data = next(iter_dataloader) | |||
| # _batch_size = len(data) | |||
| # assert all(data == torch.tensor(list(range(begin_idx, begin_idx + _batch_size)))) | |||
| # begin_idx += _batch_size | |||
| # except StopIteration: | |||
| # break | |||
| # | |||
| # def test_torch_dataloader_2(self): | |||
| # # check that the index list of a new epoch is regenerated rather than reused from the previous one; | |||
| # from torch.utils.data import DataLoader | |||
| # # no shuffle | |||
| # before_batch_size = 7 | |||
| # dataset = TorchNormalDataset(num_of_data=100) | |||
| # # enable shuffle, to verify that the index list of the second epoch after resuming is regenerated; | |||
| # dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) | |||
| # re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| # dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # | |||
| # # record all data of one epoch, to verify that recovery restores it correctly; | |||
| # all_supposed_data = [] | |||
| # forward_steps = 3 | |||
| # iter_dataloader = iter(dataloader) | |||
| # for _ in range(forward_steps): | |||
| # all_supposed_data.extend(next(iter_dataloader).tolist()) | |||
| # | |||
| # # 1. save the sampler state | |||
| # _get_re_batchsampler = dataloader.batch_sampler | |||
| # assert isinstance(_get_re_batchsampler, RandomBatchSampler) | |||
| # state = _get_re_batchsampler.state_dict() | |||
| # | |||
| # # 2. resume from the breakpoint by building a new dataloader; | |||
| # # keep batch_size unchanged; | |||
| # dataloader = DataLoader(dataset, batch_size=before_batch_size, shuffle=True) | |||
| # re_batchsampler = RandomBatchSampler(dataloader.batch_sampler, dataloader.batch_size, drop_last=False) | |||
| # re_batchsampler.load_state_dict(state) | |||
| # dataloader = replace_batch_sampler(dataloader, re_batchsampler) | |||
| # | |||
| # # first run through the rest of this epoch; | |||
| # pre_index_list = dataloader.batch_sampler.state_dict()["index_list"] | |||
| # while True: | |||
| # try: | |||
| # all_supposed_data.extend(next(iter_dataloader).tolist()) | |||
| # except StopIteration: | |||
| # break | |||
| # assert all_supposed_data == list(pre_index_list) | |||
| # | |||
| # # start fresh epochs again; | |||
| # for _ in range(3): | |||
| # iter_dataloader = iter(dataloader) | |||
| # res = [] | |||
| # while True: | |||
| # try: | |||
| # res.append(next(iter_dataloader)) | |||
| # except StopIteration: | |||
| # break | |||
| # | |||
| # def test_3(self): | |||
| # import torch | |||
| # from torch.utils.data import DataLoader | |||
| # before_batch_size = 7 | |||
| # dataset = TorchNormalDataset(num_of_data=100) | |||
| # # break out of iteration early, then check that a fresh iterator runs to completion; | |||
| # dataloader = DataLoader(dataset, batch_size=before_batch_size) | |||
| # | |||
| # for idx, data in enumerate(dataloader): | |||
| # if idx > 3: | |||
| # break | |||
| # | |||
| # iterator = iter(dataloader) | |||
| # for each in iterator: | |||
| # pass | |||
| class DatasetWithVaryLength: | |||
| @@ -28,12 +28,12 @@ class TestUnrepeatedSampler: | |||
| @pytest.mark.parametrize('num_replicas', [2, 3]) | |||
| @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) | |||
| @pytest.mark.parametrize('shuffle', [False, True]) | |||
| def test_multi(self, num_replica, num_of_data, shuffle): | |||
| def test_multi(self, num_replicas, num_of_data, shuffle): | |||
| data = DatasetWithVaryLength(num_of_data=num_of_data) | |||
| samplers = [] | |||
| for i in range(num_replica): | |||
| for i in range(num_replicas): | |||
| sampler = UnrepeatedRandomSampler(dataset=data, shuffle=shuffle) | |||
| sampler.set_distributed(num_replica, rank=i) | |||
| sampler.set_distributed(num_replicas, rank=i) | |||
| samplers.append(sampler) | |||
| indexes = list(chain(*samplers)) | |||
| @@ -52,12 +52,12 @@ class TestUnrepeatedSortedSampler: | |||
| @pytest.mark.parametrize('num_replicas', [2, 3]) | |||
| @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) | |||
| def test_multi(self, num_replica, num_of_data): | |||
| def test_multi(self, num_replicas, num_of_data): | |||
| data = DatasetWithVaryLength(num_of_data=num_of_data) | |||
| samplers = [] | |||
| for i in range(num_replica): | |||
| for i in range(num_replicas): | |||
| sampler = UnrepeatedSortedSampler(dataset=data, length=data.data) | |||
| sampler.set_distributed(num_replica, rank=i) | |||
| sampler.set_distributed(num_replicas, rank=i) | |||
| samplers.append(sampler) | |||
| # make sure the order is not scrambled | |||
| @@ -83,12 +83,12 @@ class TestUnrepeatedSequentialSampler: | |||
| @pytest.mark.parametrize('num_replicas', [2, 3]) | |||
| @pytest.mark.parametrize('num_of_data', [2, 3, 4, 100]) | |||
| def test_multi(self, num_replica, num_of_data): | |||
| def test_multi(self, num_replicas, num_of_data): | |||
| data = DatasetWithVaryLength(num_of_data=num_of_data) | |||
| samplers = [] | |||
| for i in range(num_replica): | |||
| for i in range(num_replicas): | |||
| sampler = UnrepeatedSequentialSampler(dataset=data, length=data.data) | |||
| sampler.set_distributed(num_replica, rank=i) | |||
| sampler.set_distributed(num_replicas, rank=i) | |||
| samplers.append(sampler) | |||
| # make sure the order is not scrambled | |||
| @@ -33,6 +33,8 @@ def recover_logger(fn): | |||
| def magic_argv_env_context(fn=None, timeout=600): | |||
| """ | |||
| Wraps each individual test function so that ddp tests run correctly; | |||
| it drops the extra pytest args from sys.argv. | |||
| :param timeout: if a test has not passed after this long it is actively killed; defaults to 10 minutes, in seconds; | |||
| :return: | |||
| """ | |||
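| For reference, a minimal usage sketch (an illustration only; per the `fn=None, timeout=600` signature above, the decorator is assumed to work both bare and with an explicit timeout, and the test names are made up): | |||
| ```python | |||
| @magic_argv_env_context | |||
| def test_ddp_default():  # killed if still running after 600 s | |||
|     ... | |||
| @magic_argv_env_context(timeout=300) | |||
| def test_ddp_short():  # killed if still running after 300 s | |||
|     ... | |||
| ``` | |||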
| @@ -46,9 +48,10 @@ def magic_argv_env_context(fn=None, timeout=600): | |||
| env = deepcopy(os.environ.copy()) | |||
| used_args = [] | |||
| for each_arg in sys.argv[1:]: | |||
| if "test" not in each_arg: | |||
| used_args.append(each_arg) | |||
| # for each_arg in sys.argv[1:]: | |||
| # # warning: otherwise the dots from `pytest -s .` may get mixed in, so the collected test items for a multi-card launch would not equal 1 | |||
| # if each_arg.startswith('-'): | |||
| # used_args.append(each_arg) | |||
| pytest_current_test = os.environ.get('PYTEST_CURRENT_TEST') | |||
| @@ -15,15 +15,15 @@ | |||
| "\n", | |||
| "    1.3   evaluator initialized inside the trainer\n", | |||
| "\n", | |||
| "  2   training a model with the trainer\n", | |||
| "  2   building an argmax model with fastNLP 0.8\n", | |||
| "\n", | |||
| "    2.1   an argmax model example\n", | |||
| "    2.1   train_step and evaluate_step\n", | |||
| "\n", | |||
| "    2.2   parameter matching in the trainer\n", | |||
| "    2.2   parameter matching in the trainer and evaluator\n", | |||
| "\n", | |||
| "    2.3   using the trainer in practice\n", | |||
| "    2.3   a worked example: the argmax model\n", | |||
| "\n", | |||
| "  3   evaluating a model with the evaluator\n", | |||
| "  3   training the argmax model with fastNLP 0.8\n", | |||
| " \n", | |||
| "    3.1   evaluator initialized outside the trainer\n", | |||
| "\n", | |||
| @@ -50,21 +50,21 @@ | |||
| "\n", | |||
| "```python\n", | |||
| "trainer = Trainer(\n", | |||
| " model=model,\n", | |||
| " train_dataloader=train_dataloader,\n", | |||
| " optimizers=optimizer,\n", | |||
| "    model=model,  # the model is built on torch.nn.Module\n", | |||
| "    train_dataloader=train_dataloader,  # data loading is built on torch.utils.data.DataLoader \n", | |||
| "    optimizers=optimizer,  # optimization is built on torch.optim.*\n", | |||
| "\t...\n", | |||
| "\tdriver=\"torch\",\n", | |||
| "\tdevice=0,\n", | |||
| "\tdriver=\"torch\", # train with the pytorch backend \n", | |||
| "\tdevice='cuda', # run training on GPU 0\n", | |||
| "\t...\n", | |||
| ")\n", | |||
| "...\n", | |||
| "evaluator = Evaluator(\n", | |||
| " model=model,\n", | |||
| " dataloaders=evaluate_dataloader,\n", | |||
| " metrics={'acc': Accuracy()} \n", | |||
| "    model=model, # the model is built on torch.nn.Module\n", | |||
| "    dataloaders=evaluate_dataloader, # data loading is built on torch.utils.data.DataLoader\n", | |||
| "    metrics={'acc': Accuracy()}, # evaluation uses fastNLP.core.metrics.Accuracy \n", | |||
| " ...\n", | |||
| " driver=trainer.driver,\n", | |||
| "    driver=trainer.driver, # keep the same driver as the trainer\n", | |||
| "\tdevice=None,\n", | |||
| " ...\n", | |||
| ")\n", | |||
| @@ -88,7 +88,7 @@ | |||
| "\n", | |||
| "Note: within one script, the `Trainer` and the `Evaluator` should use the same `driver`\n", | |||
| "\n", | |||
| "  One principle that must not be violated: **never use a single-card `driver` after a multi-card `driver`**(???); doing so can lead to many unexpected errors." | |||
| "  One principle that must not be violated: **never use a single-card `driver` after a multi-card `driver`**(???); doing so can lead to many unexpected errors" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -109,10 +109,10 @@ | |||
| " optimizers=optimizer,\n", | |||
| "\t...\n", | |||
| "\tdriver=\"torch\",\n", | |||
| "\tdevice=0,\n", | |||
| "\tdevice='cuda',\n", | |||
| "\t...\n", | |||
| " evaluate_dataloaders=evaluate_dataloader,\n", | |||
| " metrics={'acc': Accuracy()},\n", | |||
| "    evaluate_dataloaders=evaluate_dataloader,  # pass the evaluate_dataloaders argument\n", | |||
| "    metrics={'acc': Accuracy()},  # pass the metrics argument\n", | |||
| "\t...\n", | |||
| ")\n", | |||
| "```" | |||
| @@ -123,7 +123,7 @@ | |||
| "id": "0c9c7dda", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "## 2. Training a model with the trainer" | |||
| "## 2. Building the argmax model" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -131,71 +131,41 @@ | |||
| "id": "524ac200", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "### 2.1 An argmax model example\n", | |||
| "### 2.1 train_step and evaluate_step\n", | |||
| "\n", | |||
| "This section briefly introduces how to use the `Trainer` module by training an `argmax` model\n", | |||
| "In `fastNLP 0.8`, the model to be trained is built on `torch.nn.Module`; when building it, besides\n", | |||
| "\n", | |||
| "  define the `argmax` model with `pytorch`: given a vector of fixed dimension as input, it outputs the index of its largest value\n", | |||
| "\n", | |||
| "  besides the `forward` method required by `pytorch`, you also need to add the **`train_step`** and **`evaluate_step`** methods" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "5314482b", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [], | |||
| "source": [ | |||
| "import torch\n", | |||
| "import torch.nn as nn\n", | |||
| "\n", | |||
| "class ArgMaxModel(nn.Module):\n", | |||
| " def __init__(self, num_labels, feature_dimension):\n", | |||
| " super(ArgMaxModel, self).__init__()\n", | |||
| " self.num_labels = num_labels\n", | |||
| "\n", | |||
| " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", | |||
| " self.ac1 = nn.ReLU()\n", | |||
| " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", | |||
| " self.ac2 = nn.ReLU()\n", | |||
| " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", | |||
| " self.loss_fn = nn.CrossEntropyLoss()\n", | |||
| "  adding the `forward` method required by `pytorch`, you also need to add the **`train_step`** and **`evaluate_step`** methods\n", | |||
| "***\n", | |||
| "```python\n", | |||
| "class Model(torch.nn.Module):\n", | |||
| " def __init__(self):\n", | |||
| " super(Model, self).__init__()\n", | |||
| " self.loss_fn = torch.nn.CrossEntropyLoss()\n", | |||
| " pass\n", | |||
| "\n", | |||
| " def forward(self, x):\n", | |||
| " x = self.ac1(self.linear1(x))\n", | |||
| " x = self.ac2(self.linear2(x))\n", | |||
| " x = self.output(x)\n", | |||
| " return x\n", | |||
| " pass\n", | |||
| "\n", | |||
| " def train_step(self, x, y):\n", | |||
| " x = self(x)\n", | |||
| " return {\"loss\": self.loss_fn(x, y)}\n", | |||
| " pred = self(x)\n", | |||
| " return {\"loss\": self.loss_fn(pred, y)}\n", | |||
| "\n", | |||
| " def evaluate_step(self, x, y):\n", | |||
| " x = self(x)\n", | |||
| " x = torch.max(x, dim=-1)[1]\n", | |||
| " return {\"pred\": x, \"target\": y}" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "ca897322", | |||
| "metadata": {}, | |||
| "source": [ | |||
| " pred = self(x)\n", | |||
| " pred = torch.max(pred, dim=-1)[1]\n", | |||
| " return {\"pred\": pred, \"target\": y}\n", | |||
| "```\n", | |||
| "***\n", | |||
| "In `fastNLP 0.8`, **the method `train_step` is the default value of the `Trainer` parameter `train_fn`**\n", | |||
| "\n", | |||
| "  During training, **the `Trainer` obtains the loss for the current batch through the model method named by the parameter `_train_fn_`**\n", | |||
| "  During training, **the `Trainer` obtains the loss for the current batch through the model method named by the parameter `train_fn`**\n", | |||
| "\n", | |||
| "  Accordingly, when training, the `Trainer` first checks whether the model defines a `train_step` method\n", | |||
| "\n", | |||
| "    If it does not, the `Trainer` falls back to the model's `forward` function for the training forward pass\n", | |||
| "\n", | |||
| "Note: in `fastNLP 0.8`, the `Trainer` requires the model to return a dictionary from `train_step`, storing the loss under the key `loss`\n", | |||
| "Note: in `fastNLP 0.8`, **the `Trainer` requires `train_step` to return a dictionary** **of the form `{\"loss\": loss}`**\n", | |||
| "\n", | |||
| "  Beyond that, heavily customized behavior is possible by passing the `output_mapping` parameter to the `Trainer`; see this note for details (???)\n", | |||
| "\n", | |||
| @@ -205,7 +175,11 @@ | |||
| "\n", | |||
| "  From the user's perspective, the model returns a dictionary from its `evaluate_step` method, whose content matches the `metrics` passed to the `Evaluator`\n", | |||
| "\n", | |||
| "<!--   From the module's perspective, `fastNLP 0.8` matches the keys of this dictionary against the signature of each `metric`'s update function and automatically passes the `metric` what it needs; in other words, “**parameter matching**” is performed automatically. -->" | |||
| "  From the module's perspective, the keys of this dictionary match the signature of the `metric`'s `update` function; this mechanism of passing arguments is called “**parameter matching**”\n", | |||
| "\n", | |||
| "***\n", | |||
| "\n", | |||
| "" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -213,13 +187,52 @@ | |||
| "id": "fb3272eb", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "### 2.2 Parameter matching in the trainer\n", | |||
| "### 2.2 Parameter matching in the trainer and evaluator\n", | |||
| "\n", | |||
| "In `fastNLP 0.8`, parameter matching happens in two places\n", | |||
| "\n", | |||
| "  On one side, **in the model's forward pass**, **the `dataloader` passes a `batch` to the `train_step` or `evaluate_step` function**\n", | |||
| "\n", | |||
| "  On the other side, **during model evaluation**, **the `evaluate_dataloader` passes a `batch` to the `update` function of each `metric`**\n", | |||
| "\n", | |||
| "Parameter matching in `fastNLP 0.8` involves two aspects. First, in the forward pass during training or evaluation: if a `batch` produced by the `dataloader` is a dictionary, we inspect the parameter signatures of the model's `train_step` and `evaluate_step` methods and, for each parameter, pick the value with the matching name out of the batch dictionary. See, for example, the `__getitem__` method of `ArgMaxDatset` in the `Dataset` definition that follows. You can turn this behavior off by setting the parameter `model_wo_auto_param_call` in the `Trainer` and `Evaluator`; once turned off, the `batch` is passed directly to your `train_step`, `evaluate_step`, or `forward` function.\n", | |||
| "For the former, when the parameter `model_wo_auto_param_call` in the `Trainer` and `Evaluator` is set to `False`\n", | |||
| "\n", | |||
| "Second, after `metrics` are passed to the `Trainer` or `Evaluator`, the `metrics` are invoked at every evaluation point to score `evaluate_dataloaders`; this works by matching each `metric`'s `update` method against a `batch` of data. First note that computing a metric usually splits into `update` and `get_metric`: `update` folds in the evaluation data of one `batch`, while `get_metric` computes the final value from the accumulated data. For `Accuracy`, for instance, `update` accumulates a batch's number of correct predictions right_num and its total count total_num, and `get_metric` finally returns the score `right_num / total_num`.\n", | |||
| "    **`fastNLP 0.8` requires every `batch` produced by the `dataloader`** **to take a form like `{\"x\": x, \"y\": y}`**\n", | |||
| "\n", | |||
| "  `fastNLP 0.8` then inspects the parameter signatures of the model's `train_step` and `evaluate_step` methods and passes each parameter its matching value\n", | |||
| "\n", | |||
| "    **this dictionary form is defined in the `Dataset`'s `__getitem__` method**, as in the `ArgMaxDataset` below\n", | |||
| "\n", | |||
| "  When instead the parameter `model_wo_auto_param_call` in the `Trainer` and `Evaluator` is set to `True`\n", | |||
| "\n", | |||
| "    `fastNLP 0.8` passes the `batch` directly to the model's `train_step`, `evaluate_step`, or `forward` function\n", | |||
| "***\n", | |||
| "```python\n", | |||
| "class Dataset(torch.utils.data.Dataset):\n", | |||
| " def __init__(self, x, y):\n", | |||
| " self.x = x\n", | |||
| " self.y = y\n", | |||
| "\n", | |||
| " def __len__(self):\n", | |||
| " return len(self.x)\n", | |||
| "\n", | |||
| " def __getitem__(self, item):\n", | |||
| " return {\"x\": self.x[item], \"y\": self.y[item]}\n", | |||
| "```\n", | |||
| "***\n", | |||
| "For the latter, first note that in the `Trainer` and `Evaluator`, computing `metrics` splits into two steps, `update` and `get_metric`\n", | |||
| "\n", | |||
| "Because `metrics` in `fastNLP 0.8` are computed automatically (you only pass them to the `Trainer` or the `Evaluator`), they necessarily rely on parameter matching. For every `batch` produced by the `evaluate_dataloader`, we look at each `metric` passed to the `Trainer` (ultimately forwarded to the `Evaluator`) and to the `Evaluator`, inspect the signature of its `update` function, and pick the value for each parameter out of the `batch` dictionary by name." | |||
| "    **the `update` function** **accumulates the evaluation statistics** for one `batch` of predictions\n", | |||
| "\n", | |||
| "    **the `get_metric` function** **aggregates what the `update` function accumulated** to compute the final result\n", | |||
| "\n", | |||
| "  For `Accuracy`, for example, `update` adds a batch's number of correct predictions `right_num` and its total count `total_num`\n", | |||
| "\n", | |||
| "    while `get_metric` returns the overall score `right_num / total_num` across all batches\n", | |||
| "\n", | |||
| "  On this basis, **`fastNLP 0.8` requires every `batch` produced by the `evaluate_dataloader` to be passed to the corresponding `metric`**\n", | |||
| "\n", | |||
| "    **in the form `{\"pred\": y_pred, \"target\": y_true}`**, matching the signature of its `update` function" | |||
| ] | |||
| }, | |||
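| To make the `update` / `get_metric` split concrete, here is a hypothetical accuracy-style metric as plain Python (a sketch of the protocol only, not fastNLP's actual `Metric` base class; `SimpleAccuracy` is a made-up name and `pred` / `target` are assumed to be torch tensors): | |||
| ```python | |||
| class SimpleAccuracy: | |||
|     def __init__(self): | |||
|         self.right_num = 0   # correct predictions accumulated across batches | |||
|         self.total_num = 0   # total predictions accumulated across batches | |||
|     def update(self, pred, target): | |||
|         # called once per batch; parameter matching feeds it the | |||
|         # {"pred": ..., "target": ...} entries by name | |||
|         self.right_num += (pred == target).sum().item() | |||
|         self.total_num += target.numel() | |||
|     def get_metric(self): | |||
|         # called once at the end to aggregate the accumulated counts | |||
|         return {"acc": self.right_num / max(self.total_num, 1)} | |||
| ``` | |||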
| { | |||
| @@ -227,9 +240,65 @@ | |||
| "id": "f62b7bb1", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "### 2.3 Using the trainer in practice\n", | |||
| "### 2.3 A worked example: the argmax model\n", | |||
| "\n", | |||
| "Next we create the dataset used for training. It takes three arguments: the data dimension, the amount of data, and a random seed. It generates the requested number of vectors of dimension `feature_dimension`, and each vector's label is the index of its largest value." | |||
| "The following briefly introduces how to use the `Trainer` module by training the `argmax` model\n", | |||
| "\n", | |||
| "  First, define the `argmax` model with `torch.nn.Module`: given a vector of fixed dimension as input, it outputs the index of the largest value in it" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 1, | |||
| "id": "5314482b", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| } | |||
| }, | |||
| "outputs": [], | |||
| "source": [ | |||
| "import torch\n", | |||
| "import torch.nn as nn\n", | |||
| "\n", | |||
| "class ArgMaxModel(nn.Module):\n", | |||
| " def __init__(self, num_labels, feature_dimension):\n", | |||
| " super(ArgMaxModel, self).__init__()\n", | |||
| " self.num_labels = num_labels\n", | |||
| "\n", | |||
| " self.linear1 = nn.Linear(in_features=feature_dimension, out_features=10)\n", | |||
| " self.ac1 = nn.ReLU()\n", | |||
| " self.linear2 = nn.Linear(in_features=10, out_features=10)\n", | |||
| " self.ac2 = nn.ReLU()\n", | |||
| " self.output = nn.Linear(in_features=10, out_features=num_labels)\n", | |||
| " self.loss_fn = nn.CrossEntropyLoss()\n", | |||
| "\n", | |||
| " def forward(self, x):\n", | |||
| " pred = self.ac1(self.linear1(x))\n", | |||
| " pred = self.ac2(self.linear2(pred))\n", | |||
| " pred = self.output(pred)\n", | |||
| " return pred\n", | |||
| "\n", | |||
| " def train_step(self, x, y):\n", | |||
| " pred = self(x)\n", | |||
| " return {\"loss\": self.loss_fn(pred, y)}\n", | |||
| "\n", | |||
| " def evaluate_step(self, x, y):\n", | |||
| " pred = self(x)\n", | |||
| " pred = torch.max(pred, dim=-1)[1]\n", | |||
| " return {\"pred\": pred, \"target\": y}" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "71f3fa6b", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "  Next, define the `ArgMaxDataset` dataset with `torch.utils.data.Dataset`\n", | |||
| "\n", | |||
| "    the dataset takes three arguments: the dimension `feature_dimension`, the amount of data `data_num`, and the random seed `seed`\n", | |||
| "\n", | |||
| "    at initialization, the dataset automatically generates vectors of the given dimension and labels each one with the index of its largest value as the prediction target" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -245,7 +314,7 @@ | |||
| "source": [ | |||
| "from torch.utils.data import Dataset\n", | |||
| "\n", | |||
| "class ArgMaxDatset(Dataset):\n", | |||
| "class ArgMaxDataset(Dataset):\n", | |||
| " def __init__(self, feature_dimension, data_num=1000, seed=0):\n", | |||
| " self.num_labels = feature_dimension\n", | |||
| " self.feature_dimension = feature_dimension\n", | |||
| @@ -269,7 +338,9 @@ | |||
| "id": "2cb96332", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "Now the data and the model are ready." | |||
| "  Then instantiate the model from the `ArgMaxModel` class, keeping the input dimension `feature_dimension` consistent with the number of output labels `num_labels`\n", | |||
| "\n", | |||
| "    and instantiate two datasets from the `ArgMaxDataset` class, one for training and one for evaluation, with 1000 and 100 samples respectively" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -283,16 +354,10 @@ | |||
| }, | |||
| "outputs": [], | |||
| "source": [ | |||
| "from torch.utils.data import DataLoader\n", | |||
| "\n", | |||
| "train_dataset = ArgMaxDatset(feature_dimension=10, data_num=1000)\n", | |||
| "evaluate_dataset = ArgMaxDatset(feature_dimension=10, data_num=100)\n", | |||
| "\n", | |||
| "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", | |||
| "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)\n", | |||
| "model = ArgMaxModel(num_labels=10, feature_dimension=10)\n", | |||
| "\n", | |||
| "# num_labels is set to 10 to match feature_dimension, since we predict which of the ten positions holds the largest value.\n", | |||
| "model = ArgMaxModel(num_labels=10, feature_dimension=10)" | |||
| "train_dataset = ArgMaxDataset(feature_dimension=10, data_num=1000)\n", | |||
| "evaluate_dataset = ArgMaxDataset(feature_dimension=10, data_num=100)" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -300,12 +365,33 @@ | |||
| "id": "4e7d25ee", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "Define the optimizer as well." | |||
| "  In addition, build two data-loading modules with `torch.utils.data.DataLoader`, both with batch size 8, one for training and one for evaluation" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 4, | |||
| "id": "363b5b09", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [ | |||
| "from torch.utils.data import DataLoader\n", | |||
| "\n", | |||
| "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", | |||
| "evaluate_dataloader = DataLoader(evaluate_dataset, batch_size=8)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "c8d4443f", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "  Finally, build an optimization module with `torch.optim.SGD`, based on stochastic gradient descent" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 5, | |||
| "id": "dc28a2d9", | |||
| "metadata": { | |||
| "pycharm": { | |||
| @@ -321,15 +407,33 @@ | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "4f1fba81", | |||
| "id": "eb8ca6cf", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "## 3. Training the argmax model with fastNLP 0.8\n", | |||
| "\n", | |||
| "### 3.1 Evaluator initialized outside the trainer" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "55145553", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "Everything is ready; let's start training with the Trainer!" | |||
| "Import the `Trainer` class from the `fastNLP` library and initialize a `trainer` instance to train the model\n", | |||
| "\n", | |||
| "  it takes the previously defined model `model`, the matching data-loading module `train_dataloader`, and the optimization module `optimizer`\n", | |||
| "\n", | |||
| "  `progress_bar` sets the progress-bar style; the default is `\"auto\"`, and `\"rich\"`, `\"raw\"`, and `None` are also available\n", | |||
| "\n", | |||
| "    with the `\"auto\"` and `\"rich\"` styles, however, the progress bar is not shown after training finishes (???)\n", | |||
| "\n", | |||
| "  `n_epochs` sets the number of training epochs, 20 by default; all of the `Trainer`'s attributes and methods can be listed with `dir(trainer)`" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 5, | |||
| "execution_count": 6, | |||
| "id": "b51b7a2d", | |||
| "metadata": { | |||
| "pycharm": { | |||
| @@ -349,167 +453,20 @@ | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "['__annotations__',\n", | |||
| " '__class__',\n", | |||
| " '__delattr__',\n", | |||
| " '__dict__',\n", | |||
| " '__dir__',\n", | |||
| " '__doc__',\n", | |||
| " '__eq__',\n", | |||
| " '__format__',\n", | |||
| " '__ge__',\n", | |||
| " '__getattribute__',\n", | |||
| " '__gt__',\n", | |||
| " '__hash__',\n", | |||
| " '__init__',\n", | |||
| " '__init_subclass__',\n", | |||
| " '__le__',\n", | |||
| " '__lt__',\n", | |||
| " '__module__',\n", | |||
| " '__ne__',\n", | |||
| " '__new__',\n", | |||
| " '__reduce__',\n", | |||
| " '__reduce_ex__',\n", | |||
| " '__repr__',\n", | |||
| " '__setattr__',\n", | |||
| " '__sizeof__',\n", | |||
| " '__str__',\n", | |||
| " '__subclasshook__',\n", | |||
| " '__weakref__',\n", | |||
| " '_check_callback_called_legality',\n", | |||
| " '_check_train_batch_loop_legality',\n", | |||
| " '_custom_callbacks',\n", | |||
| " '_driver',\n", | |||
| " '_evaluate_dataloaders',\n", | |||
| " '_fetch_matched_fn_callbacks',\n", | |||
| " '_set_num_eval_batch_per_dl',\n", | |||
| " '_train_batch_loop',\n", | |||
| " '_train_dataloader',\n", | |||
| " '_train_step',\n", | |||
| " '_train_step_signature_fn',\n", | |||
| " 'accumulation_steps',\n", | |||
| " 'add_callback_fn',\n", | |||
| " 'backward',\n", | |||
| " 'batch_idx_in_epoch',\n", | |||
| " 'batch_step_fn',\n", | |||
| " 'callback_manager',\n", | |||
| " 'check_batch_step_fn',\n", | |||
| " 'cur_epoch_idx',\n", | |||
| " 'data_device',\n", | |||
| " 'dataloader',\n", | |||
| " 'device',\n", | |||
| " 'driver',\n", | |||
| " 'driver_name',\n", | |||
| " 'epoch_validate',\n", | |||
| " 'evaluate_batch_step_fn',\n", | |||
| " 'evaluate_dataloaders',\n", | |||
| " 'evaluate_every',\n", | |||
| " 'evaluate_fn',\n", | |||
| " 'evaluator',\n", | |||
| " 'extract_loss_from_outputs',\n", | |||
| " 'fp16',\n", | |||
| " 'get_no_sync_context',\n", | |||
| " 'global_forward_batches',\n", | |||
| " 'has_checked_train_batch_loop',\n", | |||
| " 'input_mapping',\n", | |||
| " 'kwargs',\n", | |||
| " 'larger_better',\n", | |||
| " 'load',\n", | |||
| " 'load_model',\n", | |||
| " 'marker',\n", | |||
| " 'metrics',\n", | |||
| " 'model',\n", | |||
| " 'model_device',\n", | |||
| " 'monitor',\n", | |||
| " 'move_data_to_device',\n", | |||
| " 'n_epochs',\n", | |||
| " 'num_batches_per_epoch',\n", | |||
| " 'on',\n", | |||
| " 'on_after_backward',\n", | |||
| " 'on_after_optimizers_step',\n", | |||
| " 'on_after_trainer_initialized',\n", | |||
| " 'on_after_zero_grad',\n", | |||
| " 'on_before_backward',\n", | |||
| " 'on_before_optimizers_step',\n", | |||
| " 'on_before_zero_grad',\n", | |||
| " 'on_exception',\n", | |||
| " 'on_fetch_data_begin',\n", | |||
| " 'on_fetch_data_end',\n", | |||
| " 'on_load_checkpoint',\n", | |||
| " 'on_load_model',\n", | |||
| " 'on_sanity_check_begin',\n", | |||
| " 'on_sanity_check_end',\n", | |||
| " 'on_save_checkpoint',\n", | |||
| " 'on_save_model',\n", | |||
| " 'on_train_batch_begin',\n", | |||
| " 'on_train_batch_end',\n", | |||
| " 'on_train_begin',\n", | |||
| " 'on_train_end',\n", | |||
| " 'on_train_epoch_begin',\n", | |||
| " 'on_train_epoch_end',\n", | |||
| " 'on_validate_begin',\n", | |||
| " 'on_validate_end',\n", | |||
| " 'optimizers',\n", | |||
| " 'output_mapping',\n", | |||
| " 'run',\n", | |||
| " 'save',\n", | |||
| " 'save_model',\n", | |||
| " 'set_grad_to_none',\n", | |||
| " 'state',\n", | |||
| " 'step',\n", | |||
| " 'step_validate',\n", | |||
| " 'total_batches',\n", | |||
| " 'train_batch_loop',\n", | |||
| " 'train_dataloader',\n", | |||
| " 'train_fn',\n", | |||
| " 'train_step',\n", | |||
| " 'trainer_state',\n", | |||
| " 'zero_grad']" | |||
| ] | |||
| }, | |||
| "execution_count": 5, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "from fastNLP import Trainer\n", | |||
| "\n", | |||
| "# define a Trainer\n", | |||
| "trainer = Trainer(\n", | |||
| " model=model,\n", | |||
| "    driver=\"torch\",  # train with the pytorch backend\n", | |||
| "    device=0,  # use GPU:0\n", | |||
| " driver=\"torch\",\n", | |||
| " device='cuda',\n", | |||
| " train_dataloader=train_dataloader,\n", | |||
| " optimizers=optimizer,\n", | |||
| "    n_epochs=10,  # train for 10 epochs\n", | |||
| " progress_bar=\"rich\"\n", | |||
| ")\n", | |||
| "dir(trainer)" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 8, | |||
| "id": "f8fe9c32", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "name": "stdout", | |||
| "output_type": "stream", | |||
| "text": [ | |||
| "FullArgSpec(args=['self', 'num_train_batch_per_epoch', 'num_eval_batch_per_dl', 'num_eval_sanity_batch', 'resume_from', 'resume_training', 'catch_KeyboardInterrupt'], varargs=None, varkw=None, defaults=(-1, -1, 2, None, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'num_train_batch_per_epoch': <class 'int'>, 'num_eval_batch_per_dl': <class 'int'>, 'num_eval_sanity_batch': <class 'int'>, 'resume_from': <class 'str'>, 'resume_training': <class 'bool'>})\n" | |||
| ] | |||
| } | |||
| ], | |||
| "source": [ | |||
| "import inspect \n", | |||
| "\n", | |||
| "print(inspect.getfullargspec(trainer.run))" | |||
| "    n_epochs=10,  # set the number of training epochs \n", | |||
| "    progress_bar=\"auto\"  # set the progress-bar style\n", | |||
| ")" | |||
| ] | |||
| }, | |||
| { | |||
| @@ -517,16 +474,20 @@ | |||
| "id": "6e202d6e", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "No problems, so let's start the real training!" | |||
| "Training is started through the `run` method of the `Trainer` class\n", | |||
| "\n", | |||
| "  the parameter `num_train_batch_per_epoch` controls after how many `batch`es each `epoch` stops; by default every batch is run\n", | |||
| "\n", | |||
| "  the full parameter list of `run` can be inspected with `inspect.getfullargspec(trainer.run)`" | |||
| ] | |||
| }, | |||
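| For illustration, a hedged sketch of calling `run` (the `-1` default matches the `FullArgSpec` output shown in this notebook; the value `5` is only an example): | |||
| ```python | |||
| trainer.run(num_train_batch_per_epoch=-1)   # -1 (the default): run every batch of each epoch | |||
| # trainer.run(num_train_batch_per_epoch=5)  # or stop each epoch after 5 batches | |||
| ``` | |||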
| { | |||
| "cell_type": "code", | |||
| "execution_count": 9, | |||
| "execution_count": 7, | |||
| "id": "ba047ead", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [ | |||
| @@ -585,29 +546,27 @@ | |||
| "trainer.run()" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "eb8ca6cf", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "## 3. Evaluating a model with the evaluator" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "c16c5fa4", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "The model is trained; let's evaluate it with the Evaluator and see how it does." | |||
| "Import the `Evaluator` class from the `fastNLP` library and initialize an `evaluator` instance to evaluate the model\n", | |||
| "\n", | |||
| "  it takes the previously defined model `model` and the matching data-loading module `evaluate_dataloader`\n", | |||
| "\n", | |||
| "  note the evaluation methods `metrics`: they must be given as a dictionary of the form `{'acc': fastNLP.core.metrics.Accuracy()}`\n", | |||
| "\n", | |||
| "  similarly, the progress-bar style can be set through `progress_bar`, `\"auto\"` by default" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 10, | |||
| "execution_count": 8, | |||
| "id": "1c6b6b36", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [], | |||
| @@ -617,100 +576,32 @@ | |||
| "\n", | |||
| "evaluator = Evaluator(\n", | |||
| " model=model,\n", | |||
| "    driver=trainer.driver,  # use the driver the trainer has already started;\n", | |||
| "    driver=trainer.driver,  # must use the driver the trainer has already started\n", | |||
| " device=None,\n", | |||
| " dataloaders=evaluate_dataloader,\n", | |||
| "    metrics={'acc': Accuracy()}  # note that this must be a dictionary;\n", | |||
| "    metrics={'acc': Accuracy()}  # a dictionary of exactly this form is required\n", | |||
| ")" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 11, | |||
| "id": "257061df", | |||
| "metadata": { | |||
| "scrolled": true | |||
| }, | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "['__annotations__',\n", | |||
| " '__class__',\n", | |||
| " '__delattr__',\n", | |||
| " '__dict__',\n", | |||
| " '__dir__',\n", | |||
| " '__doc__',\n", | |||
| " '__eq__',\n", | |||
| " '__format__',\n", | |||
| " '__ge__',\n", | |||
| " '__getattribute__',\n", | |||
| " '__gt__',\n", | |||
| " '__hash__',\n", | |||
| " '__init__',\n", | |||
| " '__init_subclass__',\n", | |||
| " '__le__',\n", | |||
| " '__lt__',\n", | |||
| " '__module__',\n", | |||
| " '__ne__',\n", | |||
| " '__new__',\n", | |||
| " '__reduce__',\n", | |||
| " '__reduce_ex__',\n", | |||
| " '__repr__',\n", | |||
| " '__setattr__',\n", | |||
| " '__sizeof__',\n", | |||
| " '__str__',\n", | |||
| " '__subclasshook__',\n", | |||
| " '__weakref__',\n", | |||
| " '_dist_sampler',\n", | |||
| " '_evaluate_batch_loop',\n", | |||
| " '_evaluate_step',\n", | |||
| " '_evaluate_step_signature_fn',\n", | |||
| " '_metric_wrapper',\n", | |||
| " '_metrics',\n", | |||
| " 'dataloaders',\n", | |||
| " 'device',\n", | |||
| " 'driver',\n", | |||
| " 'evaluate_batch_loop',\n", | |||
| " 'evaluate_batch_step_fn',\n", | |||
| " 'evaluate_fn',\n", | |||
| " 'evaluate_step',\n", | |||
| " 'finally_progress_bar',\n", | |||
| " 'get_dataloader_metric',\n", | |||
| " 'input_mapping',\n", | |||
| " 'metrics',\n", | |||
| " 'metrics_wrapper',\n", | |||
| " 'model',\n", | |||
| " 'model_use_eval_mode',\n", | |||
| " 'move_data_to_device',\n", | |||
| " 'output_mapping',\n", | |||
| " 'progress_bar',\n", | |||
| " 'remove_progress_bar',\n", | |||
| " 'reset',\n", | |||
| " 'run',\n", | |||
| " 'separator',\n", | |||
| " 'start_progress_bar',\n", | |||
| " 'update',\n", | |||
| " 'update_progress_bar',\n", | |||
| " 'verbose']" | |||
| ] | |||
| }, | |||
| "execution_count": 11, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "cell_type": "markdown", | |||
| "id": "8157bb9b", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "dir(evaluator)" | |||
| "Evaluation is started through the `run` method of the `Evaluator` class\n", | |||
| "\n", | |||
| "  the parameter `num_eval_batch_per_dl` controls after how many `batch`es each `evaluate_dataloader` stops; by default every batch is run\n", | |||
| "\n", | |||
| "  finally, it outputs a dictionary of the form `{'acc#acc': acc}`; the intermediate progress bar is discarded once the run finishes (???)" | |||
| ] | |||
| }, | |||
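| For illustration, a hedged sketch (the value `4` is only an example; as noted above, by default all batches are evaluated): | |||
| ```python | |||
| evaluator.run(num_eval_batch_per_dl=4)  # score only the first 4 batches of each dataloader | |||
| ``` | |||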
| { | |||
| "cell_type": "code", | |||
| "execution_count": 12, | |||
| "execution_count": 9, | |||
| "id": "f7cb0165", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [ | |||
| @@ -750,11 +641,11 @@ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'acc#acc'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.3</span><span style=\"font-weight: bold\">}</span>\n", | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'acc#acc'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.43</span><span style=\"font-weight: bold\">}</span>\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.3\u001b[0m\u001b[1m}\u001b[0m\n" | |||
| "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.43\u001b[0m\u001b[1m}\u001b[0m\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| @@ -763,10 +654,10 @@ | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "{'acc#acc': 0.3}" | |||
| "{'acc#acc': 0.43}" | |||
| ] | |||
| }, | |||
| "execution_count": 12, | |||
| "execution_count": 9, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| @@ -780,39 +671,37 @@ | |||
| "id": "dd9f68fa", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "## 4. Adding metrics to the trainer for automatic evaluation" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "markdown", | |||
| "id": "ca97c9a4", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "Now let's try evaluating during training." | |||
| "### 3.2 Evaluator initialized inside the trainer \n", | |||
| "\n", | |||
| "Passing `evaluate_dataloaders` and `metrics` when initializing the `trainer` instance enables evaluation during training\n", | |||
| "\n", | |||
| "  `progress_bar` sets the style of both the training and the evaluation progress bars; the bar is not shown after training finishes (???)\n", | |||
| "\n", | |||
| "  **`evaluate_every` sets the evaluation frequency** and may be a negative number, a positive number, or a function:\n", | |||
| "\n", | |||
| "    **when negative**, **it means evaluate every so many `epoch`s**; **when positive**, **evaluate every so many `batch`es**" | |||
| ] | |||
| }, | |||
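| For illustration, a hedged sketch of the three accepted forms (the callable form is an assumption here: it is sketched as receiving the trainer and returning a bool, and `every_1000_batches` is a made-up name): | |||
| ```python | |||
| trainer = Trainer(model=model, train_dataloader=train_dataloader, optimizers=optimizer, | |||
|                   evaluate_dataloaders=evaluate_dataloader, metrics={'acc': Accuracy()}, | |||
|                   evaluate_every=-1)   # negative: evaluate once every 1 epoch | |||
| # evaluate_every=100 would instead evaluate once every 100 batches | |||
| def every_1000_batches(trainer): | |||
|     # evaluate whenever another 1000 forward batches have been run | |||
|     return trainer.global_forward_batches % 1000 == 0 | |||
| # evaluate_every=every_1000_batches would use this custom schedule | |||
| ``` | |||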
| { | |||
| "cell_type": "code", | |||
| "execution_count": 13, | |||
| "execution_count": 10, | |||
| "id": "183c7d19", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [], | |||
| "source": [ | |||
| "# define a new Trainer\n", | |||
| "\n", | |||
| "trainer = Trainer(\n", | |||
| " model=model,\n", | |||
| "    driver=trainer.driver,  # since we are in the same script, the driver must likewise be reused;\n", | |||
| "    driver=trainer.driver,  # since this is the same script, the driver must likewise be reused\n", | |||
| " train_dataloader=train_dataloader,\n", | |||
| " evaluate_dataloaders=evaluate_dataloader,\n", | |||
| " metrics={'acc': Accuracy()},\n", | |||
| " optimizers=optimizer,\n", | |||
| "    n_epochs=10,  # train for 10 epochs;\n", | |||
| "    evaluate_every=-1,  # evaluate at the end of every epoch;\n", | |||
| "    n_epochs=10, \n", | |||
| "    evaluate_every=-1,  # evaluate at the end of every epoch\n", | |||
| ")" | |||
| ] | |||
| }, | |||
| @@ -821,16 +710,18 @@ | |||
| "id": "714cc404", | |||
| "metadata": {}, | |||
| "source": [ | |||
| "Train again." | |||
| "Training is started through the `run` method of the `Trainer` class\n", | |||
| "\n", | |||
| "  the parameter `num_eval_sanity_batch` also controls how many `evaluate_batch`es are run as a sanity check before training, 2 by default" | |||
| ] | |||
| }, | |||
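| For illustration, a hedged sketch (2 is the default per the `FullArgSpec` output earlier; treating 0 as skipping the sanity check is an assumption): | |||
| ```python | |||
| trainer.run(num_eval_sanity_batch=2)  # run 2 evaluation batches as a sanity check before training | |||
| ``` | |||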
| { | |||
| "cell_type": "code", | |||
| "execution_count": 14, | |||
| "execution_count": 11, | |||
| "id": "2e4daa2c", | |||
| "metadata": { | |||
| "pycharm": { | |||
| "is_executing": false | |||
| "is_executing": true | |||
| } | |||
| }, | |||
| "outputs": [ | |||
| @@ -884,96 +775,6 @@ | |||
| "source": [ | |||
| "trainer.run()" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 15, | |||
| "id": "eabda5eb", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [ | |||
| "evaluator = Evaluator(\n", | |||
| " model=model,\n", | |||
| "    driver=trainer.driver,  # use the driver the trainer has already started;\n", | |||
| " dataloaders=evaluate_dataloader,\n", | |||
| "    metrics={'acc': Accuracy()}  # note that this must be a dictionary;\n", | |||
| ")" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": 16, | |||
| "id": "a310d157", | |||
| "metadata": {}, | |||
| "outputs": [ | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"></pre>\n" | |||
| ], | |||
| "text/plain": [] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/html": [ | |||
| "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'acc#acc'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.5</span><span style=\"font-weight: bold\">}</span>\n", | |||
| "</pre>\n" | |||
| ], | |||
| "text/plain": [ | |||
| "\u001b[1m{\u001b[0m\u001b[32m'acc#acc'\u001b[0m: \u001b[1;36m0.5\u001b[0m\u001b[1m}\u001b[0m\n" | |||
| ] | |||
| }, | |||
| "metadata": {}, | |||
| "output_type": "display_data" | |||
| }, | |||
| { | |||
| "data": { | |||
| "text/plain": [ | |||
| "{'acc#acc': 0.5}" | |||
| ] | |||
| }, | |||
| "execution_count": 16, | |||
| "metadata": {}, | |||
| "output_type": "execute_result" | |||
| } | |||
| ], | |||
| "source": [ | |||
| "evaluator.run()" | |||
| ] | |||
| }, | |||
| { | |||
| "cell_type": "code", | |||
| "execution_count": null, | |||
| "id": "f1ef78f0", | |||
| "metadata": {}, | |||
| "outputs": [], | |||
| "source": [] | |||
| } | |||
| ], | |||
| "metadata": { | |||