
Remove core.samplers.sampler.py; add torch gradient-clipping and warmup callbacks

tags/v1.0.0alpha
yh_cc 2 years ago
commit 7c70874b4a
14 changed files with 275 additions and 770 deletions
  1. +5 -1 fastNLP/core/callbacks/__init__.py
  2. +8 -0 fastNLP/core/callbacks/torch_callbacks/__init__.py
  3. +52 -0 fastNLP/core/callbacks/torch_callbacks/torch_grad_clip_callback.py
  4. +58 -0 fastNLP/core/callbacks/torch_callbacks/torch_lr_sched_callback.py
  5. +1 -1 fastNLP/core/drivers/driver.py
  6. +7 -1 fastNLP/core/drivers/torch_driver/single_device.py
  7. +0 -6 fastNLP/core/samplers/__init__.py
  8. +0 -728 fastNLP/core/samplers/sampler.py
  9. +1 -2 fastNLP/io/loader/conll.py
  10. +0 -0 tests/core/callbacks/torch_callbacks/__init__.py
  11. +41 -0 tests/core/callbacks/torch_callbacks/test_torch_grad_clip_callback.py
  12. +34 -0 tests/core/callbacks/torch_callbacks/test_torch_warmup_callback.py
  13. +0 -31 tests/core/samplers/test_sampler.py
  14. +68 -0 tests/helpers/callbacks/prepare_trainer_args_for_torch_test.py

+5 -1 fastNLP/core/callbacks/__init__.py

@@ -11,7 +11,10 @@ __all__ = [
     'RichCallback',
     "LRSchedCallback",
     'LoadBestModelCallback',
-    "EarlyStopCallback"
+    "EarlyStopCallback",
+
+    "TorchWarmupCallback",
+    "TorchGradClipCallback"
 ]

@@ -23,4 +26,5 @@ from .progress_callback import choose_progress_callback, ProgressCallback, RichCallback
 from .lr_scheduler_callback import LRSchedCallback
 from .load_best_model_callback import LoadBestModelCallback
 from .early_stop_callback import EarlyStopCallback
+from .torch_callbacks import *



+8 -0 fastNLP/core/callbacks/torch_callbacks/__init__.py

@@ -0,0 +1,8 @@
__all__ = [
    'TorchWarmupCallback',
    'TorchGradClipCallback'
]


from .torch_lr_sched_callback import TorchWarmupCallback
from .torch_grad_clip_callback import TorchGradClipCallback

+52 -0 fastNLP/core/callbacks/torch_callbacks/torch_grad_clip_callback.py

@@ -0,0 +1,52 @@
__all__ = [
    'TorchGradClipCallback'
]
from ..callback import Callback


class TorchGradClipCallback(Callback):
    def __init__(self, clip_value=1, clip_type='norm', parameters=None):
        r"""
        Clip the gradients of the parameters before every optimizer update.

        :param float clip_value: the clipping threshold; clip_value must be positive
        :param str clip_type: two modes are supported::

            1. 'norm': rescale the gradients so that their total norm is at most clip_value

            2. 'value': clamp each gradient to [-clip_value, clip_value];
               gradients below -clip_value are set to -clip_value,
               gradients above clip_value are set to clip_value.
        :param None,torch.Tensor,List[torch.Tensor] parameters: usually obtained via model.parameters().
            If None, the gradients of all parameters in the Trainer's optimizers are clipped.
        """
        super().__init__()

        from torch import nn
        if clip_type == 'norm':
            self.clip_fun = nn.utils.clip_grad_norm_
        elif clip_type == 'value':
            self.clip_fun = nn.utils.clip_grad_value_
        else:
            raise ValueError("Only supports `norm` or `value` right now.")
        if parameters is not None:
            self.parameters = list(parameters)
        else:
            self.parameters = None
        self.clip_value = clip_value

    def on_after_trainer_initialized(self, trainer, driver):
        assert 'torch' in driver.__class__.__name__.lower(), f"Callback:{self.__class__.__name__} only supports torch " \
                                                             f"related drivers for now."
        if self.parameters is None:  # fall back to every parameter in the Trainer's optimizers
            parameters = []
            for optimizer in trainer.driver.optimizers:
                for param_group in optimizer.param_groups:
                    parameters.extend(param_group['params'])
            self.parameters = parameters
        assert len(self.parameters), "There are no parameters to clip."

    def on_before_optimizers_step(self, trainer, optimizers):
        for optimizer in trainer.driver.optimizers:
            trainer.driver.grad_scaler.unscale_(optimizer)
        self.clip_fun(self.parameters, self.clip_value)
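
For orientation, a minimal usage sketch (not part of the commit) wiring the new callback into a Trainer; it reuses the get_trainer_args helper added at the bottom of this commit:

from fastNLP import Trainer
from fastNLP.core.callbacks import TorchGradClipCallback
from tests.helpers.callbacks.prepare_trainer_args_for_torch_test import get_trainer_args

kwargs = get_trainer_args(lr=0.1, device='cpu')
# clip the total gradient norm of all parameters in the Trainer's optimizers to 5
kwargs['callbacks'] = [TorchGradClipCallback(clip_value=5, clip_type='norm')]
Trainer(**kwargs).run()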

+58 -0 fastNLP/core/callbacks/torch_callbacks/torch_lr_sched_callback.py

@@ -0,0 +1,58 @@
__all__ = [
    'TorchWarmupCallback'
]
import math

from ..callback import Callback


class TorchWarmupCallback(Callback):
    def __init__(self, warmup=0.1, schedule='constant'):
        r"""
        Callback that adjusts the learning rate. It only takes effect on steps where a parameter update actually happens.

        :param int,float warmup: if warmup is an int, the learning rate follows the schedule for the first warmup steps;
            if warmup is a float such as 0.1, the first 10% of the steps follow the schedule.
        :param str schedule: how to adjust the learning rate:
            linear: rise to the target learning rate (taken from the optimizer passed to the Trainer) during the warmup steps, then decay linearly to 0 over the remaining steps;
            constant: rise to the target learning rate during the warmup steps, then keep it constant for the remaining steps.
        """
        super().__init__()
        self.warmup = max(warmup, 0.)

        self.initial_lrs = []  # the initial learning rate of each param_group
        if schedule == 'constant':
            self.get_lr = self._get_constant_lr
        elif schedule == 'linear':
            self.get_lr = self._get_linear_lr
        else:
            raise RuntimeError("Only 'linear' and 'constant' are supported.")

    def _get_constant_lr(self, progress):
        if progress < self.warmup:
            return progress / self.warmup
        return 1

    def _get_linear_lr(self, progress):
        if progress < self.warmup:
            return progress / self.warmup
        return max((progress - 1.) / (self.warmup - 1.), 0.)

    def on_train_begin(self, trainer):
        self.t_steps = trainer.total_batches
        if self.warmup > 1:
            self.warmup = self.warmup / self.t_steps
        self.t_steps = max(2, self.t_steps)  # must be at least 2
        # round t_steps up so it is divisible by accumulation_steps
        self.t_steps = math.ceil(self.t_steps / trainer.accumulation_steps) * trainer.accumulation_steps
        # record the initial learning rate of each param_group
        for optimizer in trainer.driver.optimizers:
            for group in optimizer.param_groups:
                self.initial_lrs.append(group['lr'])

    def on_before_optimizers_step(self, trainer, optimizers):
        # accumulation_steps is added here so that the lr does not start from 0
        progress = (trainer.global_forward_batches + trainer.accumulation_steps) / self.t_steps
        for optimizer in trainer.driver.optimizers:
            for lr, group in zip(self.initial_lrs, optimizer.param_groups):
                group['lr'] = lr * self.get_lr(progress)
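
A standalone sanity check of the schedule arithmetic above (plain Python, not part of the commit): with warmup=0.1 the multiplier rises linearly to 1.0 over the first 10% of progress, then either stays at 1.0 ('constant') or decays linearly to 0 ('linear'):

def lr_multiplier(progress, warmup=0.1, schedule='linear'):
    if progress < warmup:
        return progress / warmup
    if schedule == 'constant':
        return 1.0
    return max(0., (progress - 1.) / (warmup - 1.))

for step in (5, 10, 55, 100):
    print(step, round(lr_multiplier(step / 100), 3))  # -> 0.5, 1.0, 0.5, 0.0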

+1 -1 fastNLP/core/drivers/driver.py

@@ -129,7 +129,7 @@ class Driver(ABC):
     @property
     def optimizers(self) -> List:
         r"""
-        As shown below, the optimizers returned by the driver are always a List; if the user passes a single optimzer to the Trainer, we
+        As shown below, the optimizers returned by the driver are always a List; if the user passes a single optimizer to the Trainer, we
         wrap it in a List;

         :return: List[optimizer0, optimizer1, optimizer2, ...]


+7 -1 fastNLP/core/drivers/torch_driver/single_device.py

@@ -37,7 +37,12 @@ class TorchSingleDriver(TorchDriver):
         super(TorchSingleDriver, self).__init__(model, fp16=fp16, **kwargs)

         if device is None:
-            raise ValueError("Parameter `device` can not be None in `TorchSingleDriver`.")
+            logger.debug("device is not set, fastNLP will try to automatically get it.")
+            try:
+                device = next(model.parameters()).device
+                assert isinstance(device, torch.device)
+            except:
+                raise ValueError("fastNLP cannot get device automatically, please set device explicitly.")

         self.model_device = device

@@ -70,6 +75,7 @@ class TorchSingleDriver(TorchDriver):

             return self.model, model.forward
         else:
+            # TODO calling a model method directly like this bypasses hooks; perhaps add a warning so that users with hooks know train_step will not trigger them.
             if hasattr(self.model, fn):
                 fn = getattr(self.model, fn)
                 if not callable(fn):
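
The new fallback boils down to taking the device of the model's first parameter. A standalone sketch (not part of the commit):

import torch
from torch import nn

model = nn.Linear(3, 3)                   # lives on CPU by default
device = next(model.parameters()).device  # -> device(type='cpu')
assert isinstance(device, torch.device)
# a model without parameters raises StopIteration here, which the bare
# except above converts into the "set device explicitly" ValueError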


+0 -6 fastNLP/core/samplers/__init__.py

@@ -1,9 +1,4 @@
 __all__ = [
-    'BucketSampler',
-    'SortedSampler',
-    'ConstTokenNumSampler',
-    'ConstantTokenNumSampler',
-
     'MixSampler',
     'DopedSampler',
     'MixSequentialSampler',
@@ -26,7 +21,6 @@ __all__ = [
     "re_instantiate_sampler"
 ]

-from .sampler import BucketSampler, SortedSampler, ConstTokenNumSampler, ConstantTokenNumSampler
 from .unrepeated_sampler import UnrepeatedSampler, UnrepeatedRandomSampler, UnrepeatedSortedSampler, UnrepeatedSequentialSampler
 from .mix_sampler import MixSampler, DopedSampler, MixSequentialSampler, PollingSampler
 from .reproducible_sampler import ReproducibleSampler, RandomSampler, SequentialSampler, SortedSampler
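
Note that SortedSampler stays importable from fastNLP.core.samplers (now backed by the reproducible implementation), while BucketSampler, ConstTokenNumSampler and ConstantTokenNumSampler leave the public API. A quick check against a build of this revision:

from fastNLP.core.samplers import SortedSampler    # still available, reproducible version
# from fastNLP.core.samplers import BucketSampler  # now raises ImportError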


+0 -728 fastNLP/core/samplers/sampler.py

@@ -1,728 +0,0 @@
r"""
Subclasses of Sampler here implement the various samplers fastNLP needs.
"""

__all__ = [
"BucketSampler",
"SortedSampler",
'ConstTokenNumSampler',
"ConstantTokenNumSampler",
]

from itertools import chain
from typing import List, Iterable

import numpy as np

from fastNLP.envs.imports import _NEED_IMPORT_TORCH

if _NEED_IMPORT_TORCH:
from torch.utils.data import Sampler
else:
from fastNLP.core.utils.dummy_class import DummyClass as Sampler

# class DopedSampler(Sampler):
# """
# A BatchSampler customized for MixDataLoader: it draws mixed samples from the passed-in list of datasets and packs them into batches.
# """
#
# def __init__(self, dataset: Union[List, Dict], batch_size: int = None,
# sampler: Union[List[Sampler], Dict[str, Sampler]] = None,
# ds_ratio: Union[str, None, List[float], Dict[str, float]] = None, drop_last: bool = False) -> None:
# if batch_size <= 0:
# raise ValueError("batch_size should be a positive integer value, "
# "but got batch_size={}".format(batch_size))
# if not isinstance(drop_last, bool):
# raise ValueError("drop_last should be a boolean value, but got "
# "drop_last={}".format(drop_last))
# self.batch_size = batch_size
# self.drop_last = drop_last
# self.ds_ratio = ds_ratio
# if sampler is None:
# if isinstance(dataset, List):
# self.sampler = [SequentialSampler(ds) for ds in dataset]
# elif isinstance(dataset, Dict):
# self.sampler = {name: SequentialSampler(ds) for name, ds in dataset.items()}
#
# elif isinstance(sampler, List):
# if len(sampler) != len(dataset):
# raise ValueError("the length of sampler != the length of dataset")
# self.sampler = sampler
# else:
# self.sampler = sampler
# if ds_ratio == 'pad_to_most' or ds_ratio == 'truncate_to_least' or ds_ratio is None:
# self.ds_ratio = ds_ratio
# elif isinstance(ds_ratio, List):
# if not all(item >= 0 for item in ds_ratio):
# raise ValueError("batch_size should be a positive integer value, "
# "but got batch_size={}".format(ds_ratio))
# self.ds_ratio = ds_ratio
# else:
# raise ValueError(f"{ds_ratio} must be pad_to_least or truncate_to_least or None")
#
# def __iter__(self):
# samplers, index = [], 0
# if isinstance(self.sampler, List):
# for idx, sampler in enumerate(self.sampler):
# samplers.append((iter(sampler), self.batch_size, index, 0, idx))
# index += len(sampler)
# elif isinstance(self.sampler, Dict):
# for name, sampler in self.sampler.items():
# samplers.append((iter(sampler), self.batch_size, index, 0, name))
# index += len(sampler)
#
# def __len__(self):
# lens = 0
# max_len, ds_len = 0, 0
# if self.ds_ratio == 'truncate_to_least':
# if isinstance(self.sampler, List):
# max_len = min(len(sampler) for sampler in self.sampler)
# ds_len = len(self.sampler)
# elif isinstance(self.sampler, Dict):
# max_len = min(len(sampler) for _, sampler in self.sampler.items())
# for _, _ in self.sampler.items():
# ds_len += 1
#
# elif self.ds_ratio == 'pad_to_most':
# if isinstance(self.sampler, List):
# max_len = max(len(sampler) for sampler in self.sampler)
# ds_len = len(self.sampler)
# elif isinstance(self.sampler, Dict):
# max_len = max(len(sampler) for _, sampler in self.sampler.items())
# for _, _ in self.sampler.items():
# ds_len += 1
#
# if self.ds_ratio is None:
# if isinstance(self.sampler, List):
# for i in range(len(self.sampler)):
# sampler = self.sampler[i]
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# elif isinstance(self.sampler, Dict):
# for name, sampler in self.sampler.items():
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# elif self.ds_ratio == 'truncate_to_least' or self.ds_ratio == 'pad_to_most':
# for i in range(ds_len):
# if self.drop_last:
# lens += max_len // self.batch_size
# else:
# lens += (max_len + self.batch_size - 1) // self.batch_size
# return lens
#
# def demo(self):
# indexes = np.array([0]*self.batch_size + [1]*self.batch_size + [2]*self.batch_size)
# shift = np.array([0]*self.batch_size + [len(ds1)]*self.batch_size + [len(ds1)+len(ds2)]*self.batch_size)
# buffer = np.zeros(self.batch_size*self.num_ds, dtype=int)
# select_sampler = np.random.randint(0, self.batch_size*self.num_ds, num_sample=self.batch_size)
# select_indices = buffer[select_sampler] + shift[select_sampler]
# num_1 = (indexes[select_sampler]==0).sum()
#


# class MixSequentialSampler(Sampler):
# """
# A BatchSampler customized for MixDataLoader: it samples the passed-in list of datasets sequentially, returning indices and only moving on once the previous dataset is finished.
# """
#
# def __init__(self, dataset: Union[List, Dict], batch_size: int = None,
# sampler: Union[List[Sampler], Dict[str, Sampler], None] = None,
# drop_last: bool = False) -> None:
# """
#
# :param dataset: a list of data containers implementing __getitem__ and __len__
# :param batch_size: the batch size for the datasets; may be a list or an int, an int applies to all datasets
# :param sampler: instantiated samplers, one per dataset
# :param drop_last: whether to drop the last batch, whose length is smaller than batch_size
# """
# # if dataset is a Dict, other arguments such as collate_fn must be a Dict or a Callable,
# if isinstance(dataset, Dict) and isinstance(sampler, List):
# raise ValueError(f"{sampler} must be dict")
#
# # make sure batch_size is positive
# if batch_size <= 0:
# raise ValueError("batch_size should be a positive integer value, "
# "but got batch_size={}".format(batch_size))
#
# if not isinstance(drop_last, bool):
# raise ValueError("drop_last should be a boolean value, but got "
# "drop_last={}".format(drop_last))
# self.batch_size = batch_size
# self.drop_last = drop_last
# if sampler is None:
# if isinstance(dataset, List):
# self.sampler = [SequentialSampler(ds) for ds in dataset]
# elif isinstance(dataset, Dict):
# self.sampler = {name: SequentialSampler(ds) for name, ds in dataset.items()}
# elif isinstance(sampler, List):
# if len(sampler) != len(dataset):
# raise ValueError("the length of sampler != the length of dataset")
# self.sampler = sampler
#
# def __iter__(self) -> Iterable[List[int]]:
# """
# sample in dataset order and yield the indices packed into batches
# :return:
# """
# index = 0
# batch = []
# if isinstance(self.sampler, List):
# for i in range(len(self.sampler)):
# sampler = self.sampler[i]
# for idx in sampler:
# batch.append(idx + index)
# if len(batch) == self.batch_size:
# yield batch
# batch = []
# if len(batch) > 0 and not self.drop_last:
# yield batch
# batch = []
# index += len(sampler)
# elif isinstance(self.sampler, Dict):
# for name, sampler in self.sampler.items():
# for idx in sampler:
# batch.append(idx + index)
# if len(batch) == self.batch_size:
# yield batch
# batch = []
# if len(batch) > 0 and not self.drop_last:
# yield batch
# batch = []
# index += len(sampler)
#
# def __len__(self) -> int:
# lens = 0
# if isinstance(self.sampler, List):
# for i in range(len(self.sampler)):
# sampler = self.sampler[i]
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# elif isinstance(self.sampler, Dict):
# for _, sampler in self.sampler.items():
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# return lens


# class PollingSampler(Sampler):
# """
# A BatchSampler customized for MixDataLoader: it samples the passed-in list of datasets in turn, returning indices and moving to the next dataset after taking one batch from the current one.
# """
#
# def __init__(self, dataset: Union[List, Dict], batch_size: int = 16,
# sampler: Union[List[Sampler], Dict[str, Sampler]] = None,
# drop_last: bool = False, ds_ratio="pad_to_most") -> None:
# """
#
# :param dataset: a list of data containers implementing __getitem__ and __len__
# :param batch_size: the batch size for the datasets; may be a list or an int, an int applies to all datasets
# :param sampler: instantiated samplers, one per dataset
# :param drop_last: whether to drop the last batch, whose length is smaller than batch_size
# :param ds_ratio: when ds_ratio=None, the datasets are sampled in turn until all of them are exhausted; when ds_ratio='truncate_to_least',
# the shortest dataset in the list is the reference and longer datasets are truncated; when ds_ratio='pad_to_most', the longest dataset is the reference and shorter datasets are resampled
# """
# # if dataset is a Dict, other arguments such as collate_fn must be a Dict or a Callable,
# if isinstance(dataset, Dict) and isinstance(sampler, List):
# raise ValueError(f"{sampler} must be dict")
# if isinstance(dataset, List) and isinstance(sampler, Dict):
# raise ValueError(f"{sampler} must be list")
# # make sure batch_size is positive
# if batch_size <= 0:
# raise ValueError("batch_size should be a positive integer value, "
# "but got batch_size={}".format(batch_size))
#
# if not isinstance(drop_last, bool):
# raise ValueError("drop_last should be a boolean value, but got "
# "drop_last={}".format(drop_last))
#
# self.batch_size = batch_size
# self.drop_last = drop_last
# if sampler is None:
# if isinstance(dataset, List):
# self.sampler = [SequentialSampler(ds) for ds in dataset]
# elif isinstance(dataset, Dict):
# self.sampler = {name: SequentialSampler(ds) for name, ds in dataset.items()}
#
# elif isinstance(sampler, List):
# if len(sampler) != len(dataset):
# raise ValueError("the length of sampler != the length of dataset")
# self.sampler = sampler
# else:
# self.sampler = sampler
# if ds_ratio == 'pad_to_most' or ds_ratio == 'truncate_to_least' or ds_ratio is None:
# self.ds_ratio = ds_ratio
# else:
# raise ValueError(f"{ds_ratio} must be pad_to_most or truncate_to_least or None")
#
# def __iter__(self) -> Iterable[List[int]]:
# # index is the base offset of each dataset's indices; pointer points at one dataset in the list
# index, pointer, samplers, flag = 0, 0, [], False
#
# if isinstance(self.sampler, List):
# for idx, sampler in enumerate(self.sampler):
# samplers.append((iter(sampler), self.batch_size, index, 0, idx))
# index += len(sampler)
# elif isinstance(self.sampler, Dict):
# for name, sampler in self.sampler.items():
# samplers.append((iter(sampler), self.batch_size, index, 0, name))
# index += len(sampler)
# if self.ds_ratio == 'pad_to_most':
# if isinstance(self.sampler, List):
# limit_len = max(len(ds) for ds in self.sampler)
# else:
# limit_len = max(len(ds) for _, ds in self.sampler.items())
# elif self.ds_ratio == 'truncate_to_least':
# if isinstance(self.sampler, List):
# limit_len = min(len(ds) for ds in self.sampler)
# else:
# limit_len = min(len(ds) for _, ds in self.sampler.items())
# else:
# limit_len = 0
# # size of the last batch
# last_batch_size = limit_len % self.batch_size
#
# while True:
# # everything has been sampled, exit
# if len(samplers) == 0:
# break
# batch, flag = [], False
# # sampler_len is the number of samples taken from this dataset so far
# sampler, batch_size, index, sampler_len, name = samplers.pop(0)
# for _ in range(batch_size):
# try:
# batch.append(index + next(sampler))
# sampler_len += 1
# except StopIteration:
# flag = True
# # first case, ds_ratio is None: just drop the exhausted dataset.
# if self.ds_ratio == 'pad_to_most' and sampler_len < limit_len:
# # reset the sampler and fill up one batch of data
# sampler = iter(self.sampler[name])
# # batch_size is always <= the dataset length, so a full batch can be taken
# for _ in range(batch_size-len(batch)):
# batch.append(next(sampler) + index)
# sampler_len += 1
# break
#
# # ds_ratio is not None:
# # two situations trigger the logic below: 1. with truncate_to_least, when the shortest dataset's last batch is smaller
# # than batch_size, the last batches of the longer datasets would come out longer; 2. with pad_to_most, when the longest dataset's
# # last batch is smaller than batch_size, the last batches of the shorter datasets would come out longer
# if limit_len != 0 and limit_len < sampler_len:
# batch = batch[:last_batch_size]
# # for any ds_ratio, if a dataset still has data left, append its sampler back to the end of the queue
# elif (limit_len == 0 and flag == False) or limit_len > sampler_len:
# samplers.append((sampler, batch_size, index, sampler_len, name))
# if len(batch) == batch_size:
# yield batch
# elif len(batch) > 0 and not self.drop_last:
# yield batch
#
# def __len__(self) -> int:
# lens = 0
# max_len, ds_len = 0, 0
# if self.ds_ratio == 'truncate_to_least':
# if isinstance(self.sampler, List):
# max_len = min(len(sampler) for sampler in self.sampler)
# ds_len = len(self.sampler)
# elif isinstance(self.sampler, Dict):
# max_len = min(len(sampler) for _, sampler in self.sampler.items())
# for _, _ in self.sampler.items():
# ds_len += 1
#
# elif self.ds_ratio == 'pad_to_most':
# if isinstance(self.sampler, List):
# max_len = max(len(sampler) for sampler in self.sampler)
# ds_len = len(self.sampler)
# elif isinstance(self.sampler, Dict):
# max_len = max(len(sampler) for _, sampler in self.sampler.items())
# for _, _ in self.sampler.items():
# ds_len += 1
# if self.ds_ratio is None:
# if isinstance(self.sampler, List):
# for i in range(len(self.sampler)):
# sampler = self.sampler[i]
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# elif isinstance(self.sampler, Dict):
# for name, sampler in self.sampler.items():
# if self.drop_last:
# lens += len(sampler) // self.batch_size
# else:
# lens += (len(sampler) + self.batch_size - 1) // self.batch_size
# else:
# for i in range(ds_len):
# if self.drop_last:
# lens += max_len // self.batch_size
# else:
# lens += (max_len + self.batch_size - 1) // self.batch_size
# return lens


class BucketSampler(Sampler):
r"""
A `Random Sampler` with buckets: it can randomly draw elements of similar length together.
"""

def __init__(self, dataset, num_buckets=10, batch_size=None, seq_len_field_name='seq_len', drop_last=False) -> None:
r"""
:param int num_buckets: the number of buckets
:param int batch_size: the batch size. Defaults to None; Trainer/Tester set this value correctly when they invoke
BucketSampler. When used outside Trainer/Tester, the value must be passed explicitly.
:param str seq_len_field_name: the name of the `field` holding the sequence length
"""
self.dataset = dataset
self.num_buckets = num_buckets
self.batch_size = batch_size
self.seq_len_field_name = seq_len_field_name

def set_batch_size(self, batch_size) -> None:
r"""

:param int batch_size: the size of each batch
:return:
"""
self.batch_size = batch_size

def __iter__(self):
if self.batch_size is None:
raise RuntimeError("batch_size is None.")
seq_lens = self.dataset.get_all_fields()[self.seq_len_field_name].content
total_sample_num = len(seq_lens)

bucket_indexes = []
assert total_sample_num >= self.num_buckets, "The number of samples is smaller than the number of buckets."
num_sample_per_bucket = total_sample_num // self.num_buckets
for i in range(self.num_buckets):
bucket_indexes.append([num_sample_per_bucket * i, num_sample_per_bucket * (i + 1)])
bucket_indexes[-1][1] = total_sample_num

sorted_seq_lens = list(sorted([(idx, seq_len) for
idx, seq_len in zip(range(total_sample_num), seq_lens)],
key=lambda x: x[1]))

batchs = []

left_init_indexes = []
for b_idx in range(self.num_buckets):
start_idx = bucket_indexes[b_idx][0]
end_idx = bucket_indexes[b_idx][1]
sorted_bucket_seq_lens = sorted_seq_lens[start_idx:end_idx]
left_init_indexes.extend([tup[0] for tup in sorted_bucket_seq_lens])
num_batch_per_bucket = len(left_init_indexes) // self.batch_size
np.random.shuffle(left_init_indexes)
for i in range(num_batch_per_bucket):
batchs.append(left_init_indexes[i * self.batch_size:(i + 1) * self.batch_size])
left_init_indexes = left_init_indexes[num_batch_per_bucket * self.batch_size:]
if len(left_init_indexes) != 0:
batchs.append(left_init_indexes)
np.random.shuffle(batchs)

return chain(*batchs)
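
For context, a sketch of how this removed sampler was typically used (assuming the DataSet.add_seq_len helper of earlier fastNLP releases):

from fastNLP.core.dataset import DataSet

ds = DataSet({'x': [[0] * (i % 10 + 1) for i in range(100)]})
ds.add_seq_len('x')  # adds the 'seq_len' field the sampler sorts by
sampler = BucketSampler(ds, num_buckets=5, batch_size=10)
indices = list(sampler)  # flat indices; every run of 10 comes from one shuffled batch of similar lengths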


class ConstTokenNumSampler(Sampler):
"""
Tries to keep the number of input tokens in every batch roughly constant.

"""

def __init__(self, dataset, seq_len_field_name: str, max_token: int = 4096, max_sentence: int = -1,
need_be_multiple_of: int = 1, num_bucket: int = -1) -> None:
"""

:param dataset:
:param str seq_len_field_name: the field that indicates each sample's length
:param int max_token: the maximum number of tokens per batch
:param int max_sentence: the maximum number of instances per batch; -1 means it is determined by max_token
:param int need_be_multiple_of: the number of instances in each generated batch must be a multiple of this value; used in DataParallel scenarios
:param int num_bucket: split the data by length into num_bucket buckets; samples in a batch are combined within one bucket as far as possible, which reduces padding.
"""
assert (max_sentence != -1 and max_sentence >= need_be_multiple_of) or max_sentence < 1
self.dataset = dataset
self.seq_len_field_name = seq_len_field_name
self.num_bucket = num_bucket
self.max_token = max_token
self._max_sentence = max_sentence
self.need_be_multiple_of = need_be_multiple_of

assert len(self.dataset) > self.num_bucket, "The number of samples should be larger than buckets."
seq_len = self.dataset.get_field(self.seq_len_field_name)
self.seq_len = seq_len
seq_len_indice = [(length, i) for i, length in enumerate(seq_len)]
seq_len_indice.sort(key=lambda x: x[0])
indice_in_buckets = []
if self.num_bucket > 0:
sample_per_bucket = len(seq_len_indice) // self.num_bucket
i = 0
while len(indice_in_buckets) < len(seq_len_indice):
indice_in_buckets.append(seq_len_indice[i * sample_per_bucket:(i + 1) * sample_per_bucket])
i += 1
else:
indice_in_buckets = [seq_len_indice]
self.indice_in_buckets = indice_in_buckets
self.get_new_order()

@property
def max_sentence(self):
if self._max_sentence < 1:
return 100000000
return self._max_sentence

@max_sentence.setter
def max_sentence(self, max_sentence):
self._max_sentence = max_sentence

def get_new_order(self) -> None:
np.random.shuffle(self.indice_in_buckets)
for bucket in self.indice_in_buckets:
np.random.shuffle(bucket)
indices = list(chain(*self.indice_in_buckets))
batches = []
cur_max_len = 0
batch = []
for length, i in indices:
max_len = max(length, cur_max_len)
if max_len * (len(batch) + 1) > self.max_token or len(batch) >= self.max_sentence:
left_sample = len(batch) % self.need_be_multiple_of
add_samples = batch.copy()
cur_max_len = length
if left_sample != 0:
add_samples = add_samples[:-left_sample]
batch = batch[-left_sample:]
cur_max_len = max(cur_max_len, max(batch))
else:
batch = []
if len(add_samples) == 0:
raise RuntimeError(
f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.")
batches.append(add_samples)
else:
cur_max_len = max_len
batch.append(i)
if batch:
left_sample = len(batch) % self.need_be_multiple_of
add_samples = batch.copy()
if left_sample != 0:
add_samples = add_samples[:-left_sample].copy()
if add_samples:
batches.append(add_samples)
np.random.shuffle(batches)
self.batches = batches

def __iter__(self) -> Iterable[List[int]]:
for batch in self.batches:
yield batch
self.get_new_order()

def __len__(self):
return len(self.batches)


class ConstantTokenNumSampler:
"""
Tries to keep the number of input tokens in every batch roughly constant.

"""

def __init__(self, seq_len: List[int], max_token: int = 4096, max_sentence: int = -1,
need_be_multiple_of: int = 1, num_bucket: int = -1) -> None:
"""

:param List[int] seq_len: a list of ints, the length of each sample; usually passed in via dataset.get_field('seq_len').content
:param int max_token: the maximum number of tokens per batch
:param int max_sentence: the maximum number of instances per batch; -1 means it is determined by max_token
:param int need_be_multiple_of: the number of instances in each generated batch must be a multiple of this value; used in DataParallel scenarios
:param int num_bucket: split the data by length into num_bucket buckets; samples in a batch are combined within one bucket as far as possible, which reduces padding.
"""
assert (max_sentence != -1 and max_sentence >= need_be_multiple_of) or max_sentence < 1
assert len(seq_len) > num_bucket, "The number of samples should be larger than buckets."
self.seq_len = seq_len
self.max_token = max_token
self._max_sentence = max_sentence
self.need_be_multiple_of = need_be_multiple_of
seq_len_indice = [(length, i) for i, length in enumerate(seq_len)]
seq_len_indice.sort(key=lambda x: x[0])
indice_in_buckets = []
if num_bucket > 0:
sample_per_bucket = len(seq_len_indice) // num_bucket
i = 0
while len(indice_in_buckets) < len(seq_len_indice):
indice_in_buckets.append(seq_len_indice[i * sample_per_bucket:(i + 1) * sample_per_bucket])
i += 1
else:
indice_in_buckets = [seq_len_indice]
self.indice_in_buckets = indice_in_buckets
self.get_new_order()

@property
def max_sentence(self):
if self._max_sentence < 1:
return 100000000
return self._max_sentence

@max_sentence.setter
def max_sentence(self, max_sentence):
self._max_sentence = max_sentence

def get_new_order(self) -> None:
np.random.shuffle(self.indice_in_buckets)
for bucket in self.indice_in_buckets:
np.random.shuffle(bucket)
indices = list(chain(*self.indice_in_buckets))
batches = []
cur_max_len = 0
batch = []
for length, i in indices:
max_len = max(length, cur_max_len)
if max_len * (len(batch) + 1) > self.max_token or len(batch) >= self.max_sentence:
left_sample = len(batch) % self.need_be_multiple_of
add_samples = batch.copy()
cur_max_len = length
if left_sample != 0:
add_samples = add_samples[:-left_sample]
batch = batch[-left_sample:]
cur_max_len = max(cur_max_len, max(batch))
else:
batch = []
if len(add_samples) == 0:
raise RuntimeError(
f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.")
batches.append(add_samples)
else:
cur_max_len = max_len
batch.append(i)
if batch:
left_sample = len(batch) % self.need_be_multiple_of
add_samples = batch.copy()
if left_sample != 0:
add_samples = add_samples[:-left_sample].copy()
if add_samples:
batches.append(add_samples)
np.random.shuffle(batches)
self.batches = batches

def __iter__(self) -> Iterable[List[int]]:
for batch in self.batches:
yield batch
self.get_new_order()

def __len__(self):
return len(self.batches)
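
An illustration (standalone, assuming the class above) of the invariant this removed sampler maintained, namely that the padded size of every batch stays under max_token:

seq_len = [3, 4, 5, 40, 50, 60]
sampler = ConstantTokenNumSampler(seq_len, max_token=100)
for batch in sampler:
    # max length in the batch times batch size never exceeds max_token
    assert max(seq_len[i] for i in batch) * len(batch) <= 100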


class SortedSampler(Sampler):
r"""
Sorts samples by length; mainly used during evaluation, which it speeds up because less padding is needed.
"""

def __init__(self, dataset, seq_len_field_name: str = 'seq_len', descending: bool = True) -> None:
"""

:param str seq_len_field_name: the field to sort by. If the field holds numbers, sort directly by the numbers; if it
does not hold numbers, sort by the length of the field's content
:param bool descending: whether to sort in descending order
"""
self.dataset = dataset
self.seq_len_field_name = seq_len_field_name
self.descending = descending

def __iter__(self) -> Iterable[int]:
seq_lens = self.dataset.get_field(self.seq_len_field_name).content
try:
seq_lens = list(map(len, seq_lens))
except:
pass

orders = np.argsort(seq_lens).tolist()  # ascending order
if self.descending:
orders = orders[::-1]
for order in orders:
yield order


def simple_sort_bucketing(lengths):
r"""

:param lengths: list of int, the lengths of all examples.
:return data: 2-level list
::

[
[index_11, index_12, ...], # bucket 1
[index_21, index_22, ...], # bucket 2
...
]

"""
lengths_mapping = [(idx, length) for idx, length in enumerate(lengths)]
sorted_lengths = sorted(lengths_mapping, key=lambda x: x[1])
# TODO: need to return buckets
return [idx for idx, _ in sorted_lengths]


def k_means_1d(x, k, max_iter=100):
r"""Perform k-means on 1-D data.

:param x: list of int, representing points in 1-D.
:param k: the number of clusters required.
:param max_iter: maximum iteration
:return centroids: numpy array, centroids of the k clusters
assignment: numpy array, 1-D, the bucket id assigned to each example.
"""
sorted_x = sorted(list(set(x)))
x = np.array(x)
if len(sorted_x) < k:
raise ValueError("too few buckets")
gap = len(sorted_x) / k

centroids = np.array([sorted_x[int(x * gap)] for x in range(k)])
assign = None

for i in range(max_iter):
# Cluster Assignment step
assign = np.array([np.argmin([np.absolute(x_i - x) for x in centroids]) for x_i in x])
# Move centroids step
new_centroids = np.array([x[assign == k].mean() for k in range(k)])
if (new_centroids == centroids).all():
centroids = new_centroids
break
centroids = new_centroids
return np.array(centroids), assign
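
A quick standalone illustration of the removed helper on toy lengths (assuming the function above):

lengths = [5, 6, 7, 20, 21, 22]
centroids, assign = k_means_1d(lengths, k=2)
# centroids -> [ 6. 21.], assign -> [0 0 0 1 1 1]: the short and the long lengths land in separate buckets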


def k_means_bucketing(lengths, buckets):
r"""Assign all instances into possible buckets using k-means, such that instances in the same bucket have similar lengths.

:param lengths: list of int, the length of all samples.
:param buckets: list of int. The length of the list is the number of buckets. Each integer is the maximum length
threshold for each bucket (This is usually None.).
:return data: 2-level list
::

[
[index_11, index_12, ...], # bucket 1
[index_21, index_22, ...], # bucket 2
...
]

"""
bucket_data = [[] for _ in buckets]
num_buckets = len(buckets)
_, assignments = k_means_1d(lengths, num_buckets)

for idx, bucket_id in enumerate(assignments):
if buckets[bucket_id] is None or lengths[idx] <= buckets[bucket_id]:
bucket_data[bucket_id].append(idx)
return bucket_data

+1 -2 fastNLP/io/loader/conll.py

@@ -50,8 +50,6 @@ class ConllLoader(Loader):

     The fields of the DataSet returned by ConllLoader are determined by the headers passed in.

-    Lines starting with "-DOCSTART-" in the data are ignored, since that token is used as the document separator in conll 2003.
-
     """
     def __init__(self, headers, sep=None, indexes=None, dropna=True):
@@ -93,6 +91,7 @@ class ConllLoader(Loader):
 class Conll2003Loader(ConllLoader):
     r"""
     Reads data for the conll2003 task. The data should look like the example below: the first column is raw_words, the second pos, the third chunking and the fourth ner.
+    Lines starting with "-DOCSTART-" in the data are ignored, since that token is used as the document separator in conll 2003.

     Example::


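A short usage sketch of the loader whose docstring gained the note (the path is a placeholder, assuming the v0.x-style Loader.load interface):

from fastNLP.io.loader.conll import Conll2003Loader

data_bundle = Conll2003Loader().load('path/to/conll2003')  # '-DOCSTART-' lines are skipped while reading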


+0 -0 tests/core/callbacks/torch_callbacks/__init__.py (new empty file)


+41 -0 tests/core/callbacks/torch_callbacks/test_torch_grad_clip_callback.py

@@ -0,0 +1,41 @@
import pytest
import numpy as np

from fastNLP.core.callbacks import TorchGradClipCallback, Callback
from fastNLP import Trainer
from fastNLP.envs.imports import _NEED_IMPORT_TORCH

if _NEED_IMPORT_TORCH:
    import torch

from tests.helpers.callbacks.prepare_trainer_args_for_torch_test import get_trainer_args


class CheckClipCallback(Callback):
    def __init__(self, parameters, clip_type, clip_value):
        self.parameters = parameters
        self.clip_type = clip_type
        self.clip_value = clip_value

    def on_after_optimizers_step(self, trainer, optimizers):
        for param in self.parameters:
            if self.clip_type == 'value':
                assert param.grad.max().item() <= self.clip_value
            else:
                assert np.linalg.norm(param.grad.cpu().view(-1).numpy()) <= self.clip_value


@pytest.mark.parametrize('accumulation_steps', [1, 3, 5])
@pytest.mark.parametrize('fp16', [True, False])
@pytest.mark.parametrize('clip_type', ['norm', 'value'])
@pytest.mark.parametrize('clip_value', [1, 2])
def test_torch_grad_clip_callback(accumulation_steps, fp16, clip_type, clip_value):
    if not torch.cuda.is_available() and fp16:
        pytest.skip("No cuda, cannot test fp16.")
    device = 'cuda' if fp16 else 'cpu'
    kwargs = get_trainer_args(lr=1, device=device)
    callbacks = []
    callbacks.append(TorchGradClipCallback(clip_value=clip_value, clip_type=clip_type))
    callbacks.append(CheckClipCallback(kwargs['model'].parameters(), clip_type, clip_value))
    trainer = Trainer(**kwargs, callbacks=callbacks, fp16=fp16, accumulation_steps=accumulation_steps)
    trainer.run()

+34 -0 tests/core/callbacks/torch_callbacks/test_torch_warmup_callback.py

@@ -0,0 +1,34 @@
import pytest
import numpy as np

from fastNLP.core.callbacks import TorchWarmupCallback, Callback
from fastNLP import Trainer

from tests.helpers.callbacks.prepare_trainer_args_for_torch_test import get_trainer_args


class RecordLrCallback(Callback):
    def __init__(self):
        self.lrs = []

    def on_after_optimizers_step(self, trainer, optimizers):
        self.lrs.append(trainer.driver.optimizers[0].param_groups[0]['lr'])


@pytest.mark.parametrize('warmup', [5, 0.1])
@pytest.mark.parametrize('schedule', ['constant', 'linear'])
@pytest.mark.parametrize('accumulation_steps', [1, 3, 4])
def test_torch_warmup_callback(warmup, schedule, accumulation_steps):
    kwargs = get_trainer_args(lr=0.1, bsz=4)
    callback = TorchWarmupCallback(warmup, schedule)
    r_callback = RecordLrCallback()
    kwargs['callbacks'] = [callback, r_callback]
    trainer = Trainer(**kwargs, accumulation_steps=accumulation_steps)
    trainer.run()

    if schedule == 'linear':
        assert kwargs['optimizers'].param_groups[0]['lr'] <= 0.01
    elif schedule == 'constant':
        assert np.allclose(0.1, kwargs['optimizers'].param_groups[0]['lr'])

    assert len(r_callback.lrs) <= trainer.total_batches // accumulation_steps + 1

+0 -31 tests/core/samplers/test_sampler.py

@@ -1,31 +0,0 @@
import unittest
import random
from fastNLP.core.samplers import SequentialSampler, RandomSampler, BucketSampler
from fastNLP.core.dataset import DataSet
from array import array
import torch

from fastNLP.core.samplers.sampler import ReproduceBatchSampler
from fastNLP.core.drivers.torch_driver.utils import replace_batch_sampler
from tests.helpers.datasets.torch_data import TorchNormalDataset


class SamplerTest(unittest.TestCase):

    def test_sequentialsampler(self):
        ds = DataSet({'x': [1, 2, 3, 4] * 10})
        sqspl = SequentialSampler(ds)
        for idx, inst in enumerate(sqspl):
            self.assertEqual(idx, inst)

    def test_randomsampler(self):
        ds = DataSet({'x': [1, 2, 3, 4] * 10})
        rdspl = RandomSampler(ds)
        ans = [ds[i] for i in rdspl]
        self.assertEqual(len(ans), len(ds))

    def test_bucketsampler(self):
        data_set = DataSet({"x": [[0] * random.randint(1, 10)] * 10, "y": [[5, 6]] * 10})
        sampler = BucketSampler(data_set, num_buckets=3, batch_size=16, seq_len_field_name="seq_len")



+68 -0 tests/helpers/callbacks/prepare_trainer_args_for_torch_test.py

@@ -0,0 +1,68 @@

"""
This file provides Trainer arguments for testing callbacks; they can be used directly to initialize a Trainer.
Just pass in the callback under test and run.

"""

from fastNLP.envs.imports import _NEED_IMPORT_TORCH
from fastNLP.core.metrics import Accuracy


if _NEED_IMPORT_TORCH:
    import torch
    from torch import nn
    from torch.utils.data import DataLoader
    import torch.nn.functional as F

    class DataSet:
        def __init__(self, num_samples=1000, num_features=10):
            g = torch.Generator()
            g.manual_seed(1000)
            self.data = torch.randn(num_samples, num_features, generator=g)
            self.y = self.data.argmax(dim=-1)

        def __getitem__(self, item):
            return {'x': self.data[item], 'target': self.y[item]}

        def __len__(self):
            return len(self.data)


    class Model(nn.Module):
        def __init__(self, num_features=5):
            super().__init__()
            self.mlps = nn.Sequential(
                nn.Linear(num_features, 20),
                nn.ReLU(),
                nn.Linear(20, 20),
                nn.Dropout(p=0.3),
                nn.ReLU(),
                nn.Linear(20, num_features)
            )

        def forward(self, x, target):
            y = self.mlps(x)
            if self.training:
                return {'loss': F.cross_entropy(y, target)}
            return {'pred': y}


def get_trainer_args(num_features=5, num_samples=20, bsz=4, lr=0.1, n_epochs=5, device=None):
    ds = DataSet(num_samples=num_samples, num_features=num_features)
    dl = DataLoader(ds, batch_size=bsz)
    model = Model(num_features=num_features)

    optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    kwargs = {
        'model': model,
        'driver': 'torch',
        'device': device,
        'optimizers': optimizer,
        'train_dataloader': dl,
        'evaluate_dataloaders': dl,
        'metrics': {'acc': Accuracy()},
        'n_epochs': n_epochs
    }

    return kwargs
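
The two test modules above drive this helper end to end; assuming a source checkout with pytest available, the new callback tests run with:

pytest tests/core/callbacks/torch_callbacks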
