@@ -86,7 +86,7 @@ Vocabulary in fastNLP
     # Put the dev set or test set into the no_create_entry_dataset parameter when building the vocabulary.
     vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])
-`no_create_entry` in :class:`~fastNLP.Vocabulary`: it is recommended to set this parameter to True when adding words that come from the test or dev set, or to pass the dev and test sets
+`no_create_entry` in :class:`~fastNLP.Vocabulary`: if you do not care about the underlying mechanism, simply follow this advice: set the parameter to True when adding words that come from anything other than the training set, or pass the non-training data
 to the `no_create_entry_dataset` parameter. These options matter when the downstream model uses a pretrained embedding (glove, word2vec, elmo or bert) and fine-tunes it:
 if the vocabulary is built only from the training data, words that appear only in test or dev cannot make full use of the information in the pretrained embedding (they
 would be treated as unk), so taking test and dev into account when building the vocabulary gives better final results.
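As an editorial illustration of the recommended recipe (a minimal sketch; tr_data, dev_data, test_data and the embedding name are placeholders, not part of this patch):

    from fastNLP import Vocabulary
    from fastNLP.embeddings import StaticEmbedding

    vocab = Vocabulary()
    # words from the training set get real entries with trainable vectors
    vocab.from_dataset(tr_data, field_name='chars',
                       no_create_entry_dataset=[dev_data, test_data])
    vocab.index_dataset(tr_data, dev_data, test_data, field_name='chars')
    # words seen only in dev/test keep pointing at their pretrained vector (or unk)
    embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')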
@@ -62,6 +62,7 @@ __all__ = [
     "CrossEntropyLoss",
     "L1Loss",
     "BCELoss",
+    "BCEWithLogits",
     "NLLLoss",
     "LossInForward",
     "CMRC2018Loss",
@@ -98,7 +99,7 @@ from .dataset import DataSet
 from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder
 from .instance import Instance
 from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, \
-    LossInForward, CMRC2018Loss, LossBase, MSELoss
+    LossInForward, CMRC2018Loss, LossBase, MSELoss, BCEWithLogits
 from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric, MetricBase,\
     ConfusionMatrixMetric
 from .optimizer import Optimizer, SGD, Adam, AdamW
@@ -86,7 +86,6 @@ except:
 from .dataset import DataSet
 from .tester import Tester
 from ._logger import logger
-from .utils import _check_fp16
 from ._parallel_utils import _model_contains_inner_module

 try:
@@ -94,11 +93,6 @@ try:
 except:
     pass

-try:
-    from apex import amp
-except:
-    amp = None

 class Callback(object):
     r"""
@@ -123,6 +117,20 @@ class Callback(object):
        This attribute can be accessed via self.trainer; normally you do not need to use it.
        """
        return self._trainer

+    @property
+    def grad_scaler(self):
+        r"""
+        The gradient scaler used for float16 training
+        """
+        return self._trainer.grad_scaler
+
+    @property
+    def auto_cast(self):
+        r"""
+        The auto cast context used for float16 training
+        """
+        return self._trainer.auto_cast

     @property
     def step(self):
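These properties let user callbacks cooperate with mixed-precision training. A minimal sketch of a custom callback that uses them (illustrative only, not part of this patch):

    from fastNLP import Callback

    class LogLossScale(Callback):
        """Print the current AMP loss scale once per epoch."""
        def on_epoch_end(self):
            if self.trainer.fp16:
                # grad_scaler is the Trainer's torch.cuda.amp.GradScaler (or a dummy object in fp32 mode)
                print(f"epoch {self.epoch}: loss scale = {self.grad_scaler.get_scale()}")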
@@ -472,14 +480,9 @@ class GradientClipCallback(Callback):

     def on_backward_end(self):
         if self.step%self.update_every==0:
-            if self.parameters is None:
-                if getattr(self.trainer, 'fp16', ''):
-                    _check_fp16()
-                    self.clip_fun(amp.master_params(self.optimizer), self.clip_value)
-                else:
-                    self.clip_fun(self.model.parameters(), self.clip_value)
-            else:
-                self.clip_fun(self.parameters, self.clip_value)
+            if self.trainer.fp16:
+                self.grad_scaler.unscale_(self.optimizer)
+            self.clip_fun(self.parameters, self.clip_value)


 class EarlyStopCallback(Callback):
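Because gradients are now unscaled through the Trainer's grad_scaler before clipping, the callback composes with fp16 training directly. A usage sketch (data, model, optimizer and loss are placeholders):

    from fastNLP import Trainer, GradientClipCallback

    clip = GradientClipCallback(clip_value=5, clip_type='value')
    trainer = Trainer(train_data, model, optimizer=optimizer, loss=loss,
                      device=0, fp16=True, callbacks=[clip])
    trainer.train()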
@@ -569,10 +572,10 @@ class FitlogCallback(Callback):
         if len(self.datasets) > 0:
             for key, data in self.datasets.items():
                 tester = Tester(data=data, model=self.model,
-                                batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size),
+                                batch_size=self.trainer.kwargs.get('dev_batch_size', self.trainer.batch_size),
                                 metrics=self.trainer.metrics,
                                 verbose=0,
-                                use_tqdm=self.trainer.test_use_tqdm,
+                                use_tqdm=self.trainer.kwargs.get('test_use_tqdm', self.trainer.use_tqdm),
                                 sampler=self.trainer.kwargs.get('test_sampler', None))
                 self.testers[key] = tester
         fitlog.add_progress(total_steps=self.n_steps)
@@ -948,6 +951,7 @@ class CheckPointCallback(Callback):
                     model = model.module
                 model.load_state_dict(states['model'])
                 self.optimizer.load_state_dict(states['optimizer'])
+                self.grad_scaler.load_state_dict(states['grad_scaler'])
                 self.trainer.epoch = states['epoch'] + 1  # the checkpoint was saved at the end of the epoch, so resume from the next one
                 self.trainer.step = states['step']
                 if 'best_dev_epoch' in states:
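The save side is presumably extended in the same way; a sketch of what the checkpoint dict would then contain (only the 'grad_scaler' key is taken from the load code above, the rest is illustrative):

    states = {
        'model': model.state_dict(),
        'optimizer': self.optimizer.state_dict(),
        'grad_scaler': self.grad_scaler.state_dict(),  # new in this patch; a DummyGradScaler returns {}
        'epoch': self.epoch,
        'step': self.step,
    }
    torch.save(states, checkpoint_path)  # checkpoint_path is a placeholder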
@@ -29,7 +29,6 @@ from .dataset import DataSet
 from .losses import _prepare_losser
 from .optimizer import Optimizer
 from .utils import _build_args
-from .utils import _check_fp16
 from .utils import _get_func_signature
 from .utils import _move_dict_value_to_device
@@ -10,6 +10,7 @@ __all__ = [
     "CrossEntropyLoss",
     "BCELoss",
+    "BCEWithLogits",
     "L1Loss",
     "NLLLoss",
     "MSELoss",
@@ -311,6 +312,25 @@ class BCELoss(LossBase):
         return F.binary_cross_entropy(input=pred, target=target, reduction=self.reduction)


+class BCEWithLogits(LossBase):
+    r"""
+    Binary cross-entropy loss; the prediction does not need to go through sigmoid before being passed in.
+
+    :param pred: mapping for `pred` in the parameter map; None means the mapping is `pred` -> `pred`
+    :param target: mapping for `target` in the parameter map; None means the mapping is `target` -> `target`
+    :param str reduction: one of `mean`, `sum` and `none`.
+    """
+
+    def __init__(self, pred=None, target=None, reduction='mean'):
+        super(BCEWithLogits, self).__init__()
+        self._init_param_map(pred=pred, target=target)
+        assert reduction in ('mean', 'sum', 'none')
+        self.reduction = reduction
+
+    def get_loss(self, pred, target):
+        return F.binary_cross_entropy_with_logits(input=pred, target=target, reduction=self.reduction)
+
+
 class NLLLoss(LossBase):
     r"""
     Negative log-likelihood loss
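Unlike BCELoss, the new loss consumes raw logits and applies the sigmoid internally, which is numerically safer (especially under fp16). A minimal sketch of the difference (tensor values are illustrative):

    import torch
    import torch.nn.functional as F

    logits = torch.tensor([1.5, -0.3])
    target = torch.tensor([1.0, 0.0])

    # BCELoss expects probabilities, i.e. sigmoid must already have been applied
    loss_a = F.binary_cross_entropy(torch.sigmoid(logits), target)
    # BCEWithLogits takes the raw logits and applies sigmoid internally
    loss_b = F.binary_cross_entropy_with_logits(logits, target)
    assert torch.allclose(loss_a, loss_b)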
@@ -112,6 +112,108 @@ class BucketSampler(Sampler):
         return list(chain(*batchs))


+class ConstTokenNumSampler(Sampler):
+    """
+    Tries to keep the number of input tokens in every batch roughly constant.
+
+    Usage example
+
+    >>> # assume tr_data already exists and has a field named seq_len storing the token count of each instance
+    >>> from fastNLP import DataSetIter, Trainer
+    >>> sampler = ConstTokenNumSampler('src_seq_len', max_token=4096)
+    >>>
+    >>> # pass the sampler directly to the Trainer; the batch_size argument is then ignored
+    >>> trainer = Trainer(tr_data, model, optimizer=optimizer, loss=TranslationLoss(),
+    >>>                   batch_size=1, sampler=sampler, drop_last=False, update_every=1)
+    """
+
+    def __init__(self, seq_len_field_name, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1):
+        """
+        :param str seq_len_field_name: the field that stores the length of each sample
+        :param int max_token: maximum number of tokens per batch
+        :param int max_sentence: maximum number of instances per batch; -1 means it is determined by max_token
+        :param int need_be_multiple_of: the number of instances in each generated batch must be a multiple of this value; useful with DataParallel
+        :param int num_bucket: split the data into num_bucket buckets by length; samples in a batch are combined within one bucket as far as possible, which reduces padding.
+        """
+        assert (max_sentence!=-1 and max_sentence>=need_be_multiple_of) or max_sentence<1
+        self.seq_len_field_name = seq_len_field_name
+        self.num_bucket = num_bucket
+        self.max_token = max_token
+        self._max_sentence = max_sentence
+        self.need_be_multiple_of = need_be_multiple_of
+
+    def __call__(self, data_set):
+        assert len(data_set)>self.num_bucket, "The number of samples should be larger than buckets."
+        seq_len = data_set.get_field(self.seq_len_field_name)
+        self.seq_len = seq_len
+        seq_len_indice = [(length, i) for i, length in enumerate(seq_len)]
+        seq_len_indice.sort(key=lambda x: x[0])
+        indice_in_buckets = []
+        if self.num_bucket>0:
+            sample_per_bucket = len(seq_len_indice)//self.num_bucket
+            i = 0
+            while len(indice_in_buckets)<len(seq_len_indice):
+                indice_in_buckets.append(seq_len_indice[i*sample_per_bucket:(i+1)*sample_per_bucket])
+                i += 1
+        else:
+            indice_in_buckets = [seq_len_indice]
+        self.indice_in_buckets = indice_in_buckets
+        self.get_new_order()
+
+    @property
+    def max_sentence(self):
+        if self._max_sentence<1:
+            return 100000000
+        return self._max_sentence
+
+    @max_sentence.setter
+    def max_sentence(self, max_sentence):
+        self._max_sentence = max_sentence
+
+    def get_new_order(self):
+        np.random.shuffle(self.indice_in_buckets)
+        for bucket in self.indice_in_buckets:
+            np.random.shuffle(bucket)
+        indices = list(chain(*self.indice_in_buckets))
+        batches = []
+        cur_max_len = 0
+        batch = []
+        for length, i in indices:
+            max_len = max(length, cur_max_len)
+            if max_len*(len(batch)+1)>self.max_token or len(batch)>=self.max_sentence:
+                left_sample = len(batch) % self.need_be_multiple_of
+                add_samples = batch.copy()
+                cur_max_len = length
+                if left_sample!=0:
+                    add_samples = add_samples[:-left_sample]
+                    batch = batch[-left_sample:]
+                    # cur_max_len must track the lengths of the carried-over samples, not their indices
+                    cur_max_len = max(cur_max_len, max(self.seq_len[j] for j in batch))
+                else:
+                    batch = []
+                if len(add_samples)==0:
+                    raise RuntimeError(f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.")
+                batches.append(add_samples)
+            else:
+                cur_max_len = max_len
+            batch.append(i)
+        if batch:
+            left_sample = len(batch) % self.need_be_multiple_of
+            add_samples = batch.copy()
+            if left_sample != 0:
+                add_samples = add_samples[:-left_sample].copy()
+            if add_samples:
+                batches.append(add_samples)
+        np.random.shuffle(batches)
+        self.batches = batches
+
+    def __iter__(self):
+        for batch in self.batches:
+            yield batch
+        self.get_new_order()
+
+    def __len__(self):
+        return len(self.batches)
+
+
 class ConstantTokenNumSampler:
     """
     Tries to keep the number of input tokens in every batch roughly constant.
@@ -119,7 +221,7 @@ class ConstantTokenNumSampler:
     Usage example

    >>> # assume tr_data already exists and has a field named seq_len storing the token count of each instance
    >>> from fastNLP import DataSetIter, Trainer
-    >>> sampler = BatchSampler(tr_data.get_field('seq_len').content, max_token=4096)
+    >>> sampler = ConstantTokenNumSampler(tr_data.get_field('seq_len').content, max_token=4096)
    >>> tr_iter = DataSetIter(tr_data,
    >>>                       batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False,
    >>>                       drop_last=False, timeout=0, worker_init_fn=None,
@@ -128,7 +230,6 @@ class ConstantTokenNumSampler:
    >>> # pass tr_iter directly to the Trainer; the batch_size argument is then ignored
    >>> trainer = Trainer(tr_iter, model, optimizer=optimizer, loss=TranslationLoss(),
    >>>                   batch_size=1, sampler=None, drop_last=False, update_every=1)
     """

     def __init__(self, seq_len, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1):
         """
@@ -53,6 +53,8 @@ from .utils import _move_dict_value_to_device
 from .utils import _get_func_signature
 from .utils import _get_model_device
 from .utils import _move_model_to_device
+from .utils import _build_fp16_env
+from .utils import _can_use_fp16
 from ._parallel_utils import _data_parallel_wrapper
 from ._parallel_utils import _model_contains_inner_module
 from functools import partial
@@ -70,7 +72,7 @@ class Tester(object):
     """

     def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True,
-                 **kwargs):
+                 fp16=False, **kwargs):
         r"""

         :param ~fastNLP.DataSet,~fastNLP.BatchIter data: the dataset to evaluate on
@@ -93,7 +95,9 @@ class Tester(object):
             If the model predicts through predict(), multi-GPU (DataParallel) evaluation is not possible; only the model on the first card is used.
         :param int verbose: if 0, print nothing; if 1, print the evaluation result.
         :param bool use_tqdm: whether to show evaluation progress with tqdm; if False, nothing is displayed.
-        :param kwargs: a sampler may be passed in to control the evaluation order
+        :param bool fp16: whether to run evaluation in float16
+        :param kwargs:
+            Sampler sampler: a sampler may be passed in to control the evaluation order
         """
         super(Tester, self).__init__()
@@ -147,7 +151,11 @@ class Tester(object):
         else:
             self._predict_func = self._model.forward
             self._predict_func_wrapper = self._model.forward

+        if fp16:
+            _can_use_fp16(model=model, device=device, func=self._predict_func)
+        self.auto_cast, _grad_scaler = _build_fp16_env(not fp16)
+
     def test(self):
         r"""Start evaluation and return the evaluation result.
@@ -172,12 +180,13 @@ class Tester(object):
             for batch_x, batch_y in data_iterator:
                 _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
-                pred_dict = self._data_forward(self._predict_func, batch_x)
-                if not isinstance(pred_dict, dict):
-                    raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "
-                                    f"must be `dict`, got {type(pred_dict)}.")
-                for metric in self.metrics:
-                    metric(pred_dict, batch_y)
+                with self.auto_cast():
+                    pred_dict = self._data_forward(self._predict_func, batch_x)
+                    if not isinstance(pred_dict, dict):
+                        raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} "
+                                        f"must be `dict`, got {type(pred_dict)}.")
+                    for metric in self.metrics:
+                        metric(pred_dict, batch_y)

                 if self.use_tqdm:
                     pbar.update()
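A quick sketch of the new Tester flag (model and test_data are placeholders; fp16 evaluation requires a CUDA device):

    from fastNLP import Tester, AccuracyMetric

    tester = Tester(data=test_data, model=model,
                    metrics=AccuracyMetric(pred="predict", target="y"),
                    batch_size=64, device=0, fp16=True)
    tester.test()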
@@ -342,7 +342,7 @@ from .losses import _prepare_losser
 from .metrics import _prepare_metrics
 from .optimizer import Optimizer
 from .sampler import Sampler
-from .sampler import RandomSampler
+from .sampler import RandomSampler, ConstTokenNumSampler
 from .tester import Tester
 from .utils import _CheckError
 from .utils import _build_args
@@ -352,6 +352,8 @@ from .utils import _move_dict_value_to_device
 from .utils import _get_func_signature
 from .utils import _get_model_device
 from .utils import _move_model_to_device
+from .utils import _build_fp16_env
+from .utils import _can_use_fp16
 from ._parallel_utils import _model_contains_inner_module
 from ._logger import logger
@@ -373,7 +375,7 @@ class Trainer(object):
                  num_workers=0, n_epochs=10, print_every=5,
                  dev_data=None, metrics=None, metric_key=None,
                  validate_every=-1, save_path=None, use_tqdm=True, device=None,
-                 callbacks=None, check_code_level=0, **kwargs):
+                 callbacks=None, check_code_level=0, fp16=False, **kwargs):
         r"""
         :param train_data: the training set, a :class:`~fastNLP.DataSet` or a subclass of :class:`~fastNLP.BatchIter`
         :param nn.modules model: the model to be trained
@@ -422,9 +424,14 @@ class Trainer(object):
             report a warning; 2: raise an error if any field is left unused. The check works by running the code with a very small batch (2 samples by default);
             in theory this does not modify any parameters, it only verifies that the code runs. However, if (1) the model hard-codes batch_size to some fixed value, or
             (2) the model accumulates the number of forward calls, one extra forward pass may be counted. In those cases set check_code_level to -1.
+        :param bool fp16: whether to train with fp16.
         :param kwargs: supported optional arguments
             bool test_use_tqdm: whether to show tqdm when evaluating on the dev set
             Sampler test_sampler: the sampler to use during evaluation
+            bool test_use_fp16: whether to evaluate with fp16; defaults to the same value as fp16.
+            bool set_grad_to_none: whether zero_grad sets gradients to None instead of setting them to zero
+            GradScaler grad_scaler: only effective when fp16 is True; if the default torch.cuda.amp.GradScaler initialisation arguments are not
+                suitable, an already-initialised grad_scaler can be passed in.
         """
         super(Trainer, self).__init__()
         if not isinstance(model, nn.Module):
@@ -488,6 +495,15 @@ class Trainer(object):
                 sampler = RandomSampler()
             elif hasattr(sampler, 'set_batch_size'):
                 sampler.set_batch_size(batch_size)
+        if isinstance(sampler, ConstTokenNumSampler):  # use the constant-token-number sampler directly
+            assert isinstance(train_data,
+                              DataSet), f"When sampler is `ConstTokenNumSampler`, the train_data must" \
+                                        f" be `DataSet`."
+            sampler(train_data)
+            train_data = DataSetIter(train_data,
+                                     batch_size=1, sampler=None, as_numpy=False, num_workers=num_workers,
+                                     pin_memory=False, drop_last=drop_last, timeout=0, worker_init_fn=None,
+                                     batch_sampler=sampler)

         if isinstance(train_data, DataSet):
             self.data_iterator = DataSetIter(dataset=train_data, batch_size=batch_size, sampler=sampler,
@@ -505,6 +521,21 @@ class Trainer(object):
             self._forward_func = self.model.module.forward
         else:
             self._forward_func = self.model.forward

+        self.fp16 = fp16
+
+        # check the fp16-related settings
+        self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16)
+        if self.fp16:
+            _can_use_fp16(device=device, model=model, func=self._forward_func)
+        grad_scaler = kwargs.get('grad_scaler', None)
+        if grad_scaler is not None:
+            self.grad_scaler = grad_scaler
+        else:
+            self.grad_scaler = _grad_scaler()
+
+        self.test_use_fp16 = kwargs.get('test_use_fp16', fp16)
+        self.set_grad_to_none = kwargs.get('set_grad_to_none', True)
+
         if check_code_level > -1:
             # _check_code is how fastNLP helps you verify that your code is correct. If you see this comment in an error traceback,
             # carefully check whether your field names match the model's input argument names
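Putting the new Trainer options together, a mixed-precision run might be configured as below (a sketch; the custom GradScaler is only an example of overriding the default scaler, and data/model/optimizer/loss are placeholders):

    from torch.cuda.amp import GradScaler
    from fastNLP import Trainer

    trainer = Trainer(train_data, model, optimizer=optimizer, loss=loss,
                      device=0, fp16=True,
                      grad_scaler=GradScaler(init_scale=2.**14),  # optional; otherwise a default GradScaler is built
                      test_use_fp16=False,                        # evaluate on dev in fp32
                      set_grad_to_none=True)
    trainer.train()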
@@ -553,10 +584,7 @@ class Trainer(object):
         self.logger = logger

         self.use_tqdm = use_tqdm
-        if 'test_use_tqdm' in kwargs:
-            self.test_use_tqdm = kwargs.get('test_use_tqdm')
-        else:
-            self.test_use_tqdm = self.use_tqdm
+        self.test_use_tqdm = kwargs.get('test_use_tqdm', self.use_tqdm)
         self.pbar = None
         self.print_every = abs(self.print_every)
         self.kwargs = kwargs
@@ -568,7 +596,8 @@ class Trainer(object):
                                  device=None,  # device is handled by the code above
                                  verbose=0,
                                  use_tqdm=self.test_use_tqdm,
-                                 sampler=kwargs.get('test_sampler', None))
+                                 sampler=kwargs.get('test_sampler', None),
+                                 fp16=self.test_use_fp16)

         self.start_time = None  # start timestamp
@@ -677,7 +706,8 @@ class Trainer(object):
                     # edit prediction
                     self.callback_manager.on_loss_begin(batch_y, prediction)
-                    loss = self._compute_loss(prediction, batch_y).mean()
+                    with self.auto_cast():
+                        loss = self._compute_loss(prediction, batch_y).mean()
                     loss = loss / self.update_every
                     avg_loss += loss.item()
@@ -762,11 +792,13 @@ class Trainer(object):
         """
         if self.step % self.update_every == 0:
-            self.optimizer.step()
+            self.grad_scaler.step(self.optimizer)
+            self.grad_scaler.update()

     def _data_forward(self, network, x):
         x = _build_args(self._forward_func, **x)
-        y = network(**x)
+        with self.auto_cast():
+            y = network(**x)
         if not isinstance(y, dict):
             raise TypeError(
                 f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.")
@@ -780,8 +812,22 @@ class Trainer(object):
         For PyTorch, just do "loss.backward()"
         """
         if (self.step-1) % self.update_every == 0:
-            self.model.zero_grad()
-        loss.backward()
+            self._clear_grad(self.optimizer, self.set_grad_to_none)
+        self.grad_scaler.scale(loss).backward()
+
+    def _clear_grad(self, optimizer, set_to_none=True):
+        param_groups = optimizer.param_groups
+        for group in param_groups:
+            for p in group['params']:
+                if p.grad is not None:
+                    if set_to_none:
+                        p.grad = None
+                    else:
+                        if p.grad.grad_fn is not None:
+                            p.grad.detach_()
+                        else:
+                            p.grad.requires_grad_(False)
+                        p.grad.zero_()

     def _compute_loss(self, predict, truth):
         r"""Compute loss given prediction and ground truth.
@@ -12,23 +12,20 @@ import inspect
 import os
 import warnings
 from collections import Counter, namedtuple
-from copy import deepcopy
 from typing import List

 import _pickle
 import numpy as np
-import torch
 import torch.nn as nn
 from prettytable import PrettyTable

 from ._logger import logger
 from ._parallel_utils import _model_contains_inner_module
 # from .vocabulary import Vocabulary
+import torch
+import contextlib
+from pkg_resources import parse_version

-try:
-    from apex import amp
-except:
-    amp = None

 _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed',
                                      'varargs'])
@@ -1032,8 +1029,92 @@ def sub_column(string: str, c: int, c_size: int, title: str) -> str:
     return res


-def _check_fp16():
-    if amp is None:
-        raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
-    if not torch.backends.cudnn.enabled:
-        raise RuntimeError("Amp requires cudnn backend to be enabled.")
+def _is_function_contains_autocast(func):
+    """
+    Check whether func uses autocast, either (1) through the autocast decorator or (2) through a `with autocast():` block.
+
+    :param func: the function to check
+    """
+    import re
+    source = inspect.getsource(func)
+    lines = source.split('\n')
+    for line in lines:
+        line = line.strip()
+        if re.search(r'@[\w\.]*autocast\(\)', line):
+            raise RuntimeError("Please do not use `autocast()` decorator, use `with autocast():` instead. Please refer to"
+                               " https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process ")
+        if re.search(r'with [\w\.]*autocast\(\):', line):
+            return True
+    return False
+
+
+class DummyGradScaler:
+    """
+    A dummy stand-in for PyTorch's GradScaler, so the fp32 code path does not need a large number of if statements.
+    """
+
+    def __init__(self, *args, **kwargs):
+        pass
+
+    def get_scale(self):
+        return 1.0
+
+    def is_enabled(self):
+        return False
+
+    def scale(self, outputs):
+        return outputs
+
+    def step(self, optimizer, *args, **kwargs):
+        optimizer.step(*args, **kwargs)
+
+    def update(self, new_scale=None):
+        pass
+
+    def unscale_(self, optimizer):
+        pass
+
+    def load_state_dict(self, state_dict):
+        pass
+
+    def state_dict(self):
+        return {}
+
+
+def _build_fp16_env(dummy=False):
+    if dummy:
+        autocast = contextlib.ExitStack
+        GradScaler = DummyGradScaler
+    else:
+        if not torch.cuda.is_available():
+            raise RuntimeError("No cuda")
+        if torch.cuda.get_device_capability(0)[0] < 7:
+            warnings.warn(
+                "NOTE: your device does NOT support faster training with fp16, "
+                "please switch to FP32 which is likely to be faster"
+            )
+        try:
+            from torch.cuda.amp import autocast, GradScaler
+        except ImportError:
+            raise RuntimeError("torch version too low (less than 1.6)")
+    return autocast, GradScaler
+
+
+def _can_use_fp16(device, model, func):
+    if parse_version(torch.__version__) < parse_version('1.6'):
+        raise RuntimeError("Pytorch supports float16 after version 1.6, please upgrade your pytorch version.")
+    model_device = _get_model_device(model)
+    if device is None and model_device is not None and model_device.type != 'cuda':
+        raise RuntimeError("You have to run in cuda device to use fp16.")
+    if isinstance(device, str):
+        if device == 'cpu':
+            raise RuntimeError("You have to run in cuda device to use fp16.")
+    if isinstance(device, torch.device) and device.type == 'cpu':
+        raise RuntimeError("You have to run in cuda device to use fp16.")
+
+    if _model_contains_inner_module(model) or (isinstance(device, list) and len(device) > 1):
+        # remind the user
+        if not _is_function_contains_autocast(func):
+            raise RuntimeError("When use fp16 in Parallel Training, you have to set autocast() in your forward "
+                               "function as described in "
+                               "https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process")
@@ -125,7 +125,7 @@ class Vocabulary(object):
         r"""Increase the in-vocabulary frequency of each word in the sequence, in order

         :param list word_lst: a list of strings
-        :param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
+        :param bool no_create_entry: recommended to be True when the word comes from data other than the training set. Controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
             If True, no separate entry is created for the word and it keeps pointing at the unk representation; if False, a separate entry is created
             for it. If the word comes from dev or test, this is usually set to True; if it comes from train, usually False. Two special cases: if a word is newly
             added with no_create_entry=True but was already in the Vocabulary and not marked as no_create_entry, a separate entry will still be created for this
@@ -142,7 +142,7 @@ class Vocabulary(object):
         Increase the in-vocabulary frequency of a new word

         :param str word: the new word
-        :param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
+        :param bool no_create_entry: recommended to be True when the word comes from data other than the training set. Controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
             If True, no separate entry is created for the word and it keeps pointing at the unk representation; if False, a separate entry is created
             for it. If the word comes from dev or test, this is usually set to True; if it comes from train, usually False. Two special cases: if a word is newly
             added with no_create_entry=True but was already in the Vocabulary and not marked as no_create_entry, a separate entry will still be created for this
@@ -175,7 +175,7 @@ class Vocabulary(object):
         Increase the in-vocabulary frequency of a new word

         :param str word: the new word
-        :param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
+        :param bool no_create_entry: recommended to be True when the word comes from data other than the training set. Controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
             If True, no separate entry is created for the word and it keeps pointing at the unk representation; if False, a separate entry is created
             for it. If the word comes from dev or test, this is usually set to True; if it comes from train, usually False. Two special cases: if a word is newly
             added with no_create_entry=True but was already in the Vocabulary and not marked as no_create_entry, a separate entry will still be created for this
@@ -190,7 +190,7 @@ class Vocabulary(object):
         Increase the in-vocabulary frequency of each word in the sequence, in order

         :param list[str] word_lst: the sequence of words
-        :param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
+        :param bool no_create_entry: recommended to be True when the word comes from data other than the training set. Controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
             If True, no separate entry is created for the word and it keeps pointing at the unk representation; if False, a separate entry is created
             for it. If the word comes from dev or test, this is usually set to True; if it comes from train, usually False. Two special cases: if a word is newly
             added with no_create_entry=True but was already in the Vocabulary and not marked as no_create_entry, a separate entry will still be created for this
@@ -344,7 +344,7 @@ class Vocabulary(object):
         :param str,List[str] field_name: may be ``str`` or ``List[str]``.
             The field(s) used to build the vocabulary; one or more fields are supported, and if multiple DataSets are given, every DataSet must contain these fields. Currently supported field structures
             are ``str`` and ``List[str]``
-        :param no_create_entry_dataset: may be a DataSet, List[DataSet] or None (default); use this option when the downstream model will use pretrained
+        :param no_create_entry_dataset: may be a DataSet, List[DataSet] or None (default); it is recommended to simply pass all non-training data through this parameter. Use this option when the downstream model will use pretrained
             embeddings (glove, word2vec, elmo or bert) and fine-tunes them. If the vocabulary is built only from the training data, the test and dev
             data cannot make full use of the pretrained embedding information, so taking test and dev into account when building the vocabulary gives better final results.
             If a word appears in train but not in the pretrained model, the embedding initialises it with unk, but as a separate vector; if
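A sketch of the word-level API these docstrings describe (the words themselves are placeholders):

    from fastNLP import Vocabulary

    vocab = Vocabulary()
    vocab.add_word('movie')                               # from the training set: gets its own entry
    vocab.add_word('serendipity', no_create_entry=True)   # only in dev/test: stays tied to unk / the pretrained vector
    vocab.build_vocab()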
@@ -108,7 +108,7 @@ class BertEmbedding(ContextualEmbedding):
             self._word_sep_index = vocab['[SEP]']
         self._word_cls_index = -100
         if '[CLS]' in vocab:
-            self._word_cls_index = vocab['CLS']
+            self._word_cls_index = vocab['[CLS]']

         min_freq = kwargs.get('min_freq', 1)
         self._min_freq = min_freq
@@ -281,7 +281,9 @@ class StaticEmbedding(TokenEmbedding):
                     if word in vocab:
                         index = vocab.to_index(word)
                         if index in matrix:
-                            warnings.warn(f"Word:{word} occurs again in line:{idx}(starts from 0)")
+                            warnings.warn(f"Word has more than one vector in embedding file. Set logger level to "
+                                          f"DEBUG for detail.")
+                            logger.debug(f"Word:{word} occurs again in line:{idx}(starts from 0)")
                         matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
                         if self.only_norm_found_vector:
                             matrix[index] = matrix[index] / np.linalg.norm(matrix[index])
@@ -34,3 +34,56 @@ class NaiveClassifier(BaseModel):

     def predict(self, x):
         return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
+
+class NaiveClassifier2(BaseModel):
+    r"""
+    A simple example classifier that can be used in various tests
+    """
+
+    def __init__(self, in_feature_dim, out_feature_dim):
+        super(NaiveClassifier2, self).__init__()
+        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+    def forward(self, x):
+        return {"predict": self.mlp(x)}
+
+    def predict(self, x):
+        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
+
+class NaiveClassifier3(BaseModel):
+    r"""
+    A simple example classifier that can be used in various tests
+    """
+
+    def __init__(self, in_feature_dim, out_feature_dim):
+        super(NaiveClassifier3, self).__init__()
+        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+    @torch.cuda.amp.autocast()
+    def forward(self, x):
+        return {"predict": self.mlp(x)}
+
+    @torch.cuda.amp.autocast()
+    def predict(self, x):
+        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
+
+class NaiveClassifier4(BaseModel):
+    r"""
+    A simple example classifier that can be used in various tests
+    """
+
+    def __init__(self, in_feature_dim, out_feature_dim):
+        super(NaiveClassifier4, self).__init__()
+        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+    def forward(self, x):
+        with torch.cuda.amp.autocast():
+            return {"predict": self.mlp(x)}
+
+    def predict(self, x):
+        with torch.cuda.amp.autocast():
+            return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
@@ -44,3 +44,11 @@ class TestSampler(unittest.TestCase):
         indices = sampler(data_set)
         self.assertEqual(len(indices), 10)
         # just needs to run; the output is not validated
+
+    def test_ConstantTokenNumSampler(self):
+        # what should be checked: whether the token counts per batch are close to each other
+        pass
+
+    def test_ConstTokenNumSampler(self):
+        # what should be checked: whether it can be run directly
+        pass
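Both tests are left as stubs in this patch. A sketch of what the ConstTokenNumSampler check might look like (the toy dataset and the bound are illustrative):

    def test_ConstTokenNumSampler_batches(self):
        from fastNLP import DataSet
        from fastNLP.core.sampler import ConstTokenNumSampler
        ds = DataSet({'x': [[0] * n for n in range(1, 51)], 'seq_len': list(range(1, 51))})
        sampler = ConstTokenNumSampler('seq_len', max_token=100, num_bucket=5)
        sampler(ds)
        for batch in sampler:
            lengths = [ds[i]['seq_len'] for i in batch]
            # after padding, each batch should stay at or below max_token tokens
            self.assertLessEqual(max(lengths) * len(batch), 100)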
@@ -9,12 +9,12 @@ import torch

 from fastNLP import DataSet
 from fastNLP import Instance
-from fastNLP import BCELoss
+from fastNLP import BCELoss, BCEWithLogits
 from fastNLP import CrossEntropyLoss
 from fastNLP import AccuracyMetric
 from fastNLP import SGD
 from fastNLP import Trainer
-from fastNLP.models.base_model import NaiveClassifier
+from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4
 from fastNLP import TorchLoaderIter
@@ -575,3 +575,83 @@ class TrainerTestGround(unittest.TestCase):
         )
         trainer.train()
         """
+
+
+class Fp16TrainerTest(unittest.TestCase):
+    def test_raise_error(self):
+        data_set = prepare_fake_dataset()
+        data_set.set_input("x", flag=True)
+        data_set.set_target("y", flag=True)
+
+        train_set, dev_set = data_set.split(0.3)
+
+        model = NaiveClassifier2(2, 1)
+
+        with self.assertRaises(RuntimeError):
+            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                              use_tqdm=True, check_code_level=2, fp16=True)
+
+        with self.assertRaises(RuntimeError):
+            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                              use_tqdm=True, check_code_level=2, fp16=True, device='cpu')
+
+        with self.assertRaises(RuntimeError):
+            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                              use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu'))
+
+    @unittest.skipIf(torch.cuda.is_available()==False, "Skip when no cuda device is detected")
+    def test_run_fp16(self):
+        data_set = prepare_fake_dataset()
+        data_set.set_input("x", flag=True)
+        data_set.set_target("y", flag=True)
+
+        train_set, dev_set = data_set.split(0.3)
+
+        model = NaiveClassifier2(2, 1)
+        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                          use_tqdm=True, check_code_level=2, fp16=True, device=0)
+        trainer.train(load_best_model=False)
+
+        model = NaiveClassifier2(2, 1)
+        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                          use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False)
+        trainer.train(load_best_model=False)
+
+    @unittest.skipIf(torch.cuda.device_count()<2, "Skip when fewer than 2 gpus are available.")
+    def test_run_data_parallel(self):
+        data_set = prepare_fake_dataset()
+        data_set.set_input("x", flag=True)
+        data_set.set_target("y", flag=True)
+
+        train_set, dev_set = data_set.split(0.3)
+
+        model = NaiveClassifier2(2, 1)
+        with self.assertRaises(RuntimeError):
+            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                              use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1])
+
+        with self.assertRaises(RuntimeError):
+            model = NaiveClassifier3(2, 1)
+            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                              use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True)
+
+        model = NaiveClassifier4(2, 1)
+        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
+                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
+                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
+                          use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True)
+        trainer.train(load_best_model=False)