@@ -86,7 +86,7 @@ fastNLP中的Vocabulary | |||
# 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。 | |||
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) | |||
:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集 | |||
:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` ,如果您并不关心具体的原理,您可以直接采取以下的建议:在添加来自于非训练集的词的时候将该参数置为True, 或将非训练集数据 | |||
传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 | |||
情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 | |||
会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 | |||
@@ -62,6 +62,7 @@ __all__ = [ | |||
"CrossEntropyLoss", | |||
"L1Loss", | |||
"BCELoss", | |||
"BCEWithLogits", | |||
"NLLLoss", | |||
"LossInForward", | |||
"CMRC2018Loss", | |||
@@ -98,7 +99,7 @@ from .dataset import DataSet | |||
from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder | |||
from .instance import Instance | |||
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, \ | |||
LossInForward, CMRC2018Loss, LossBase, MSELoss | |||
LossInForward, CMRC2018Loss, LossBase, MSELoss, BCEWithLogits | |||
from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric, MetricBase,\ | |||
ConfusionMatrixMetric | |||
from .optimizer import Optimizer, SGD, Adam, AdamW | |||
@@ -86,7 +86,6 @@ except: | |||
from .dataset import DataSet | |||
from .tester import Tester | |||
from ._logger import logger | |||
from .utils import _check_fp16 | |||
from ._parallel_utils import _model_contains_inner_module | |||
try: | |||
@@ -94,11 +93,6 @@ try: | |||
except: | |||
pass | |||
try: | |||
from apex import amp | |||
except: | |||
amp = None | |||
class Callback(object): | |||
r""" | |||
@@ -123,6 +117,20 @@ class Callback(object): | |||
该属性可以通过self.trainer获取到,一般情况下不需要使用这个属性。 | |||
""" | |||
return self._trainer | |||
@property | |||
def grad_scaler(self): | |||
r""" | |||
float16的gradient scaler | |||
""" | |||
return self._trainer.grad_scaler | |||
@property | |||
def auto_cast(self): | |||
r""" | |||
float16用的auto cast环境 | |||
""" | |||
return self._trainer.auto_cast | |||
@property | |||
def step(self): | |||
@@ -472,14 +480,9 @@ class GradientClipCallback(Callback): | |||
def on_backward_end(self): | |||
if self.step%self.update_every==0: | |||
if self.parameters is None: | |||
if getattr(self.trainer, 'fp16', ''): | |||
_check_fp16() | |||
self.clip_fun(amp.master_params(self.optimizer), self.clip_value) | |||
else: | |||
self.clip_fun(self.model.parameters(), self.clip_value) | |||
else: | |||
self.clip_fun(self.parameters, self.clip_value) | |||
if self.trainer.fp16: | |||
self.grad_scaler.unscale_(self.optimizer) | |||
self.clip_fun(self.parameters, self.clip_value) | |||
class EarlyStopCallback(Callback): | |||
@@ -569,10 +572,10 @@ class FitlogCallback(Callback): | |||
if len(self.datasets) > 0: | |||
for key, data in self.datasets.items(): | |||
tester = Tester(data=data, model=self.model, | |||
batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), | |||
batch_size=self.trainer.kwargs.get('dev_batch_size', self.trainer.batch_size), | |||
metrics=self.trainer.metrics, | |||
verbose=0, | |||
use_tqdm=self.trainer.test_use_tqdm, | |||
use_tqdm=self.trainer.kwargs.get('test_use_tqdm', self.trainer.use_tqdm), | |||
sampler=self.trainer.kwargs.get('test_sampler', None)) | |||
self.testers[key] = tester | |||
fitlog.add_progress(total_steps=self.n_steps) | |||
@@ -948,6 +951,7 @@ class CheckPointCallback(Callback): | |||
model = model.module | |||
model.load_state_dict(states['model']) | |||
self.optimizer.load_state_dict(states['optimizer']) | |||
self.grad_scaler.load_state_dict(states['grad_scaler']) | |||
self.trainer.epoch = states['epoch'] + 1 # 因为是结束储存的,所以需要从下一个epoch开始 | |||
self.trainer.step = states['step'] | |||
if 'best_dev_epoch' in states: | |||
@@ -29,7 +29,6 @@ from .dataset import DataSet | |||
from .losses import _prepare_losser | |||
from .optimizer import Optimizer | |||
from .utils import _build_args | |||
from .utils import _check_fp16 | |||
from .utils import _get_func_signature | |||
from .utils import _move_dict_value_to_device | |||
@@ -10,6 +10,7 @@ __all__ = [ | |||
"CrossEntropyLoss", | |||
"BCELoss", | |||
"BCEWithLogits", | |||
"L1Loss", | |||
"NLLLoss", | |||
"MSELoss", | |||
@@ -311,6 +312,25 @@ class BCELoss(LossBase): | |||
return F.binary_cross_entropy(input=pred, target=target, reduction=self.reduction) | |||
class BCEWithLogits(LossBase):
    r"""
    Binary cross-entropy loss computed directly from logits; callers must NOT
    apply a sigmoid to the predictions before passing them in.

    :param pred: mapping for `pred` in the parameter map; None means `pred` -> `pred`
    :param target: mapping for `target` in the parameter map; None means `target` -> `target`
    :param str reduction: one of `mean`, `sum` or `none`.
    """

    def __init__(self, pred=None, target=None, reduction='mean'):
        super().__init__()
        self._init_param_map(pred=pred, target=target)
        assert reduction in ('mean', 'sum', 'none')
        self.reduction = reduction

    def get_loss(self, pred, target):
        # Fused sigmoid + BCE (numerically more stable than sigmoid followed
        # by binary_cross_entropy).
        return F.binary_cross_entropy_with_logits(input=pred, target=target,
                                                  reduction=self.reduction)
class NLLLoss(LossBase): | |||
r""" | |||
负对数似然损失函数 | |||
@@ -112,6 +112,108 @@ class BucketSampler(Sampler): | |||
return list(chain(*batchs)) | |||
class ConstTokenNumSampler(Sampler):
    """
    A batch sampler that tries to keep the total token count of every batch
    close to ``max_token``.

    Example::

        >>> # assume tr_data exists and has a field `seq_len` with each instance's token count
        >>> from fastNLP import DataSetIter, Trainer
        >>> sampler = ConstTokenNumSampler('src_seq_len', max_token=4096)
        >>>
        >>> # pass the sampler straight to Trainer; the batch_size argument is then ignored
        >>> trainer = Trainer(tr_data, model, optimizer=optimizer, loss=TranslationLoss(),
        >>>             batch_size=1, sampler=sampler, drop_last=False, update_every=1)
    """

    def __init__(self, seq_len_field_name, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1):
        """
        :param str seq_len_field_name: name of the field that holds each sample's length
        :param int max_token: maximum number of tokens per batch
        :param int max_sentence: maximum number of instances per batch; -1 means
            max_token alone decides
        :param int need_be_multiple_of: every generated batch size must be a
            multiple of this number (useful under DataParallel)
        :param int num_bucket: split the data by length into this many buckets;
            samples are combined within a bucket as far as possible, which
            reduces padding. -1 disables bucketing.
        """
        assert (max_sentence != -1 and max_sentence >= need_be_multiple_of) or max_sentence < 1
        self.seq_len_field_name = seq_len_field_name
        self.num_bucket = num_bucket
        self.max_token = max_token
        self._max_sentence = max_sentence
        self.need_be_multiple_of = need_be_multiple_of

    def __call__(self, data_set):
        assert len(data_set) > self.num_bucket, "The number of samples should be larger than buckets."
        seq_len = data_set.get_field(self.seq_len_field_name)
        self.seq_len = seq_len
        seq_len_indice = [(length, i) for i, length in enumerate(seq_len)]
        seq_len_indice.sort(key=lambda x: x[0])
        if self.num_bucket > 0:
            sample_per_bucket = len(seq_len_indice) // self.num_bucket
            # Slice exactly num_bucket buckets (plus one bucket for the
            # remainder when num_bucket does not divide the data evenly).
            # The previous loop compared the number of *buckets* against the
            # number of *samples* and therefore appended ~len(data) empty
            # buckets on every call.
            indice_in_buckets = [seq_len_indice[i * sample_per_bucket:(i + 1) * sample_per_bucket]
                                 for i in range(self.num_bucket)]
            remainder = seq_len_indice[self.num_bucket * sample_per_bucket:]
            if remainder:
                indice_in_buckets.append(remainder)
        else:
            indice_in_buckets = [seq_len_indice]
        self.indice_in_buckets = indice_in_buckets
        self.get_new_order()

    @property
    def max_sentence(self):
        # Any value < 1 means "no instance-count limit"; use a huge sentinel
        # so the comparison in get_new_order() never triggers.
        if self._max_sentence < 1:
            return 100000000
        return self._max_sentence

    @max_sentence.setter
    def max_sentence(self, max_sentence):
        self._max_sentence = max_sentence

    def get_new_order(self):
        """Reshuffle the buckets and regroup samples into batches for a new epoch."""
        np.random.shuffle(self.indice_in_buckets)
        for bucket in self.indice_in_buckets:
            np.random.shuffle(bucket)
        indices = list(chain(*self.indice_in_buckets))
        batches = []
        cur_max_len = 0
        batch = []
        for length, i in indices:
            max_len = max(length, cur_max_len)
            # Flush the current batch when adding this sample would exceed the
            # token budget (padding counted via the running max length) or the
            # instance-count limit.
            if max_len * (len(batch) + 1) > self.max_token or len(batch) >= self.max_sentence:
                left_sample = len(batch) % self.need_be_multiple_of
                add_samples = batch.copy()
                cur_max_len = length
                if left_sample != 0:
                    # Carry the last `left_sample` samples over so the flushed
                    # batch size is a multiple of need_be_multiple_of.
                    add_samples = add_samples[:-left_sample]
                    batch = batch[-left_sample:]
                    # Bug fix: the running max must consider the *lengths* of
                    # the carried-over samples; the original code took the max
                    # over their indices (`max(batch)`).
                    cur_max_len = max(cur_max_len, max(self.seq_len[j] for j in batch))
                else:
                    batch = []
                if len(add_samples) == 0:
                    raise RuntimeError(
                        f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.")
                batches.append(add_samples)
            else:
                cur_max_len = max_len
            batch.append(i)
        if batch:
            left_sample = len(batch) % self.need_be_multiple_of
            add_samples = batch.copy()
            if left_sample != 0:
                # Drop the tail that would break the multiple-of constraint.
                add_samples = add_samples[:-left_sample].copy()
            if add_samples:
                batches.append(add_samples)
        np.random.shuffle(batches)
        self.batches = batches

    def __iter__(self):
        for batch in self.batches:
            yield batch
        # Prepare a fresh shuffle/grouping for the next epoch.
        self.get_new_order()

    def __len__(self):
        return len(self.batches)
class ConstantTokenNumSampler: | |||
""" | |||
尽量保证每个batch的输入token数量是接近的。 | |||
@@ -119,7 +221,7 @@ class ConstantTokenNumSampler: | |||
使用示例 | |||
>>> # 假设已经有了tr_data并有一个field叫做seq_len保存了每个instance的token数量 | |||
>>> from fastNLP import DataSetIter, Trainer | |||
>>> sampler = BatchSampler(tr_data.get_field('seq_len').content, max_token=4096) | |||
>>> sampler = ConstantTokenNumSampler(tr_data.get_field('seq_len').content, max_token=4096) | |||
>>> tr_iter = DataSetIter(tr_data, | |||
>>> batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False, | |||
>>> drop_last=False, timeout=0, worker_init_fn=None, | |||
@@ -128,7 +230,6 @@ class ConstantTokenNumSampler: | |||
>>> # 直接将tr_iter传入Trainer中,此时batch_size参数的值会被忽略 | |||
>>> trainer = Trainer(tr_iter, model, optimizer=optimizer, loss=TranslationLoss(), | |||
>>> batch_size=1, sampler=None, drop_last=False, update_every=1) | |||
""" | |||
def __init__(self, seq_len, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1): | |||
""" | |||
@@ -53,6 +53,8 @@ from .utils import _move_dict_value_to_device | |||
from .utils import _get_func_signature | |||
from .utils import _get_model_device | |||
from .utils import _move_model_to_device | |||
from .utils import _build_fp16_env | |||
from .utils import _can_use_fp16 | |||
from ._parallel_utils import _data_parallel_wrapper | |||
from ._parallel_utils import _model_contains_inner_module | |||
from functools import partial | |||
@@ -70,7 +72,7 @@ class Tester(object): | |||
""" | |||
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True, | |||
**kwargs): | |||
fp16=False, **kwargs): | |||
r""" | |||
:param ~fastNLP.DataSet,~fastNLP.BatchIter data: 需要测试的数据集 | |||
@@ -93,7 +95,9 @@ class Tester(object): | |||
如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 | |||
:param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 | |||
:param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 | |||
:param kwargs: 支持传入sampler控制测试顺序 | |||
:param bool fp16: 是否使用float16进行验证 | |||
:param kwargs: | |||
Sampler sampler: 支持传入sampler控制测试顺序 | |||
""" | |||
super(Tester, self).__init__() | |||
@@ -147,7 +151,11 @@ class Tester(object): | |||
else: | |||
self._predict_func = self._model.forward | |||
self._predict_func_wrapper = self._model.forward | |||
if fp16: | |||
_can_use_fp16(model=model, device=device, func=self._predict_func) | |||
self.auto_cast, _grad_scaler = _build_fp16_env(not fp16) | |||
def test(self): | |||
r"""开始进行验证,并返回验证结果。 | |||
@@ -172,12 +180,13 @@ class Tester(object): | |||
for batch_x, batch_y in data_iterator: | |||
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | |||
pred_dict = self._data_forward(self._predict_func, batch_x) | |||
if not isinstance(pred_dict, dict): | |||
raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " | |||
f"must be `dict`, got {type(pred_dict)}.") | |||
for metric in self.metrics: | |||
metric(pred_dict, batch_y) | |||
with self.auto_cast(): | |||
pred_dict = self._data_forward(self._predict_func, batch_x) | |||
if not isinstance(pred_dict, dict): | |||
raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " | |||
f"must be `dict`, got {type(pred_dict)}.") | |||
for metric in self.metrics: | |||
metric(pred_dict, batch_y) | |||
if self.use_tqdm: | |||
pbar.update() | |||
@@ -342,7 +342,7 @@ from .losses import _prepare_losser | |||
from .metrics import _prepare_metrics | |||
from .optimizer import Optimizer | |||
from .sampler import Sampler | |||
from .sampler import RandomSampler | |||
from .sampler import RandomSampler, ConstTokenNumSampler | |||
from .tester import Tester | |||
from .utils import _CheckError | |||
from .utils import _build_args | |||
@@ -352,6 +352,8 @@ from .utils import _move_dict_value_to_device | |||
from .utils import _get_func_signature | |||
from .utils import _get_model_device | |||
from .utils import _move_model_to_device | |||
from .utils import _build_fp16_env | |||
from .utils import _can_use_fp16 | |||
from ._parallel_utils import _model_contains_inner_module | |||
from ._logger import logger | |||
@@ -373,7 +375,7 @@ class Trainer(object): | |||
num_workers=0, n_epochs=10, print_every=5, | |||
dev_data=None, metrics=None, metric_key=None, | |||
validate_every=-1, save_path=None, use_tqdm=True, device=None, | |||
callbacks=None, check_code_level=0, **kwargs): | |||
callbacks=None, check_code_level=0, fp16=False, **kwargs): | |||
r""" | |||
:param train_data: 训练集, :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter` 的子类 | |||
:param nn.modules model: 待训练的模型 | |||
@@ -422,9 +424,14 @@ class Trainer(object): | |||
报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 | |||
这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; | |||
(2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 | |||
:param bool fp16: 是否使用fp16进行训练。 | |||
:param kwargs: 支持配置可选参数 | |||
bool test_use_tqdm: 在dev上验证的时候是否开启tqdm | |||
Sampler test_sampler: 在evaluate的时候使用的sampler | |||
bool test_use_fp16: evalute的时候是否使用fp16测试,默认与fp16相同的取值。 | |||
bool set_grad_to_none: 在zero_grad的时候是否将gradient设置为None,而不是设置为zero | |||
GradScaler grad_scaler: 仅在fp16为True时有效,如果不使用torch.cuda.amp.GradScaler的初始化参数,可传入一个已经初始化后的 | |||
grad_scaler。 | |||
""" | |||
super(Trainer, self).__init__() | |||
if not isinstance(model, nn.Module): | |||
@@ -488,6 +495,15 @@ class Trainer(object): | |||
sampler = RandomSampler() | |||
elif hasattr(sampler, 'set_batch_size'): | |||
sampler.set_batch_size(batch_size) | |||
if isinstance(sampler, ConstTokenNumSampler): # 直接使用固定token数量的Sampler | |||
assert isinstance(train_data, | |||
DataSet), f"When sampler is `ConstTokenNumSampler`, the train_data must" \ | |||
f" be `DataSet`." | |||
sampler(train_data) | |||
train_data = DataSetIter(train_data, | |||
batch_size=1, sampler=None, as_numpy=False, num_workers=num_workers, | |||
pin_memory=False, drop_last=drop_last, timeout=0, worker_init_fn=None, | |||
batch_sampler=sampler) | |||
if isinstance(train_data, DataSet): | |||
self.data_iterator = DataSetIter(dataset=train_data, batch_size=batch_size, sampler=sampler, | |||
@@ -505,6 +521,21 @@ class Trainer(object): | |||
self._forward_func = self.model.module.forward | |||
else: | |||
self._forward_func = self.model.forward | |||
self.fp16 = fp16 | |||
# check fp16相关的设置 | |||
self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16) | |||
if self.fp16: | |||
_can_use_fp16(device=device, model=model, func=self._forward_func) | |||
grad_scaler = kwargs.get('grad_scaler', None) | |||
if grad_scaler is not None: | |||
self.grad_scaler = grad_scaler | |||
else: | |||
self.grad_scaler = _grad_scaler() | |||
self.test_use_fp16 = kwargs.get('test_use_fp16', fp16) | |||
self.set_grad_to_none = kwargs.get('set_grad_to_none', True) | |||
if check_code_level > -1: | |||
# _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 | |||
# 名是否匹配 | |||
@@ -553,10 +584,7 @@ class Trainer(object): | |||
self.logger = logger | |||
self.use_tqdm = use_tqdm | |||
if 'test_use_tqdm' in kwargs: | |||
self.test_use_tqdm = kwargs.get('test_use_tqdm') | |||
else: | |||
self.test_use_tqdm = self.use_tqdm | |||
self.test_use_tqdm = kwargs.get('test_use_tqdm', self.use_tqdm) | |||
self.pbar = None | |||
self.print_every = abs(self.print_every) | |||
self.kwargs = kwargs | |||
@@ -568,7 +596,8 @@ class Trainer(object): | |||
device=None, # 由上面的部分处理device | |||
verbose=0, | |||
use_tqdm=self.test_use_tqdm, | |||
sampler=kwargs.get('test_sampler', None)) | |||
sampler=kwargs.get('test_sampler', None), | |||
fp16=self.test_use_fp16) | |||
self.start_time = None # start timestamp | |||
@@ -677,7 +706,8 @@ class Trainer(object): | |||
# edit prediction | |||
self.callback_manager.on_loss_begin(batch_y, prediction) | |||
loss = self._compute_loss(prediction, batch_y).mean() | |||
with self.auto_cast(): | |||
loss = self._compute_loss(prediction, batch_y).mean() | |||
loss = loss / self.update_every | |||
avg_loss += loss.item() | |||
@@ -762,11 +792,13 @@ class Trainer(object): | |||
""" | |||
if self.step % self.update_every == 0: | |||
self.optimizer.step() | |||
self.grad_scaler.step(self.optimizer) | |||
self.grad_scaler.update() | |||
def _data_forward(self, network, x): | |||
x = _build_args(self._forward_func, **x) | |||
y = network(**x) | |||
with self.auto_cast(): | |||
y = network(**x) | |||
if not isinstance(y, dict): | |||
raise TypeError( | |||
f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") | |||
@@ -780,8 +812,22 @@ class Trainer(object): | |||
For PyTorch, just do "loss.backward()" | |||
""" | |||
if (self.step-1) % self.update_every == 0: | |||
self.model.zero_grad() | |||
loss.backward() | |||
self._clear_grad(self.optimizer, self.set_grad_to_none) | |||
self.grad_scaler.scale(loss).backward() | |||
def _clear_grad(self, optimizer, set_to_none=True): | |||
param_groups = optimizer.param_groups | |||
for group in param_groups: | |||
for p in group['params']: | |||
if p.grad is not None: | |||
if set_to_none: | |||
p.grad = None | |||
else: | |||
if p.grad.grad_fn is not None: | |||
p.grad.detach_() | |||
else: | |||
p.grad.requires_grad_(False) | |||
p.grad.zero_() | |||
def _compute_loss(self, predict, truth): | |||
r"""Compute loss given prediction and ground truth. | |||
@@ -12,23 +12,20 @@ import inspect | |||
import os | |||
import warnings | |||
from collections import Counter, namedtuple | |||
from copy import deepcopy | |||
from typing import List | |||
import _pickle | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
from prettytable import PrettyTable | |||
from ._logger import logger | |||
from ._parallel_utils import _model_contains_inner_module | |||
# from .vocabulary import Vocabulary | |||
import torch | |||
import contextlib | |||
from pkg_resources import parse_version | |||
try: | |||
from apex import amp | |||
except: | |||
amp = None | |||
_CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', | |||
'varargs']) | |||
@@ -1032,8 +1029,92 @@ def sub_column(string: str, c: int, c_size: int, title: str) -> str: | |||
return res | |||
def _check_fp16(): | |||
if amp is None: | |||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") | |||
if not torch.backends.cudnn.enabled: | |||
raise RuntimeError("Amp requires cudnn backend to be enabled.") | |||
def _is_function_contains_autocast(func): | |||
""" | |||
检查func是否包含autocast,(1)是否使用了autocast的修饰器或, (2)使用使用with autocast()环境 | |||
:param func: 待检查的函数 | |||
""" | |||
import re | |||
source = inspect.getsource(func) | |||
lines = source.split('\n') | |||
for line in lines: | |||
line = line.strip() | |||
if re.search(r'@[\w\.]*autocast\(\)', line): | |||
raise RuntimeError("Please do not use `autocast()` decorator, use `with autocast():` instead. Please refer to" | |||
" https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process ") | |||
if re.search(r'with [\w\.]*autocast\(\):', line): | |||
return True | |||
return False | |||
class DummyGradScaler:
    """
    A no-op stand-in for ``torch.cuda.amp.GradScaler``, returned when fp16 is
    disabled so that the training loop can call the same scaler API without
    branching on the fp16 flag everywhere.
    """

    def __init__(self, *args, **kwargs):
        # Accept and ignore the real GradScaler's constructor arguments.
        pass

    def get_scale(self):
        # Neutral scale: multiplying by 1.0 leaves gradients untouched.
        return 1.0

    def is_enabled(self):
        return False

    def scale(self, outputs):
        # No scaling applied; hand the outputs back unchanged.
        return outputs

    def step(self, optimizer, *args, **kwargs):
        # Nothing to unscale or skip-check — just step the optimizer.
        optimizer.step(*args, **kwargs)

    def update(self, new_scale=None):
        pass

    def unscale_(self, optimizer):
        pass

    def load_state_dict(self, state_dict):
        pass

    def state_dict(self):
        # The dummy scaler has no state to checkpoint.
        return {}
def _build_fp16_env(dummy=False):
    """
    Build the ``(autocast, GradScaler)`` pair used for fp16 training.

    When ``dummy`` is True, no-op substitutes are returned
    (``contextlib.ExitStack`` as the context manager and
    :class:`DummyGradScaler` as the scaler) so the caller can keep a single
    code path whether or not fp16 is enabled.

    :param bool dummy: if True, return the no-op implementations.
    :return: a tuple ``(autocast, GradScaler)`` of callables/classes.
    :raises RuntimeError: if fp16 is requested but cuda is unavailable, or
        torch is older than 1.6 (no ``torch.cuda.amp``).
    """
    if dummy:
        return contextlib.ExitStack, DummyGradScaler

    if not torch.cuda.is_available():
        raise RuntimeError("No cuda")
    if torch.cuda.get_device_capability(0)[0] < 7:
        # Pre-Volta GPUs lack tensor cores; fp16 would likely be slower.
        warnings.warn(
            "NOTE: your device does NOT support faster training with fp16, "
            "please switch to FP32 which is likely to be faster"
        )
    try:
        from torch.cuda.amp import autocast, GradScaler
    except ImportError:
        raise RuntimeError("torch version too low (less than 1.6)")
    return autocast, GradScaler
def _can_use_fp16(device, model, func):
    """
    Validate that float16 (automatic mixed precision) can be used with the
    given device/model/forward-function combination.

    :param device: target device as passed to Trainer/Tester; may be None,
        a str, a torch.device, or a list of device ids.
    :param model: the model that will be run.
    :param func: the forward/predict function that autocast will wrap.
    :raises RuntimeError: if torch is older than 1.6, if execution would
        happen on CPU, or if parallel execution is requested while ``func``
        contains no ``with autocast():`` block.
    """
    if parse_version(torch.__version__) < parse_version('1.6'):
        raise RuntimeError("Pytorch supports float16 after version 1.6, please upgrade your pytorch version.")
    model_device = _get_model_device(model)
    # device=None means "run wherever the model already sits" — that must be cuda.
    if device is None and model_device is not None and model_device.type != 'cuda':
        raise RuntimeError("You have to run in cuda device to use fp16.")
    if isinstance(device, str):
        if device=='cpu':
            raise RuntimeError("You have to run in cuda device to use fp16.")
    if isinstance(device, torch.device) and device.type=='cpu':
        raise RuntimeError("You have to run in cuda device to use fp16.")

    if (_model_contains_inner_module(model) or (isinstance(device, list) and len(device) > 1)):
        # DataParallel case: autocast must be applied inside forward itself,
        # so remind the user if it is missing.
        if not _is_function_contains_autocast(func):
            raise RuntimeError("When use fp16 in Parallel Training, you have to set autocast() in your forward "
                               "function as described in "
                               "https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process")
@@ -125,7 +125,7 @@ class Vocabulary(object): | |||
r"""依次增加序列中词在词典中的出现频率 | |||
:param list word_lst: a list of strings | |||
:param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
:param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 | |||
的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 | |||
加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 | |||
@@ -142,7 +142,7 @@ class Vocabulary(object): | |||
增加一个新词在词典中的出现频率 | |||
:param str word: 新词 | |||
:param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
:param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 | |||
的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 | |||
加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 | |||
@@ -175,7 +175,7 @@ class Vocabulary(object): | |||
增加一个新词在词典中的出现频率 | |||
:param str word: 新词 | |||
:param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
:param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 | |||
的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 | |||
加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 | |||
@@ -190,7 +190,7 @@ class Vocabulary(object): | |||
依次增加序列中词在词典中的出现频率 | |||
:param list[str] word_lst: 词的序列 | |||
:param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
:param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 | |||
的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 | |||
加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 | |||
@@ -344,7 +344,7 @@ class Vocabulary(object): | |||
:param str,List[str] field_name: 可为 ``str`` 或 ``List[str]`` . | |||
构建词典所使用的 field(s), 支持一个或多个field,若有多个 DataSet, 每个DataSet都必须有这些field. 目前支持的field结构 | |||
: ``str`` , ``List[str]`` | |||
:param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain | |||
:param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认), 建议直接将非训练数据都传入到这个参数。该选项用在接下来的模型会使用pretrain | |||
的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev | |||
中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 | |||
如果一个词出现在了train中,但是没在预训练模型中,embedding会为它用unk初始化,但它是单独的一个vector,如果 | |||
@@ -108,7 +108,7 @@ class BertEmbedding(ContextualEmbedding): | |||
self._word_sep_index = vocab['[SEP]'] | |||
self._word_cls_index = -100 | |||
if '[CLS]' in vocab: | |||
self._word_cls_index = vocab['CLS'] | |||
self._word_cls_index = vocab['[CLS]'] | |||
min_freq = kwargs.get('min_freq', 1) | |||
self._min_freq = min_freq | |||
@@ -281,7 +281,9 @@ class StaticEmbedding(TokenEmbedding): | |||
if word in vocab: | |||
index = vocab.to_index(word) | |||
if index in matrix: | |||
warnings.warn(f"Word:{word} occurs again in line:{idx}(starts from 0)") | |||
warnings.warn(f"Word has more than one vector in embedding file. Set logger level to " | |||
f"DEBUG for detail.") | |||
logger.debug(f"Word:{word} occurs again in line:{idx}(starts from 0)") | |||
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | |||
if self.only_norm_found_vector: | |||
matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) | |||
@@ -34,3 +34,56 @@ class NaiveClassifier(BaseModel): | |||
def predict(self, x): | |||
return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
class NaiveClassifier2(BaseModel):
    r"""
    A simple classifier example, usable in all kinds of tests: a small MLP
    whose ``predict`` thresholds the sigmoid output at 0.5.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_dims = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_dims)

    def forward(self, x):
        logits = self.mlp(x)
        return {"predict": logits}

    def predict(self, x):
        probs = torch.sigmoid(self.mlp(x))
        return {"predict": probs > 0.5}
class NaiveClassifier3(BaseModel):
    r"""
    A simple classifier example, usable in all kinds of tests. Both methods
    are wrapped with the ``autocast()`` *decorator* (deliberately, to test
    that this usage is rejected by the fp16 checks).
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_dims = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_dims)

    @torch.cuda.amp.autocast()
    def forward(self, x):
        logits = self.mlp(x)
        return {"predict": logits}

    @torch.cuda.amp.autocast()
    def predict(self, x):
        probs = torch.sigmoid(self.mlp(x))
        return {"predict": probs > 0.5}
class NaiveClassifier4(BaseModel):
    r"""
    A simple classifier example, usable in all kinds of tests. Both methods
    use a ``with autocast():`` context, the form required for fp16 under
    DataParallel.
    """

    def __init__(self, in_feature_dim, out_feature_dim):
        super().__init__()
        layer_dims = [in_feature_dim, in_feature_dim, out_feature_dim]
        self.mlp = MLP(layer_dims)

    def forward(self, x):
        with torch.cuda.amp.autocast():
            logits = self.mlp(x)
            return {"predict": logits}

    def predict(self, x):
        with torch.cuda.amp.autocast():
            probs = torch.sigmoid(self.mlp(x))
            return {"predict": probs > 0.5}
@@ -44,3 +44,11 @@ class TestSampler(unittest.TestCase): | |||
indices = sampler(data_set) | |||
self.assertEqual(len(indices), 10) | |||
# 跑通即可,不验证效果 | |||
    def test_ConstantTokenNumSampler(self):
        # TODO: what needs checking is whether the batches end up close to
        # each other in total token number.
        pass
    def test_ConstTokenNumSampler(self):
        # TODO: what needs checking is whether it can run end-to-end at all.
        pass
@@ -9,12 +9,12 @@ import torch | |||
from fastNLP import DataSet | |||
from fastNLP import Instance | |||
from fastNLP import BCELoss | |||
from fastNLP import BCELoss, BCEWithLogits | |||
from fastNLP import CrossEntropyLoss | |||
from fastNLP import AccuracyMetric | |||
from fastNLP import SGD | |||
from fastNLP import Trainer | |||
from fastNLP.models.base_model import NaiveClassifier | |||
from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4 | |||
from fastNLP import TorchLoaderIter | |||
@@ -575,3 +575,83 @@ class TrainerTestGround(unittest.TestCase): | |||
) | |||
trainer.train() | |||
""" | |||
class Fp16TrainerTest(unittest.TestCase):
    """Tests for fp16 (mixed precision) training support in Trainer."""

    def test_raise_error(self):
        # fp16 training must be rejected whenever no cuda device is usable:
        # no device given (model stays on CPU), device='cpu', or a CPU
        # torch.device.
        data_set = prepare_fake_dataset()
        data_set.set_input("x", flag=True)
        data_set.set_target("y", flag=True)

        train_set, dev_set = data_set.split(0.3)

        model = NaiveClassifier2(2, 1)

        with self.assertRaises(RuntimeError):
            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                              use_tqdm=True, check_code_level=2, fp16=True)

        with self.assertRaises(RuntimeError):
            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                              use_tqdm=True, check_code_level=2, fp16=True, device='cpu')

        with self.assertRaises(RuntimeError):
            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                              use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu'))

    # Fixed: was `torch.cuda.is_available()==False` with a garbled message.
    @unittest.skipIf(not torch.cuda.is_available(), "Skip when no cuda device is detected.")
    def test_run_fp16(self):
        # fp16 training on a single GPU, with and without fp16 evaluation.
        data_set = prepare_fake_dataset()
        data_set.set_input("x", flag=True)
        data_set.set_target("y", flag=True)

        train_set, dev_set = data_set.split(0.3)

        model = NaiveClassifier2(2, 1)
        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                          use_tqdm=True, check_code_level=2, fp16=True, device=0)
        trainer.train(load_best_model=False)

        model = NaiveClassifier2(2, 1)
        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                          use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False)
        trainer.train(load_best_model=False)

    # Fixed: message said "lower than 1 gpus" while the guard is `< 2`.
    @unittest.skipIf(torch.cuda.device_count() < 2, "Skip when fewer than 2 gpus are available.")
    def test_run_data_parallel(self):
        # Under DataParallel, fp16 requires a `with autocast():` block inside
        # forward; plain models and decorator-based models must raise.
        data_set = prepare_fake_dataset()
        data_set.set_input("x", flag=True)
        data_set.set_target("y", flag=True)

        train_set, dev_set = data_set.split(0.3)

        model = NaiveClassifier2(2, 1)
        with self.assertRaises(RuntimeError):
            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                              use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1])

        with self.assertRaises(RuntimeError):
            # autocast() used as a decorator is explicitly rejected.
            model = NaiveClassifier3(2, 1)
            trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                              batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                              metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                              use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True)

        # `with autocast():` inside forward is the supported form.
        model = NaiveClassifier4(2, 1)
        trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                          batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                          metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                          use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True)
        trainer.train(load_best_model=False)