diff --git a/docs/source/tutorials/tutorial_2_vocabulary.rst b/docs/source/tutorials/tutorial_2_vocabulary.rst index 777ee63a..e8855d99 100644 --- a/docs/source/tutorials/tutorial_2_vocabulary.rst +++ b/docs/source/tutorials/tutorial_2_vocabulary.rst @@ -86,7 +86,7 @@ fastNLP中的Vocabulary # 将验证集或者测试集在建立词表是放入no_create_entry_dataset这个参数中。 vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) -:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` , 建议在添加来自于测试集和验证集的词的时候将该参数置为True, 或将验证集和测试集 +:class:`~fastNLP.Vocabulary` 中的 `no_create_entry` ,如果您并不关心具体的原理,您可以直接采取以下的建议:在添加来自于非训练集的词的时候将该参数置为True, 或将非训练集数据 传入 `no_create_entry_dataset` 参数。它们的意义是在接下来的模型会使用pretrain的embedding(包括glove, word2vec, elmo与bert)且会finetune的 情况下,如果仅使用来自于train的数据建立vocabulary,会导致只出现在test与dev中的词语无法充分利用到来自于预训练embedding的信息(因为他们 会被认为是unk),所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 56808bff..d2963d13 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -62,6 +62,7 @@ __all__ = [ "CrossEntropyLoss", "L1Loss", "BCELoss", + "BCEWithLogits", "NLLLoss", "LossInForward", "CMRC2018Loss", @@ -98,7 +99,7 @@ from .dataset import DataSet from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder from .instance import Instance from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, \ - LossInForward, CMRC2018Loss, LossBase, MSELoss + LossInForward, CMRC2018Loss, LossBase, MSELoss, BCEWithLogits from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric, MetricBase,\ ConfusionMatrixMetric from .optimizer import Optimizer, SGD, Adam, AdamW diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index e02e04a0..91c888df 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -86,7 +86,6 @@ except: from .dataset import DataSet from .tester import Tester from ._logger import logger -from .utils import _check_fp16 from ._parallel_utils import _model_contains_inner_module try: @@ -94,11 +93,6 @@ try: except: pass -try: - from apex import amp -except: - amp = None - class Callback(object): r""" @@ -123,6 +117,20 @@ class Callback(object): 该属性可以通过self.trainer获取到,一般情况下不需要使用这个属性。 """ return self._trainer + + @property + def grad_scaler(self): + r""" + float16的gradient scaler + """ + return self._trainer.grad_scaler + + @property + def auto_cast(self): + r""" + float16用的auto cast环境 + """ + return self._trainer.auto_cast @property def step(self): @@ -472,14 +480,9 @@ class GradientClipCallback(Callback): def on_backward_end(self): if self.step%self.update_every==0: - if self.parameters is None: - if getattr(self.trainer, 'fp16', ''): - _check_fp16() - self.clip_fun(amp.master_params(self.optimizer), self.clip_value) - else: - self.clip_fun(self.model.parameters(), self.clip_value) - else: - self.clip_fun(self.parameters, self.clip_value) + if self.trainer.fp16: + self.grad_scaler.unscale_(self.optimizer) + self.clip_fun(self.parameters, self.clip_value) class EarlyStopCallback(Callback): @@ -569,10 +572,10 @@ class FitlogCallback(Callback): if len(self.datasets) > 0: for key, data in self.datasets.items(): tester = Tester(data=data, model=self.model, - batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), + batch_size=self.trainer.kwargs.get('dev_batch_size', self.trainer.batch_size), metrics=self.trainer.metrics, verbose=0, - use_tqdm=self.trainer.test_use_tqdm, + use_tqdm=self.trainer.kwargs.get('test_use_tqdm', self.trainer.use_tqdm), 
sampler=self.trainer.kwargs.get('test_sampler', None)) self.testers[key] = tester fitlog.add_progress(total_steps=self.n_steps) @@ -948,6 +951,7 @@ class CheckPointCallback(Callback): model = model.module model.load_state_dict(states['model']) self.optimizer.load_state_dict(states['optimizer']) + self.grad_scaler.load_state_dict(states['grad_scaler']) self.trainer.epoch = states['epoch'] + 1 # 因为是结束储存的,所以需要从下一个epoch开始 self.trainer.step = states['step'] if 'best_dev_epoch' in states: diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index bd7ba423..2f6dffbb 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -29,7 +29,6 @@ from .dataset import DataSet from .losses import _prepare_losser from .optimizer import Optimizer from .utils import _build_args -from .utils import _check_fp16 from .utils import _get_func_signature from .utils import _move_dict_value_to_device diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index a821d85a..642c8ef3 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -10,6 +10,7 @@ __all__ = [ "CrossEntropyLoss", "BCELoss", + "BCEWithLogits", "L1Loss", "NLLLoss", "MSELoss", @@ -311,6 +312,25 @@ class BCELoss(LossBase): return F.binary_cross_entropy(input=pred, target=target, reduction=self.reduction) +class BCEWithLogits(LossBase): + r""" + 二分类交叉熵损失函数, 传入数据之前不需要做sigmoid操作 + + :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` + :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` + :param str reduction: 支持 `mean` ,`sum` 和 `none` . + """ + + def __init__(self, pred=None, target=None, reduction='mean'): + super(BCEWithLogits, self).__init__() + self._init_param_map(pred=pred, target=target) + assert reduction in ('mean', 'sum', 'none') + self.reduction = reduction + + def get_loss(self, pred, target): + return F.binary_cross_entropy_with_logits(input=pred, target=target, reduction=self.reduction) + + class NLLLoss(LossBase): r""" 负对数似然损失函数 diff --git a/fastNLP/core/sampler.py b/fastNLP/core/sampler.py index f5d60ebb..329de742 100644 --- a/fastNLP/core/sampler.py +++ b/fastNLP/core/sampler.py @@ -112,6 +112,108 @@ class BucketSampler(Sampler): return list(chain(*batchs)) +class ConstTokenNumSampler(Sampler): + """ + 尽量保证每个batch的输入token数量是接近的。 + + 使用示例 + >>> # 假设已经有了tr_data并有一个field叫做seq_len保存了每个instance的token数量 + >>> from fastNLP import DataSetIter, Trainer + >>> sampler = ConstTokenNumSampler('src_seq_len', max_token=4096) + >>> + >>> # 直接将sampler传入Trainer中,此时batch_size参数的值会被忽略 + >>> trainer = Trainer(tr_data, model, optimizer=optimizer, loss=TranslationLoss(), + >>> batch_size=1, sampler=sampler, drop_last=False, update_every=1) + """ + def __init__(self, seq_len_field_name, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1): + """ + + :param List[int] seq_len_field_name: 哪个field指示的sample的长度 + :param int max_token: 每个batch的最大的token数量 + :param int max_sentence: 每个batch最多多少个instance, -1表示根据max_token决定 + :param int need_be_multiple_of: 生成的batch的instance的数量需要是几的倍数,在DataParallel场景下会用到 + :param int num_bucket: 将数据按长度拆分为num_bucket个bucket,batch中的sample尽量在bucket之中进行组合,这样可以减少padding。 + """ + assert (max_sentence!=-1 and max_sentence>=need_be_multiple_of) or max_sentence<1 + self.seq_len_field_name = seq_len_field_name + self.num_bucket = num_bucket + self.max_token = max_token + self._max_sentence = max_sentence + self.need_be_multiple_of = need_be_multiple_of + + def __call__(self, data_set): + assert len(data_set)>self.num_bucket, "The number of samples 
should be larger than buckets." + seq_len = data_set.get_field(self.seq_len_field_name) + self.seq_len = seq_len + seq_len_indice = [(length, i) for i, length in enumerate(seq_len)] + seq_len_indice.sort(key=lambda x: x[0]) + indice_in_buckets = [] + if self.num_bucket>0: + sample_per_bucket = len(seq_len_indice)//self.num_bucket + i = 0 + while len(indice_in_buckets)<len(seq_len_indice): + indice_in_buckets.append(seq_len_indice[i*sample_per_bucket:(i+1)*sample_per_bucket]) + i += 1 + else: + indice_in_buckets = [seq_len_indice] + self.indice_in_buckets = indice_in_buckets + self.get_new_order() + + @property + def max_sentence(self): + if self._max_sentence<1: + return 100000000 + return self._max_sentence + + @max_sentence.setter + def max_sentence(self, max_sentence): + self._max_sentence = max_sentence + + def get_new_order(self): + np.random.shuffle(self.indice_in_buckets) + for bucket in self.indice_in_buckets: + np.random.shuffle(bucket) + indices = list(chain(*self.indice_in_buckets)) + batches = [] + cur_max_len = 0 + batch = [] + for length, i in indices: + max_len = max(length, cur_max_len) + if max_len*(len(batch)+1)>self.max_token or len(batch)>=self.max_sentence: + left_sample = len(batch) % self.need_be_multiple_of + add_samples = batch.copy() + cur_max_len = length + if left_sample!=0: + add_samples = add_samples[:-left_sample] + batch = batch[-left_sample:] + cur_max_len = max(cur_max_len, max(batch)) + else: + batch = [] + if len(add_samples)==0: + raise RuntimeError(f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.") + batches.append(add_samples) + else: + cur_max_len = max_len + batch.append(i) + if batch: + left_sample = len(batch) % self.need_be_multiple_of + add_samples = batch.copy() + if left_sample != 0: + add_samples = add_samples[:-left_sample].copy() + if add_samples: + batches.append(add_samples) + np.random.shuffle(batches) + self.batches = batches + + def __iter__(self): + for batch in self.batches: + yield batch + self.get_new_order() + + def __len__(self): + return len(self.batches) + + class ConstantTokenNumSampler: """ 尽量保证每个batch的输入token数量是接近的。 @@ -119,7 +221,7 @@ class ConstantTokenNumSampler: 使用示例 >>> # 假设已经有了tr_data并有一个field叫做seq_len保存了每个instance的token数量 >>> from fastNLP import DataSetIter, Trainer - >>> sampler = BatchSampler(tr_data.get_field('seq_len').content, max_token=4096) + >>> sampler = ConstantTokenNumSampler(tr_data.get_field('seq_len').content, max_token=4096) >>> tr_iter = DataSetIter(tr_data, >>> batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False, >>> drop_last=False, timeout=0, worker_init_fn=None, @@ -128,7 +230,6 @@ class ConstantTokenNumSampler: >>> # 直接将tr_iter传入Trainer中,此时batch_size参数的值会被忽略 >>> trainer = Trainer(tr_iter, model, optimizer=optimizer, loss=TranslationLoss(), >>> batch_size=1, sampler=None, drop_last=False, update_every=1) - """ def __init__(self, seq_len, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1): """ diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index abb39c56..4cf83fac 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -53,6 +53,8 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device +from .utils import _build_fp16_env +from .utils import _can_use_fp16 from ._parallel_utils import _data_parallel_wrapper from ._parallel_utils import _model_contains_inner_module from functools import partial @@ -70,7 +72,7 @@ class Tester(object): """ def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True, - **kwargs): + fp16=False, **kwargs): r""" :param ~fastNLP.DataSet,~fastNLP.BatchIter data: 需要测试的数据集 @@ -93,7 +95,9 @@ 如果模型是通过predict()进行预测的话,那么将不能使用多卡(DataParallel)进行验证,只会使用第一张卡上的模型。 :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 :param bool use_tqdm: 是否使用tqdm来显示测试进度; 如果为False,则不会显示任何内容。 - :param kwargs: 支持传入sampler控制测试顺序 + :param bool fp16: 是否使用float16进行验证 + :param kwargs: + Sampler sampler: 支持传入sampler控制测试顺序 """ super(Tester, self).__init__() @@ -147,7 +151,11 @@ class Tester(object): else: self._predict_func = self._model.forward self._predict_func_wrapper = self._model.forward - + +
if fp16: + _can_use_fp16(model=model, device=device, func=self._predict_func) + self.auto_cast, _grad_scaler = _build_fp16_env(not fp16) + def test(self): r"""开始进行验证,并返回验证结果。 @@ -172,12 +180,13 @@ class Tester(object): for batch_x, batch_y in data_iterator: _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) - pred_dict = self._data_forward(self._predict_func, batch_x) - if not isinstance(pred_dict, dict): - raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " - f"must be `dict`, got {type(pred_dict)}.") - for metric in self.metrics: - metric(pred_dict, batch_y) + with self.auto_cast(): + pred_dict = self._data_forward(self._predict_func, batch_x) + if not isinstance(pred_dict, dict): + raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " + f"must be `dict`, got {type(pred_dict)}.") + for metric in self.metrics: + metric(pred_dict, batch_y) if self.use_tqdm: pbar.update() diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 72aba38a..d9731217 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -342,7 +342,7 @@ from .losses import _prepare_losser from .metrics import _prepare_metrics from .optimizer import Optimizer from .sampler import Sampler -from .sampler import RandomSampler +from .sampler import RandomSampler, ConstTokenNumSampler from .tester import Tester from .utils import _CheckError from .utils import _build_args @@ -352,6 +352,8 @@ from .utils import _move_dict_value_to_device from .utils import _get_func_signature from .utils import _get_model_device from .utils import _move_model_to_device +from .utils import _build_fp16_env +from .utils import _can_use_fp16 from ._parallel_utils import _model_contains_inner_module from ._logger import logger @@ -373,7 +375,7 @@ class Trainer(object): num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, validate_every=-1, save_path=None, use_tqdm=True, device=None, - callbacks=None, check_code_level=0, **kwargs): + callbacks=None, check_code_level=0, fp16=False, **kwargs): r""" :param train_data: 训练集, :class:`~fastNLP.DataSet` 类型或 :class:`~fastNLP.BatchIter` 的子类 :param nn.modules model: 待训练的模型 @@ -422,9 +424,14 @@ class Trainer(object): 报告警告信息; 2: 有任何field没有被使用都报错. 检查的原理是通过使用很小的batch(默认2个sample)来运行代码,但是 这个过程理论上不会修改任何参数,只是会检查能否运行。但如果(1)模型中存在将batch_size写为某个固定值的情况; (2)模型中存在累加前向计算次数的,可能会多计算1次。以上情况建议将check_code_level设置为-1。 + :param bool fp16: 是否使用fp16进行训练。 :param kwargs: 支持配置可选参数 bool test_use_tqdm: 在dev上验证的时候是否开启tqdm Sampler test_sampler: 在evaluate的时候使用的sampler + bool test_use_fp16: evalute的时候是否使用fp16测试,默认与fp16相同的取值。 + bool set_grad_to_none: 在zero_grad的时候是否将gradient设置为None,而不是设置为zero + GradScaler grad_scaler: 仅在fp16为True时有效,如果不使用torch.cuda.amp.GradScaler的初始化参数,可传入一个已经初始化后的 + grad_scaler。 """ super(Trainer, self).__init__() if not isinstance(model, nn.Module): @@ -488,6 +495,15 @@ class Trainer(object): sampler = RandomSampler() elif hasattr(sampler, 'set_batch_size'): sampler.set_batch_size(batch_size) + if isinstance(sampler, ConstTokenNumSampler): # 直接使用固定token数量的Sampler + assert isinstance(train_data, + DataSet), f"When sampler is `ConstTokenNumSampler`, the train_data must" \ + f" be `DataSet`." 
+ sampler(train_data) + train_data = DataSetIter(train_data, + batch_size=1, sampler=None, as_numpy=False, num_workers=num_workers, + pin_memory=False, drop_last=drop_last, timeout=0, worker_init_fn=None, + batch_sampler=sampler) if isinstance(train_data, DataSet): self.data_iterator = DataSetIter(dataset=train_data, batch_size=batch_size, sampler=sampler, @@ -505,6 +521,21 @@ class Trainer(object): self._forward_func = self.model.module.forward else: self._forward_func = self.model.forward + + self.fp16 = fp16 + + # check fp16相关的设置 + self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16) + if self.fp16: + _can_use_fp16(device=device, model=model, func=self._forward_func) + grad_scaler = kwargs.get('grad_scaler', None) + if grad_scaler is not None: + self.grad_scaler = grad_scaler + else: + self.grad_scaler = _grad_scaler() + self.test_use_fp16 = kwargs.get('test_use_fp16', fp16) + self.set_grad_to_none = kwargs.get('set_grad_to_none', True) + if check_code_level > -1: # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的field名与模型的输入 # 名是否匹配 @@ -553,10 +584,7 @@ class Trainer(object): self.logger = logger self.use_tqdm = use_tqdm - if 'test_use_tqdm' in kwargs: - self.test_use_tqdm = kwargs.get('test_use_tqdm') - else: - self.test_use_tqdm = self.use_tqdm + self.test_use_tqdm = kwargs.get('test_use_tqdm', self.use_tqdm) self.pbar = None self.print_every = abs(self.print_every) self.kwargs = kwargs @@ -568,7 +596,8 @@ class Trainer(object): device=None, # 由上面的部分处理device verbose=0, use_tqdm=self.test_use_tqdm, - sampler=kwargs.get('test_sampler', None)) + sampler=kwargs.get('test_sampler', None), + fp16=self.test_use_fp16) self.start_time = None # start timestamp @@ -677,7 +706,8 @@ class Trainer(object): # edit prediction self.callback_manager.on_loss_begin(batch_y, prediction) - loss = self._compute_loss(prediction, batch_y).mean() + with self.auto_cast(): + loss = self._compute_loss(prediction, batch_y).mean() loss = loss / self.update_every avg_loss += loss.item() @@ -762,11 +792,13 @@ class Trainer(object): """ if self.step % self.update_every == 0: - self.optimizer.step() + self.grad_scaler.step(self.optimizer) + self.grad_scaler.update() def _data_forward(self, network, x): x = _build_args(self._forward_func, **x) - y = network(**x) + with self.auto_cast(): + y = network(**x) if not isinstance(y, dict): raise TypeError( f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") @@ -780,8 +812,22 @@ class Trainer(object): For PyTorch, just do "loss.backward()" """ if (self.step-1) % self.update_every == 0: - self.model.zero_grad() - loss.backward() + self._clear_grad(self.optimizer, self.set_grad_to_none) + self.grad_scaler.scale(loss).backward() + + def _clear_grad(self, optimizer, set_to_none=True): + param_groups = optimizer.param_groups + for group in param_groups: + for p in group['params']: + if p.grad is not None: + if set_to_none: + p.grad = None + else: + if p.grad.grad_fn is not None: + p.grad.detach_() + else: + p.grad.requires_grad_(False) + p.grad.zero_() def _compute_loss(self, predict, truth): r"""Compute loss given prediction and ground truth. 
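Note (not part of the patch): the mixed-precision update these Trainer changes implement — forward and loss under auto_cast, backward through grad_scaler.scale(loss), unscale_ before clipping in GradientClipCallback, then grad_scaler.step/update — follows the standard torch.cuda.amp recipe. A minimal self-contained sketch of that cycle, assuming a CUDA device; the toy model, data and hyper-parameters are placeholders, not fastNLP code:

import torch
from torch import nn
from torch.cuda.amp import autocast, GradScaler

model = nn.Linear(2, 1).cuda()                      # placeholder model
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.BCEWithLogitsLoss()
scaler = GradScaler()                               # same role as Trainer.grad_scaler

for _ in range(10):
    x = torch.randn(32, 2, device='cuda')           # placeholder batch
    y = (x.sum(dim=1, keepdim=True) > 0).float()
    optimizer.zero_grad()                           # cf. Trainer._clear_grad (can also set grads to None)
    with autocast():                                # cf. Trainer.auto_cast around forward and loss
        loss = loss_fn(model(x), y)
    scaler.scale(loss).backward()                   # cf. Trainer._grad_backward
    scaler.unscale_(optimizer)                      # cf. GradientClipCallback.on_backward_end before clipping
    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
    scaler.step(optimizer)                          # cf. Trainer._update; step is skipped on inf/nan gradients
    scaler.update()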
diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 212a31e6..bccc0813 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -12,23 +12,20 @@ import inspect import os import warnings from collections import Counter, namedtuple -from copy import deepcopy from typing import List import _pickle import numpy as np -import torch import torch.nn as nn from prettytable import PrettyTable from ._logger import logger from ._parallel_utils import _model_contains_inner_module # from .vocabulary import Vocabulary +import torch +import contextlib +from pkg_resources import parse_version -try: - from apex import amp -except: - amp = None _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', 'varargs']) @@ -1032,8 +1029,92 @@ def sub_column(string: str, c: int, c_size: int, title: str) -> str: return res -def _check_fp16(): - if amp is None: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - if not torch.backends.cudnn.enabled: - raise RuntimeError("Amp requires cudnn backend to be enabled.") +def _is_function_contains_autocast(func): + """ + 检查func是否包含autocast,(1)是否使用了autocast的修饰器或, (2)使用使用with autocast()环境 + + :param func: 待检查的函数 + """ + import re + source = inspect.getsource(func) + lines = source.split('\n') + for line in lines: + line = line.strip() + if re.search(r'@[\w\.]*autocast\(\)', line): + raise RuntimeError("Please do not use `autocast()` decorator, use `with autocast():` instead. Please refer to" + " https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process ") + if re.search(r'with [\w\.]*autocast\(\):', line): + return True + return False + + +class DummyGradScaler: + """ + 用于Dummy pytorch的GradScaler对象,防止重复写大量的if判断 + + """ + def __init__(self, *args, **kwargs): + pass + + def get_scale(self): + return 1.0 + + def is_enabled(self): + return False + + def scale(self, outputs): + return outputs + + def step(self, optimizer, *args, **kwargs): + optimizer.step(*args, **kwargs) + + def update(self, new_scale=None): + pass + + def unscale_(self, optimizer): + pass + + def load_state_dict(self, state_dict): + pass + + def state_dict(self): + return {} + + +def _build_fp16_env(dummy=False): + if dummy: + autocast = contextlib.ExitStack + GradScaler = DummyGradScaler + else: + if not torch.cuda.is_available(): + raise RuntimeError("No cuda") + if torch.cuda.get_device_capability(0)[0] < 7: + warnings.warn( + "NOTE: your device does NOT support faster training with fp16, " + "please switch to FP32 which is likely to be faster" + ) + try: + from torch.cuda.amp import autocast, GradScaler + except ImportError: + raise RuntimeError("torch version too low (less than 1.6)") + return autocast, GradScaler + + +def _can_use_fp16(device, model, func): + if parse_version(torch.__version__) < parse_version('1.6'): + raise RuntimeError("Pytorch supports float16 after version 1.6, please upgrade your pytorch version.") + model_device = _get_model_device(model) + if device is None and model_device is not None and model_device.type != 'cuda': + raise RuntimeError("You have to run in cuda device to use fp16.") + if isinstance(device, str): + if device=='cpu': + raise RuntimeError("You have to run in cuda device to use fp16.") + if isinstance(device, torch.device) and device.type=='cpu': + raise RuntimeError("You have to run in cuda device to use fp16.") + + if (_model_contains_inner_module(model) or (isinstance(device, list) and len(device) > 1)): + # 需要提醒用户 + 
if not _is_function_contains_autocast(func): + raise RuntimeError("When use fp16 in Parallel Training, you have to set autocast() in your forward " + "function as described in " + "https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process") diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 0b010c02..aef99034 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -125,7 +125,7 @@ class Vocabulary(object): r"""依次增加序列中词在词典中的出现频率 :param list word_lst: a list of strings - :param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 + :param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 @@ -142,7 +142,7 @@ class Vocabulary(object): 增加一个新词在词典中的出现频率 :param str word: 新词 - :param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 + :param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 @@ -175,7 +175,7 @@ class Vocabulary(object): 增加一个新词在词典中的出现频率 :param str word: 新词 - :param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 + :param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 @@ -190,7 +190,7 @@ class Vocabulary(object): 依次增加序列中词在词典中的出现频率 :param list[str] word_lst: 词的序列 - :param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 + :param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 @@ -344,7 +344,7 @@ class Vocabulary(object): :param str,List[str] field_name: 可为 ``str`` 或 ``List[str]`` . 构建词典所使用的 field(s), 支持一个或多个field,若有多个 DataSet, 每个DataSet都必须有这些field. 
目前支持的field结构 : ``str`` , ``List[str]`` - :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain + :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认), 建议直接将非训练数据都传入到这个参数。该选项用在接下来的模型会使用pretrain 的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev 中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 如果一个词出现在了train中,但是没在预训练模型中,embedding会为它用unk初始化,但它是单独的一个vector,如果 diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 694e6b52..29b17c65 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -108,7 +108,7 @@ class BertEmbedding(ContextualEmbedding): self._word_sep_index = vocab['[SEP]'] self._word_cls_index = -100 if '[CLS]' in vocab: - self._word_cls_index = vocab['CLS'] + self._word_cls_index = vocab['[CLS]'] min_freq = kwargs.get('min_freq', 1) self._min_freq = min_freq diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 5f0e8eb0..2f57af47 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -281,7 +281,9 @@ class StaticEmbedding(TokenEmbedding): if word in vocab: index = vocab.to_index(word) if index in matrix: - warnings.warn(f"Word:{word} occurs again in line:{idx}(starts from 0)") + warnings.warn(f"Word has more than one vector in embedding file. Set logger level to " + f"DEBUG for detail.") + logger.debug(f"Word:{word} occurs again in line:{idx}(starts from 0)") matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) if self.only_norm_found_vector: matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index f83f768f..80fffb30 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -34,3 +34,56 @@ class NaiveClassifier(BaseModel): def predict(self, x): return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} + + +class NaiveClassifier2(BaseModel): + r""" + 一个简单的分类器例子,可用于各种测试 + """ + + def __init__(self, in_feature_dim, out_feature_dim): + super(NaiveClassifier2, self).__init__() + self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) + + def forward(self, x): + return {"predict": self.mlp(x)} + + def predict(self, x): + return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} + + +class NaiveClassifier3(BaseModel): + r""" + 一个简单的分类器例子,可用于各种测试 + """ + + def __init__(self, in_feature_dim, out_feature_dim): + super(NaiveClassifier3, self).__init__() + self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) + + @torch.cuda.amp.autocast() + def forward(self, x): + return {"predict": self.mlp(x)} + + @torch.cuda.amp.autocast() + def predict(self, x): + return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} + + +class NaiveClassifier4(BaseModel): + r""" + 一个简单的分类器例子,可用于各种测试 + """ + + def __init__(self, in_feature_dim, out_feature_dim): + super(NaiveClassifier4, self).__init__() + self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) + + def forward(self, x): + with torch.cuda.amp.autocast(): + return {"predict": self.mlp(x)} + + + def predict(self, x): + with torch.cuda.amp.autocast(): + return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} \ No newline at end of file diff --git a/tests/core/test_sampler.py b/tests/core/test_sampler.py index 703a9428..40d196f0 100644 --- a/tests/core/test_sampler.py +++ b/tests/core/test_sampler.py @@ -44,3 +44,11 @@ 
class TestSampler(unittest.TestCase): indices = sampler(data_set) self.assertEqual(len(indices), 10) # 跑通即可,不验证效果 + + def test_ConstantTokenNumSampler(self): + # 需要check的是,是否在number上是接近的 + pass + + def test_ConstTokenNumSampler(self): + # 需要check的是,是否可以直接运行 + pass diff --git a/tests/core/test_trainer.py b/tests/core/test_trainer.py index 01767738..d0d462da 100644 --- a/tests/core/test_trainer.py +++ b/tests/core/test_trainer.py @@ -9,12 +9,12 @@ import torch from fastNLP import DataSet from fastNLP import Instance -from fastNLP import BCELoss +from fastNLP import BCELoss, BCEWithLogits from fastNLP import CrossEntropyLoss from fastNLP import AccuracyMetric from fastNLP import SGD from fastNLP import Trainer -from fastNLP.models.base_model import NaiveClassifier +from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4 from fastNLP import TorchLoaderIter @@ -575,3 +575,83 @@ class TrainerTestGround(unittest.TestCase): ) trainer.train() """ + + +class Fp16TrainerTest(unittest.TestCase): + def test_raise_error(self): + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + train_set, dev_set = data_set.split(0.3) + + model = NaiveClassifier2(2, 1) + + with self.assertRaises(RuntimeError): + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True) + + with self.assertRaises(RuntimeError): + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device='cpu') + + with self.assertRaises(RuntimeError): + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu')) + + @unittest.skipIf(torch.cuda.is_available()==False, "Skip when no cuda device detch") + def test_run_fp16(self): + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + train_set, dev_set = data_set.split(0.3) + + model = NaiveClassifier2(2, 1) + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=0) + trainer.train(load_best_model=False) + + model = NaiveClassifier2(2, 1) + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False) + trainer.train(load_best_model=False) + + @unittest.skipIf(torch.cuda.device_count()<2, "Skip when lower than 1 gpus.") + def 
test_run_data_parallel(self): + data_set = prepare_fake_dataset() + data_set.set_input("x", flag=True) + data_set.set_target("y", flag=True) + + train_set, dev_set = data_set.split(0.3) + + model = NaiveClassifier2(2, 1) + with self.assertRaises(RuntimeError): + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1]) + + with self.assertRaises(RuntimeError): + model = NaiveClassifier3(2, 1) + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) + + model = NaiveClassifier4(2, 1) + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) + trainer.train(load_best_model=False)
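
Note (not part of the patch): the user-facing pieces added here — the fp16 flag on Trainer/Tester, the BCEWithLogits loss, and NaiveClassifier2 — can be exercised end-to-end roughly as below. This mirrors Fp16TrainerTest above; the toy dataset and hyper-parameters are illustrative only, and fp16=True requires a CUDA device (device=0 here).

import numpy as np
from fastNLP import DataSet, Instance, Trainer, SGD, AccuracyMetric, BCEWithLogits
from fastNLP.models.base_model import NaiveClassifier2

# Toy 2-feature binary classification task; y is a 1-element list so the target
# shape matches the (batch_size, 1) logits returned by NaiveClassifier2.forward.
ds = DataSet([Instance(x=[float(a), float(b)], y=[float(a + b > 0)])
              for a, b in np.random.randn(200, 2)])
ds.set_input("x")
ds.set_target("y")
train_set, dev_set = ds.split(0.3)

model = NaiveClassifier2(2, 1)
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1),
                  loss=BCEWithLogits(pred="predict", target="y"),   # raw logits go straight into the loss
                  metrics=AccuracyMetric(pred="predict", target="y"),
                  dev_data=dev_set, batch_size=32, n_epochs=3,
                  device=0, fp16=True)                               # native torch.cuda.amp mixed precision
trainer.train()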