| @@ -86,7 +86,7 @@ fastNLP中的Vocabulary | |||
# Pass the dev set and/or test set through the no_create_entry_dataset parameter when building the vocabulary.
| vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data]) | |||
The `no_create_entry` flag of :class:`~fastNLP.Vocabulary`: it is recommended to set it to True when adding words that come from the test or dev set, or to pass the dev and test sets
The `no_create_entry` flag of :class:`~fastNLP.Vocabulary`: if you do not care about the underlying mechanism, simply follow this advice: set it to True when adding words that come from non-training data, or pass the non-training data
through the `no_create_entry_dataset` parameter. These options matter when the downstream model uses a pretrained embedding (glove, word2vec, elmo or bert) and fine-tunes it:
if the vocabulary is built only from the training data, words that appear exclusively in test or dev cannot make full use of the pretrained embedding (they
would be treated as unk), so taking test and dev into account when building the vocabulary leads to better final results.
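A minimal usage sketch of this recommendation (tr_data, dev_data and test_data are assumed to be existing DataSet objects with a 'chars' field):
from fastNLP import Vocabulary
vocab = Vocabulary()
# words from the training set create normal entries; words seen only in dev/test are registered
# with no_create_entry=True, so they keep pointing to their pretrained vectors instead of unk
vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data, test_data])
vocab.index_dataset(tr_data, dev_data, test_data, field_name='chars')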
| @@ -62,6 +62,7 @@ __all__ = [ | |||
| "CrossEntropyLoss", | |||
| "L1Loss", | |||
| "BCELoss", | |||
| "BCEWithLogits", | |||
| "NLLLoss", | |||
| "LossInForward", | |||
| "CMRC2018Loss", | |||
| @@ -98,7 +99,7 @@ from .dataset import DataSet | |||
| from .field import FieldArray, Padder, AutoPadder, EngChar2DPadder | |||
| from .instance import Instance | |||
| from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, \ | |||
| LossInForward, CMRC2018Loss, LossBase, MSELoss | |||
| LossInForward, CMRC2018Loss, LossBase, MSELoss, BCEWithLogits | |||
| from .metrics import AccuracyMetric, SpanFPreRecMetric, CMRC2018Metric, ClassifyFPreRecMetric, MetricBase,\ | |||
| ConfusionMatrixMetric | |||
| from .optimizer import Optimizer, SGD, Adam, AdamW | |||
| @@ -86,7 +86,6 @@ except: | |||
| from .dataset import DataSet | |||
| from .tester import Tester | |||
| from ._logger import logger | |||
| from .utils import _check_fp16 | |||
| from ._parallel_utils import _model_contains_inner_module | |||
| try: | |||
| @@ -94,11 +93,6 @@ try: | |||
| except: | |||
| pass | |||
| try: | |||
| from apex import amp | |||
| except: | |||
| amp = None | |||
| class Callback(object): | |||
| r""" | |||
| @@ -123,6 +117,20 @@ class Callback(object): | |||
This attribute can be accessed through self.trainer; in most cases it is not needed.
| """ | |||
| return self._trainer | |||
| @property | |||
| def grad_scaler(self): | |||
| r""" | |||
The gradient scaler used for float16 training.
| """ | |||
| return self._trainer.grad_scaler | |||
| @property | |||
| def auto_cast(self): | |||
| r""" | |||
The autocast context manager used for float16 training.
| """ | |||
| return self._trainer.auto_cast | |||
| @property | |||
| def step(self): | |||
| @@ -472,14 +480,9 @@ class GradientClipCallback(Callback): | |||
| def on_backward_end(self): | |||
| if self.step%self.update_every==0: | |||
| if self.parameters is None: | |||
| if getattr(self.trainer, 'fp16', ''): | |||
| _check_fp16() | |||
| self.clip_fun(amp.master_params(self.optimizer), self.clip_value) | |||
| else: | |||
| self.clip_fun(self.model.parameters(), self.clip_value) | |||
| else: | |||
| self.clip_fun(self.parameters, self.clip_value) | |||
| if self.trainer.fp16: | |||
| self.grad_scaler.unscale_(self.optimizer) | |||
| self.clip_fun(self.parameters, self.clip_value) | |||
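# Hedged usage sketch, not part of the library: with fp16 enabled the callback now unscales the gradients
# through the trainer's GradScaler before clipping, so it is configured exactly as in fp32 training.
def _gradient_clip_fp16_example(train_data, model, optimizer, loss):
    from fastNLP import Trainer  # imported lazily here only to keep this sketch self-contained
    return Trainer(train_data, model, optimizer=optimizer, loss=loss, fp16=True,
                   callbacks=[GradientClipCallback(clip_type='norm', clip_value=5)])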
| class EarlyStopCallback(Callback): | |||
| @@ -569,10 +572,10 @@ class FitlogCallback(Callback): | |||
| if len(self.datasets) > 0: | |||
| for key, data in self.datasets.items(): | |||
| tester = Tester(data=data, model=self.model, | |||
| batch_size=self.trainer.kwargs.get('dev_batch_size', self.batch_size), | |||
| batch_size=self.trainer.kwargs.get('dev_batch_size', self.trainer.batch_size), | |||
| metrics=self.trainer.metrics, | |||
| verbose=0, | |||
| use_tqdm=self.trainer.test_use_tqdm, | |||
| use_tqdm=self.trainer.kwargs.get('test_use_tqdm', self.trainer.use_tqdm), | |||
| sampler=self.trainer.kwargs.get('test_sampler', None)) | |||
| self.testers[key] = tester | |||
| fitlog.add_progress(total_steps=self.n_steps) | |||
| @@ -948,6 +951,7 @@ class CheckPointCallback(Callback): | |||
| model = model.module | |||
| model.load_state_dict(states['model']) | |||
| self.optimizer.load_state_dict(states['optimizer']) | |||
| self.grad_scaler.load_state_dict(states['grad_scaler']) | |||
self.trainer.epoch = states['epoch'] + 1  # the checkpoint was saved at the end of the epoch, so training resumes from the next one
| self.trainer.step = states['step'] | |||
| if 'best_dev_epoch' in states: | |||
| @@ -29,7 +29,6 @@ from .dataset import DataSet | |||
| from .losses import _prepare_losser | |||
| from .optimizer import Optimizer | |||
| from .utils import _build_args | |||
| from .utils import _check_fp16 | |||
| from .utils import _get_func_signature | |||
| from .utils import _move_dict_value_to_device | |||
| @@ -10,6 +10,7 @@ __all__ = [ | |||
| "CrossEntropyLoss", | |||
| "BCELoss", | |||
| "BCEWithLogits", | |||
| "L1Loss", | |||
| "NLLLoss", | |||
| "MSELoss", | |||
| @@ -311,6 +312,25 @@ class BCELoss(LossBase): | |||
| return F.binary_cross_entropy(input=pred, target=target, reduction=self.reduction) | |||
| class BCEWithLogits(LossBase): | |||
| r""" | |||
Binary cross-entropy loss computed on logits; do not apply sigmoid to the input before passing it in.
:param pred: mapping for `pred` in the parameter map; None means the mapping is `pred` -> `pred`
:param target: mapping for `target` in the parameter map; None means the mapping is `target` -> `target`
:param str reduction: one of `mean`, `sum` and `none`.
| """ | |||
| def __init__(self, pred=None, target=None, reduction='mean'): | |||
| super(BCEWithLogits, self).__init__() | |||
| self._init_param_map(pred=pred, target=target) | |||
| assert reduction in ('mean', 'sum', 'none') | |||
| self.reduction = reduction | |||
| def get_loss(self, pred, target): | |||
| return F.binary_cross_entropy_with_logits(input=pred, target=target, reduction=self.reduction) | |||
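# Hedged usage sketch, not part of the library: the loss consumes raw scores, mirroring
# F.binary_cross_entropy_with_logits, so no sigmoid should be applied to `pred` beforehand.
def _bce_with_logits_example():
    import torch
    logits = torch.randn(4, 1)                    # raw model outputs
    target = torch.randint(0, 2, (4, 1)).float()  # binary targets as floats
    loss_fn = BCEWithLogits(pred='pred', target='target')
    return loss_fn.get_loss(logits, target)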
| class NLLLoss(LossBase): | |||
| r""" | |||
Negative log-likelihood loss
| @@ -112,6 +112,108 @@ class BucketSampler(Sampler): | |||
| return list(chain(*batchs)) | |||
| class ConstTokenNumSampler(Sampler): | |||
| """ | |||
Tries to keep the total number of input tokens in every batch roughly constant.
Usage example
>>> # assume tr_data already exists and has a field named seq_len holding the token count of each instance
>>> from fastNLP import DataSetIter, Trainer
>>> sampler = ConstTokenNumSampler('seq_len', max_token=4096)
| >>> | |||
>>> # pass the sampler directly to Trainer; the value of the batch_size argument is then ignored
| >>> trainer = Trainer(tr_data, model, optimizer=optimizer, loss=TranslationLoss(), | |||
| >>> batch_size=1, sampler=sampler, drop_last=False, update_every=1) | |||
| """ | |||
| def __init__(self, seq_len_field_name, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1): | |||
| """ | |||
:param str seq_len_field_name: name of the field that stores the length of each sample
:param int max_token: maximum number of tokens per batch
:param int max_sentence: maximum number of instances per batch; -1 means it is determined by max_token alone
:param int need_be_multiple_of: the number of instances in each generated batch must be a multiple of this value; useful under DataParallel
:param int num_bucket: split the data by length into num_bucket buckets; samples are combined within a bucket as far as possible, which reduces padding.
| """ | |||
| assert (max_sentence!=-1 and max_sentence>=need_be_multiple_of) or max_sentence<1 | |||
| self.seq_len_field_name = seq_len_field_name | |||
| self.num_bucket = num_bucket | |||
| self.max_token = max_token | |||
| self._max_sentence = max_sentence | |||
| self.need_be_multiple_of = need_be_multiple_of | |||
| def __call__(self, data_set): | |||
assert len(data_set)>self.num_bucket, "The number of samples should be larger than the number of buckets."
| seq_len = data_set.get_field(self.seq_len_field_name) | |||
| self.seq_len = seq_len | |||
| seq_len_indice = [(length, i) for i, length in enumerate(seq_len)] | |||
| seq_len_indice.sort(key=lambda x: x[0]) | |||
| indice_in_buckets = [] | |||
| if self.num_bucket>0: | |||
| sample_per_bucket = len(seq_len_indice)//self.num_bucket | |||
| i = 0 | |||
| while len(indice_in_buckets)<len(seq_len_indice): | |||
| indice_in_buckets.append(seq_len_indice[i*sample_per_bucket:(i+1)*sample_per_bucket]) | |||
| i += 1 | |||
| else: | |||
| indice_in_buckets = [seq_len_indice] | |||
| self.indice_in_buckets = indice_in_buckets | |||
| self.get_new_order() | |||
| @property | |||
| def max_sentence(self): | |||
| if self._max_sentence<1: | |||
| return 100000000 | |||
| return self._max_sentence | |||
| @max_sentence.setter | |||
| def max_sentence(self, max_sentence): | |||
| self._max_sentence = max_sentence | |||
| def get_new_order(self): | |||
| np.random.shuffle(self.indice_in_buckets) | |||
| for bucket in self.indice_in_buckets: | |||
| np.random.shuffle(bucket) | |||
| indices = list(chain(*self.indice_in_buckets)) | |||
| batches = [] | |||
| cur_max_len = 0 | |||
| batch = [] | |||
| for length, i in indices: | |||
| max_len = max(length, cur_max_len) | |||
| if max_len*(len(batch)+1)>self.max_token or len(batch)>=self.max_sentence: | |||
| left_sample = len(batch) % self.need_be_multiple_of | |||
| add_samples = batch.copy() | |||
cur_max_len = length
| if left_sample!=0: | |||
| add_samples = add_samples[:-left_sample] | |||
| batch = batch[-left_sample:] | |||
cur_max_len = max(cur_max_len, max(self.seq_len[b] for b in batch))  # batch stores indices, so map them back to their lengths
| else: | |||
| batch = [] | |||
| if len(add_samples)==0: | |||
| raise RuntimeError(f"The sample `{i}` is too long to make a batch with {self.need_be_multiple_of} samples.") | |||
| batches.append(add_samples) | |||
| else: | |||
| cur_max_len = max_len | |||
| batch.append(i) | |||
| if batch: | |||
| left_sample = len(batch) % self.need_be_multiple_of | |||
| add_samples = batch.copy() | |||
| if left_sample != 0: | |||
| add_samples = add_samples[:-left_sample].copy() | |||
| if add_samples: | |||
| batches.append(add_samples) | |||
| np.random.shuffle(batches) | |||
| self.batches = batches | |||
| def __iter__(self): | |||
| for batch in self.batches: | |||
| yield batch | |||
| self.get_new_order() | |||
| def __len__(self): | |||
| return len(self.batches) | |||
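# Hedged sketch, not part of the library (tr_data is assumed to be an existing DataSet with a 'seq_len' field):
# besides passing the sampler straight to Trainer, it can be bound manually and handed to DataSetIter as a batch_sampler.
def _const_token_num_sampler_example(tr_data):
    from fastNLP import DataSetIter
    sampler = ConstTokenNumSampler('seq_len', max_token=4096)
    sampler(tr_data)  # bind the sampler to the DataSet and build the first set of batches
    return DataSetIter(tr_data, batch_size=1, sampler=None, batch_sampler=sampler)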
| class ConstantTokenNumSampler: | |||
| """ | |||
Tries to keep the total number of input tokens in every batch roughly constant.
| @@ -119,7 +221,7 @@ class ConstantTokenNumSampler: | |||
Usage example
>>> # assume tr_data already exists and has a field named seq_len holding the token count of each instance
| >>> from fastNLP import DataSetIter, Trainer | |||
| >>> sampler = BatchSampler(tr_data.get_field('seq_len').content, max_token=4096) | |||
| >>> sampler = ConstantTokenNumSampler(tr_data.get_field('seq_len').content, max_token=4096) | |||
| >>> tr_iter = DataSetIter(tr_data, | |||
| >>> batch_size=1, sampler=None, as_numpy=False, num_workers=0, pin_memory=False, | |||
| >>> drop_last=False, timeout=0, worker_init_fn=None, | |||
| @@ -128,7 +230,6 @@ class ConstantTokenNumSampler: | |||
>>> # pass tr_iter directly to Trainer; the value of the batch_size argument is then ignored
| >>> trainer = Trainer(tr_iter, model, optimizer=optimizer, loss=TranslationLoss(), | |||
| >>> batch_size=1, sampler=None, drop_last=False, update_every=1) | |||
| """ | |||
| def __init__(self, seq_len, max_token=4096, max_sentence=-1, need_be_multiple_of=1, num_bucket=-1): | |||
| """ | |||
| @@ -53,6 +53,8 @@ from .utils import _move_dict_value_to_device | |||
| from .utils import _get_func_signature | |||
| from .utils import _get_model_device | |||
| from .utils import _move_model_to_device | |||
| from .utils import _build_fp16_env | |||
| from .utils import _can_use_fp16 | |||
| from ._parallel_utils import _data_parallel_wrapper | |||
| from ._parallel_utils import _model_contains_inner_module | |||
| from functools import partial | |||
| @@ -70,7 +72,7 @@ class Tester(object): | |||
| """ | |||
| def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1, use_tqdm=True, | |||
| **kwargs): | |||
| fp16=False, **kwargs): | |||
| r""" | |||
:param ~fastNLP.DataSet,~fastNLP.BatchIter data: the dataset to evaluate on
| @@ -93,7 +95,9 @@ class Tester(object): | |||
If the model makes predictions through predict(), multi-GPU (DataParallel) evaluation is not possible and only the model on the first card is used.
:param int verbose: 0 means no output at all; 1 means the evaluation results are printed.
:param bool use_tqdm: whether to show evaluation progress with tqdm; if False, nothing is displayed.
:param kwargs: a sampler can be passed in to control the evaluation order
:param bool fp16: whether to run evaluation in float16
:param kwargs:
Sampler sampler: a sampler can be passed in to control the evaluation order
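Usage sketch (dev_data, model and metric are assumed to exist; fp16 requires a cuda device):
>>> tester = Tester(data=dev_data, model=model, metrics=metric, batch_size=32, device=0, fp16=True)
>>> eval_results = tester.test()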
| """ | |||
| super(Tester, self).__init__() | |||
| @@ -147,7 +151,11 @@ class Tester(object): | |||
| else: | |||
| self._predict_func = self._model.forward | |||
| self._predict_func_wrapper = self._model.forward | |||
| if fp16: | |||
| _can_use_fp16(model=model, device=device, func=self._predict_func) | |||
| self.auto_cast, _grad_scaler = _build_fp16_env(not fp16) | |||
| def test(self): | |||
| r"""开始进行验证,并返回验证结果。 | |||
| @@ -172,12 +180,13 @@ class Tester(object): | |||
| for batch_x, batch_y in data_iterator: | |||
| _move_dict_value_to_device(batch_x, batch_y, device=self._model_device) | |||
| pred_dict = self._data_forward(self._predict_func, batch_x) | |||
| if not isinstance(pred_dict, dict): | |||
| raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " | |||
| f"must be `dict`, got {type(pred_dict)}.") | |||
| for metric in self.metrics: | |||
| metric(pred_dict, batch_y) | |||
| with self.auto_cast(): | |||
| pred_dict = self._data_forward(self._predict_func, batch_x) | |||
| if not isinstance(pred_dict, dict): | |||
| raise TypeError(f"The return value of {_get_func_signature(self._predict_func)} " | |||
| f"must be `dict`, got {type(pred_dict)}.") | |||
| for metric in self.metrics: | |||
| metric(pred_dict, batch_y) | |||
| if self.use_tqdm: | |||
| pbar.update() | |||
| @@ -342,7 +342,7 @@ from .losses import _prepare_losser | |||
| from .metrics import _prepare_metrics | |||
| from .optimizer import Optimizer | |||
| from .sampler import Sampler | |||
| from .sampler import RandomSampler | |||
| from .sampler import RandomSampler, ConstTokenNumSampler | |||
| from .tester import Tester | |||
| from .utils import _CheckError | |||
| from .utils import _build_args | |||
| @@ -352,6 +352,8 @@ from .utils import _move_dict_value_to_device | |||
| from .utils import _get_func_signature | |||
| from .utils import _get_model_device | |||
| from .utils import _move_model_to_device | |||
| from .utils import _build_fp16_env | |||
| from .utils import _can_use_fp16 | |||
| from ._parallel_utils import _model_contains_inner_module | |||
| from ._logger import logger | |||
| @@ -373,7 +375,7 @@ class Trainer(object): | |||
| num_workers=0, n_epochs=10, print_every=5, | |||
| dev_data=None, metrics=None, metric_key=None, | |||
| validate_every=-1, save_path=None, use_tqdm=True, device=None, | |||
| callbacks=None, check_code_level=0, **kwargs): | |||
| callbacks=None, check_code_level=0, fp16=False, **kwargs): | |||
| r""" | |||
:param train_data: the training set, a :class:`~fastNLP.DataSet` or a subclass of :class:`~fastNLP.BatchIter`
:param nn.modules model: the model to be trained
| @@ -422,9 +424,14 @@ class Trainer(object): | |||
a warning is reported; 2: an error is raised whenever any field goes unused. The check works by running the code with a very small batch (2 samples by default);
in theory this does not modify any parameters, it only verifies that the code can run. However, if (1) the model hard-codes batch_size to a fixed value, or
(2) the model accumulates a forward-pass counter, one extra forward pass may be counted. In these cases it is recommended to set check_code_level to -1.
:param bool fp16: whether to train with fp16.
:param kwargs: optional configuration parameters
bool test_use_tqdm: whether to enable tqdm when validating on the dev set
Sampler test_sampler: the sampler used during evaluation
bool test_use_fp16: whether to use fp16 during evaluation; defaults to the same value as fp16.
bool set_grad_to_none: whether to set gradients to None instead of zero when clearing them
GradScaler grad_scaler: only effective when fp16 is True; if the default initialization of torch.cuda.amp.GradScaler is not desired, an already
initialized grad_scaler can be passed in.
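Usage sketch of the fp16-related options (train_data, model, optimizer, loss, dev_data and metric are assumed to exist; a cuda device is required):
>>> from torch.cuda.amp import GradScaler
>>> trainer = Trainer(train_data, model, optimizer=optimizer, loss=loss, dev_data=dev_data, metrics=metric,
>>>                   device=0, fp16=True, test_use_fp16=False, grad_scaler=GradScaler(init_scale=2.**14))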
| """ | |||
| super(Trainer, self).__init__() | |||
| if not isinstance(model, nn.Module): | |||
| @@ -488,6 +495,15 @@ class Trainer(object): | |||
| sampler = RandomSampler() | |||
| elif hasattr(sampler, 'set_batch_size'): | |||
| sampler.set_batch_size(batch_size) | |||
if isinstance(sampler, ConstTokenNumSampler):  # use the constant-token-number sampler directly
| assert isinstance(train_data, | |||
| DataSet), f"When sampler is `ConstTokenNumSampler`, the train_data must" \ | |||
| f" be `DataSet`." | |||
| sampler(train_data) | |||
| train_data = DataSetIter(train_data, | |||
| batch_size=1, sampler=None, as_numpy=False, num_workers=num_workers, | |||
| pin_memory=False, drop_last=drop_last, timeout=0, worker_init_fn=None, | |||
| batch_sampler=sampler) | |||
| if isinstance(train_data, DataSet): | |||
| self.data_iterator = DataSetIter(dataset=train_data, batch_size=batch_size, sampler=sampler, | |||
| @@ -505,6 +521,21 @@ class Trainer(object): | |||
| self._forward_func = self.model.module.forward | |||
| else: | |||
| self._forward_func = self.model.forward | |||
| self.fp16 = fp16 | |||
# check the fp16-related settings
| self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16) | |||
| if self.fp16: | |||
| _can_use_fp16(device=device, model=model, func=self._forward_func) | |||
| grad_scaler = kwargs.get('grad_scaler', None) | |||
| if grad_scaler is not None: | |||
| self.grad_scaler = grad_scaler | |||
| else: | |||
| self.grad_scaler = _grad_scaler() | |||
| self.test_use_fp16 = kwargs.get('test_use_fp16', fp16) | |||
| self.set_grad_to_none = kwargs.get('set_grad_to_none', True) | |||
| if check_code_level > -1: | |||
# _check_code is how fastNLP helps you verify that your code is correct. If you see this comment in an error traceback, carefully check
# that your field names match the input names of the model
| @@ -553,10 +584,7 @@ class Trainer(object): | |||
| self.logger = logger | |||
| self.use_tqdm = use_tqdm | |||
| if 'test_use_tqdm' in kwargs: | |||
| self.test_use_tqdm = kwargs.get('test_use_tqdm') | |||
| else: | |||
| self.test_use_tqdm = self.use_tqdm | |||
| self.test_use_tqdm = kwargs.get('test_use_tqdm', self.use_tqdm) | |||
| self.pbar = None | |||
| self.print_every = abs(self.print_every) | |||
| self.kwargs = kwargs | |||
| @@ -568,7 +596,8 @@ class Trainer(object): | |||
| device=None, # 由上面的部分处理device | |||
| verbose=0, | |||
| use_tqdm=self.test_use_tqdm, | |||
| sampler=kwargs.get('test_sampler', None)) | |||
| sampler=kwargs.get('test_sampler', None), | |||
| fp16=self.test_use_fp16) | |||
| self.start_time = None # start timestamp | |||
| @@ -677,7 +706,8 @@ class Trainer(object): | |||
| # edit prediction | |||
| self.callback_manager.on_loss_begin(batch_y, prediction) | |||
| loss = self._compute_loss(prediction, batch_y).mean() | |||
| with self.auto_cast(): | |||
| loss = self._compute_loss(prediction, batch_y).mean() | |||
| loss = loss / self.update_every | |||
| avg_loss += loss.item() | |||
| @@ -762,11 +792,13 @@ class Trainer(object): | |||
| """ | |||
| if self.step % self.update_every == 0: | |||
| self.optimizer.step() | |||
| self.grad_scaler.step(self.optimizer) | |||
| self.grad_scaler.update() | |||
| def _data_forward(self, network, x): | |||
| x = _build_args(self._forward_func, **x) | |||
| y = network(**x) | |||
| with self.auto_cast(): | |||
| y = network(**x) | |||
| if not isinstance(y, dict): | |||
| raise TypeError( | |||
| f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") | |||
| @@ -780,8 +812,22 @@ class Trainer(object): | |||
| For PyTorch, just do "loss.backward()" | |||
| """ | |||
| if (self.step-1) % self.update_every == 0: | |||
| self.model.zero_grad() | |||
| loss.backward() | |||
| self._clear_grad(self.optimizer, self.set_grad_to_none) | |||
| self.grad_scaler.scale(loss).backward() | |||
| def _clear_grad(self, optimizer, set_to_none=True): | |||
| param_groups = optimizer.param_groups | |||
| for group in param_groups: | |||
| for p in group['params']: | |||
| if p.grad is not None: | |||
| if set_to_none: | |||
| p.grad = None | |||
| else: | |||
| if p.grad.grad_fn is not None: | |||
| p.grad.detach_() | |||
| else: | |||
| p.grad.requires_grad_(False) | |||
| p.grad.zero_() | |||
| def _compute_loss(self, predict, truth): | |||
| r"""Compute loss given prediction and ground truth. | |||
| @@ -12,23 +12,20 @@ import inspect | |||
| import os | |||
| import warnings | |||
| from collections import Counter, namedtuple | |||
| from copy import deepcopy | |||
| from typing import List | |||
| import _pickle | |||
| import numpy as np | |||
| import torch | |||
| import torch.nn as nn | |||
| from prettytable import PrettyTable | |||
| from ._logger import logger | |||
| from ._parallel_utils import _model_contains_inner_module | |||
| # from .vocabulary import Vocabulary | |||
| import torch | |||
| import contextlib | |||
| from pkg_resources import parse_version | |||
| try: | |||
| from apex import amp | |||
| except: | |||
| amp = None | |||
| _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed', | |||
| 'varargs']) | |||
| @@ -1032,8 +1029,92 @@ def sub_column(string: str, c: int, c_size: int, title: str) -> str: | |||
| return res | |||
| def _check_fp16(): | |||
| if amp is None: | |||
| raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") | |||
| if not torch.backends.cudnn.enabled: | |||
| raise RuntimeError("Amp requires cudnn backend to be enabled.") | |||
| def _is_function_contains_autocast(func): | |||
| """ | |||
Check whether func involves autocast: (1) whether it is decorated with autocast, or (2) whether it uses a `with autocast():` block.
:param func: the function to check
| """ | |||
| import re | |||
| source = inspect.getsource(func) | |||
| lines = source.split('\n') | |||
| for line in lines: | |||
| line = line.strip() | |||
| if re.search(r'@[\w\.]*autocast\(\)', line): | |||
| raise RuntimeError("Please do not use `autocast()` decorator, use `with autocast():` instead. Please refer to" | |||
| " https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process ") | |||
| if re.search(r'with [\w\.]*autocast\(\):', line): | |||
| return True | |||
| return False | |||
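# Hedged illustration, not part of the library: a forward whose body sits inside a `with autocast():` block passes the check above.
def _autocast_usage_example(x, mlp):
    from torch.cuda.amp import autocast
    with autocast():  # matched by the `with ...autocast():` pattern, so the check returns True
        return {'pred': mlp(x)}
# _is_function_contains_autocast(_autocast_usage_example) returns True, whereas a forward decorated
# with the autocast decorator raises the RuntimeError above.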
| class DummyGradScaler: | |||
| """ | |||
A dummy stand-in for PyTorch's GradScaler, which saves callers from writing a large number of if-branches.
| """ | |||
| def __init__(self, *args, **kwargs): | |||
| pass | |||
| def get_scale(self): | |||
| return 1.0 | |||
| def is_enabled(self): | |||
| return False | |||
| def scale(self, outputs): | |||
| return outputs | |||
| def step(self, optimizer, *args, **kwargs): | |||
| optimizer.step(*args, **kwargs) | |||
| def update(self, new_scale=None): | |||
| pass | |||
| def unscale_(self, optimizer): | |||
| pass | |||
| def load_state_dict(self, state_dict): | |||
| pass | |||
| def state_dict(self): | |||
| return {} | |||
| def _build_fp16_env(dummy=False): | |||
| if dummy: | |||
| autocast = contextlib.ExitStack | |||
| GradScaler = DummyGradScaler | |||
| else: | |||
| if not torch.cuda.is_available(): | |||
| raise RuntimeError("No cuda") | |||
| if torch.cuda.get_device_capability(0)[0] < 7: | |||
| warnings.warn( | |||
| "NOTE: your device does NOT support faster training with fp16, " | |||
| "please switch to FP32 which is likely to be faster" | |||
| ) | |||
| try: | |||
| from torch.cuda.amp import autocast, GradScaler | |||
| except ImportError: | |||
| raise RuntimeError("torch version too low (less than 1.6)") | |||
| return autocast, GradScaler | |||
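# Hedged sketch, not part of the library, of how the returned pair is consumed (model, batch and optimizer are assumed to exist);
# with dummy=True the context manager is a no-op and the scaler degrades to a plain optimizer.step().
def _fp16_env_example(model, batch, optimizer, use_fp16=True):
    auto_cast, grad_scaler_cls = _build_fp16_env(dummy=not use_fp16)
    scaler = grad_scaler_cls()
    with auto_cast():
        loss = model(**batch)['loss']  # hypothetical forward that returns a dict containing 'loss'
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()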
| def _can_use_fp16(device, model, func): | |||
| if parse_version(torch.__version__) < parse_version('1.6'): | |||
| raise RuntimeError("Pytorch supports float16 after version 1.6, please upgrade your pytorch version.") | |||
| model_device = _get_model_device(model) | |||
| if device is None and model_device is not None and model_device.type != 'cuda': | |||
| raise RuntimeError("You have to run in cuda device to use fp16.") | |||
| if isinstance(device, str): | |||
| if device=='cpu': | |||
| raise RuntimeError("You have to run in cuda device to use fp16.") | |||
| if isinstance(device, torch.device) and device.type=='cpu': | |||
| raise RuntimeError("You have to run in cuda device to use fp16.") | |||
| if (_model_contains_inner_module(model) or (isinstance(device, list) and len(device) > 1)): | |||
# remind the user: under parallel training the forward pass itself must run inside autocast
| if not _is_function_contains_autocast(func): | |||
| raise RuntimeError("When use fp16 in Parallel Training, you have to set autocast() in your forward " | |||
| "function as described in " | |||
| "https://pytorch.org/docs/stable/notes/amp_examples.html#dataparallel-in-a-single-process") | |||
| @@ -125,7 +125,7 @@ class Vocabulary(object): | |||
| r"""依次增加序列中词在词典中的出现频率 | |||
| :param list word_lst: a list of strings | |||
| :param bool no_create_entry: 在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
| :param bool no_create_entry: 如果词语来自于非训练集建议设置为True。在使用fastNLP.TokenEmbedding加载预训练模型时,没有从预训练词表中找到这个词的处理方式。 | |||
| 如果为True,则不会有这个词语创建一个单独的entry,它将一直被指向unk的表示; 如果为False,则为这个词创建一个单独 | |||
| 的entry。如果这个word来自于dev或者test,一般设置为True,如果来自与train一般设置为False。以下两种情况: 如果新 | |||
| 加入一个word,且no_create_entry为True,但这个词之前已经在Vocabulary中且并不是no_create_entry的,则还是会为这 | |||
| @@ -142,7 +142,7 @@ class Vocabulary(object): | |||
Increase the frequency count of a new word in the vocabulary
:param str word: the new word
:param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
:param bool no_create_entry: set this to True if the word comes from data other than the training set. It also controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
If True, no separate entry is created for the word and it keeps pointing to the unk representation; if False, a separate entry is created
for it. If the word comes from dev or test, True is usually appropriate; if it comes from train, use False. Two special cases: if a word is
newly added with no_create_entry=True but it already exists in the Vocabulary and was not no_create_entry, it will still
| @@ -175,7 +175,7 @@ class Vocabulary(object): | |||
Increase the frequency count of a new word in the vocabulary
:param str word: the new word
:param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
:param bool no_create_entry: set this to True if the word comes from data other than the training set. It also controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
If True, no separate entry is created for the word and it keeps pointing to the unk representation; if False, a separate entry is created
for it. If the word comes from dev or test, True is usually appropriate; if it comes from train, use False. Two special cases: if a word is
newly added with no_create_entry=True but it already exists in the Vocabulary and was not no_create_entry, it will still
| @@ -190,7 +190,7 @@ class Vocabulary(object): | |||
Increase the frequency count of every word in the sequence, one by one
:param list[str] word_lst: the sequence of words
:param bool no_create_entry: how to handle a word that is not found in the pretrained vocabulary when loading pretrained weights with fastNLP.TokenEmbedding.
:param bool no_create_entry: set this to True if the word comes from data other than the training set. It also controls how a word that is not found in the pretrained vocabulary is handled when loading pretrained weights with fastNLP.TokenEmbedding.
If True, no separate entry is created for the word and it keeps pointing to the unk representation; if False, a separate entry is created
for it. If the word comes from dev or test, True is usually appropriate; if it comes from train, use False. Two special cases: if a word is
newly added with no_create_entry=True but it already exists in the Vocabulary and was not no_create_entry, it will still
| @@ -344,7 +344,7 @@ class Vocabulary(object): | |||
:param str,List[str] field_name: may be a ``str`` or a ``List[str]``.
The field(s) used to build the vocabulary; one or more fields are supported. If multiple DataSets are given, every DataSet must have these fields. Currently supported field structures
are ``str`` and ``List[str]``.
:param no_create_entry_dataset: a DataSet, a List[DataSet] or None (default). Intended for the case where the downstream model uses a pretrained
:param no_create_entry_dataset: a DataSet, a List[DataSet] or None (default); it is recommended to simply pass all non-training data through this parameter. Intended for the case where the downstream model uses a pretrained
embedding (glove, word2vec, elmo or bert) and fine-tunes it. If the vocabulary is built only from the training data, the data in test and dev
cannot make full use of the pretrained embedding, so taking test and dev into account when building the vocabulary leads to better final results.
If a word appears in train but not in the pretrained model, the embedding initializes it as unk, yet it is a separate vector; if
| @@ -108,7 +108,7 @@ class BertEmbedding(ContextualEmbedding): | |||
| self._word_sep_index = vocab['[SEP]'] | |||
| self._word_cls_index = -100 | |||
| if '[CLS]' in vocab: | |||
| self._word_cls_index = vocab['CLS'] | |||
| self._word_cls_index = vocab['[CLS]'] | |||
| min_freq = kwargs.get('min_freq', 1) | |||
| self._min_freq = min_freq | |||
| @@ -281,7 +281,9 @@ class StaticEmbedding(TokenEmbedding): | |||
| if word in vocab: | |||
| index = vocab.to_index(word) | |||
| if index in matrix: | |||
| warnings.warn(f"Word:{word} occurs again in line:{idx}(starts from 0)") | |||
| warnings.warn(f"Word has more than one vector in embedding file. Set logger level to " | |||
| f"DEBUG for detail.") | |||
| logger.debug(f"Word:{word} occurs again in line:{idx}(starts from 0)") | |||
| matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) | |||
| if self.only_norm_found_vector: | |||
| matrix[index] = matrix[index] / np.linalg.norm(matrix[index]) | |||
| @@ -34,3 +34,56 @@ class NaiveClassifier(BaseModel): | |||
| def predict(self, x): | |||
| return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
| class NaiveClassifier2(BaseModel): | |||
| r""" | |||
A simple example classifier that can be used in all kinds of tests
| """ | |||
| def __init__(self, in_feature_dim, out_feature_dim): | |||
| super(NaiveClassifier2, self).__init__() | |||
| self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||
| def forward(self, x): | |||
| return {"predict": self.mlp(x)} | |||
| def predict(self, x): | |||
| return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
| class NaiveClassifier3(BaseModel): | |||
| r""" | |||
A simple example classifier that can be used in all kinds of tests
| """ | |||
| def __init__(self, in_feature_dim, out_feature_dim): | |||
| super(NaiveClassifier3, self).__init__() | |||
| self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||
| @torch.cuda.amp.autocast() | |||
| def forward(self, x): | |||
| return {"predict": self.mlp(x)} | |||
| @torch.cuda.amp.autocast() | |||
| def predict(self, x): | |||
| return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
| class NaiveClassifier4(BaseModel): | |||
| r""" | |||
A simple example classifier that can be used in all kinds of tests
| """ | |||
| def __init__(self, in_feature_dim, out_feature_dim): | |||
| super(NaiveClassifier4, self).__init__() | |||
| self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim]) | |||
| def forward(self, x): | |||
| with torch.cuda.amp.autocast(): | |||
| return {"predict": self.mlp(x)} | |||
| def predict(self, x): | |||
| with torch.cuda.amp.autocast(): | |||
| return {"predict": torch.sigmoid(self.mlp(x)) > 0.5} | |||
| @@ -44,3 +44,11 @@ class TestSampler(unittest.TestCase): | |||
| indices = sampler(data_set) | |||
| self.assertEqual(len(indices), 10) | |||
# it only needs to run through; the result itself is not verified
| def test_ConstantTokenNumSampler(self): | |||
# what should be checked: whether the token counts per batch are close to each other
| pass | |||
| def test_ConstTokenNumSampler(self): | |||
# what should be checked: whether it can run directly
| pass | |||
| @@ -9,12 +9,12 @@ import torch | |||
| from fastNLP import DataSet | |||
| from fastNLP import Instance | |||
| from fastNLP import BCELoss | |||
| from fastNLP import BCELoss, BCEWithLogits | |||
| from fastNLP import CrossEntropyLoss | |||
| from fastNLP import AccuracyMetric | |||
| from fastNLP import SGD | |||
| from fastNLP import Trainer | |||
| from fastNLP.models.base_model import NaiveClassifier | |||
| from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4 | |||
| from fastNLP import TorchLoaderIter | |||
| @@ -575,3 +575,83 @@ class TrainerTestGround(unittest.TestCase): | |||
| ) | |||
| trainer.train() | |||
| """ | |||
| class Fp16TrainerTest(unittest.TestCase): | |||
| def test_raise_error(self): | |||
| data_set = prepare_fake_dataset() | |||
| data_set.set_input("x", flag=True) | |||
| data_set.set_target("y", flag=True) | |||
| train_set, dev_set = data_set.split(0.3) | |||
| model = NaiveClassifier2(2, 1) | |||
| with self.assertRaises(RuntimeError): | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True) | |||
| with self.assertRaises(RuntimeError): | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device='cpu') | |||
| with self.assertRaises(RuntimeError): | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=torch.device('cpu')) | |||
@unittest.skipIf(torch.cuda.is_available()==False, "Skip when no cuda device is detected")
| def test_run_fp16(self): | |||
| data_set = prepare_fake_dataset() | |||
| data_set.set_input("x", flag=True) | |||
| data_set.set_target("y", flag=True) | |||
| train_set, dev_set = data_set.split(0.3) | |||
| model = NaiveClassifier2(2, 1) | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=0) | |||
| trainer.train(load_best_model=False) | |||
| model = NaiveClassifier2(2, 1) | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False) | |||
| trainer.train(load_best_model=False) | |||
@unittest.skipIf(torch.cuda.device_count()<2, "Skip when fewer than 2 gpus are available.")
| def test_run_data_parallel(self): | |||
| data_set = prepare_fake_dataset() | |||
| data_set.set_input("x", flag=True) | |||
| data_set.set_target("y", flag=True) | |||
| train_set, dev_set = data_set.split(0.3) | |||
| model = NaiveClassifier2(2, 1) | |||
| with self.assertRaises(RuntimeError): | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1]) | |||
| with self.assertRaises(RuntimeError): | |||
| model = NaiveClassifier3(2, 1) | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) | |||
| model = NaiveClassifier4(2, 1) | |||
| trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, | |||
| use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True) | |||
| trainer.train(load_best_model=False) | |||