| @@ -122,6 +122,14 @@ class BatchIter: | |||
| @staticmethod | |||
| def get_num_batches(num_samples, batch_size, drop_last): | |||
| """ | |||
| 计算batch的数量。 | |||
| :param int num_samples: | |||
| :param int batch_size: | |||
| :param bool drop_last: 如果最后一个batch没有batch_size这么多,是否就丢掉。 | |||
| :return: | |||
| """ | |||
| num_batches = num_samples // batch_size | |||
| if not drop_last and (num_samples % batch_size > 0): | |||
| num_batches += 1 | |||
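The counting logic above is worth a quick sanity check; a minimal illustration, assuming the staticmethod is called exactly as shown:

```python
# Sanity check of get_num_batches: 10 samples with batch_size=3 give
# 3 full batches plus one partial batch of size 1.
assert BatchIter.get_num_batches(10, 3, drop_last=True) == 3   # partial batch dropped
assert BatchIter.get_num_batches(10, 3, drop_last=False) == 4  # partial batch kept
```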
| @@ -134,6 +142,11 @@ class BatchIter: | |||
| yield batch_x, batch_y | |||
| def get_batch_indices(self): | |||
| """ | |||
| 获取当前已经输出的batch的index。 | |||
| :return: | |||
| """ | |||
| return self.cur_batch_indices | |||
| def __len__(self): | |||
| @@ -193,6 +206,10 @@ class DataSetIter(BatchIter): | |||
| class TorchLoaderIter(BatchIter): | |||
| """ | |||
| 与DataSetIter类似,但用于pytorch的DataSet对象。通过使用TorchLoaderIter封装pytorch的DataSet,然后将其传入到Trainer中。 | |||
| """ | |||
| def __init__(self, dataset): | |||
| super().__init__() | |||
| assert isinstance(dataset, torch.utils.data.DataLoader) | |||
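A minimal usage sketch for the wrapper above; note that, per the assert, the object passed in must be a torch.utils.data.DataLoader, and the dataset below is illustrative:

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Illustrative only: wrap a pytorch DataLoader so fastNLP's Trainer can iterate it.
tensor_ds = TensorDataset(torch.randn(8, 4), torch.randint(0, 2, (8,)))
loader = DataLoader(tensor_ds, batch_size=2)
batch_iter = TorchLoaderIter(loader)
```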
| @@ -590,7 +590,7 @@ class FitlogCallback(Callback): | |||
| try: | |||
| eval_result = tester.test() | |||
| if self.verbose != 0: | |||
| self.pbar.write("Evaluation on DataSet {}:".format(key)) | |||
| self.pbar.write("FitlogCallback evaluation on {}:".format(key)) | |||
| self.pbar.write(tester._format_eval_results(eval_result)) | |||
| fitlog.add_metric(eval_result, name=key, step=self.step, epoch=self.epoch) | |||
| if better_result: | |||
| @@ -609,14 +609,16 @@ class FitlogCallback(Callback): | |||
| class EvaluateCallback(Callback): | |||
| """ | |||
| 该callback用于扩展Trainer训练过程中只能对dev数据进行验证的问题。 | |||
| 通过使用该Callback可以使得Trainer在evaluate dev之外还可以evaluate其它数据集,比如测试集。每一次验证dev之前都会先验证EvaluateCallback | |||
| 中的数据。 | |||
| """ | |||
| def __init__(self, data=None, tester=None): | |||
| """ | |||
| :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 | |||
| :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用Trainer中的metric对数据进行验证。如果需要传入多个 | |||
| DataSet请通过dict的方式传入。 | |||
| :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象,将在on_valid_end时调用。 | |||
| :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象, 通过使用Tester对象,可以使得验证的metric与Trainer中 | |||
| 的metric不一样。 | |||
| """ | |||
| super().__init__() | |||
| self.datasets = {} | |||
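A minimal wiring sketch for the two configurations the docstring describes; test_data, model, and my_metric are placeholders:

```python
from fastNLP import Tester
from fastNLP.core.callback import EvaluateCallback

# Evaluate an extra dataset with the Trainer's own metric ...
cb = EvaluateCallback(data={'test': test_data})
# ... or with a dedicated Tester that may carry a different metric.
cb = EvaluateCallback(tester={'test': Tester(data=test_data, model=model, metrics=my_metric)})
```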
| @@ -659,13 +661,10 @@ class EvaluateCallback(Callback): | |||
| for key, tester in self.testers.items(): | |||
| try: | |||
| eval_result = tester.test() | |||
| # self.pbar.write("Evaluation on {}:".format(key)) | |||
| self.logger.info("Evaluation on {}:".format(key)) | |||
| # self.pbar.write(tester._format_eval_results(eval_result)) | |||
| self.logger.info("EvaluateCallback evaluation on {}:".format(key)) | |||
| self.logger.info(tester._format_eval_results(eval_result)) | |||
| except Exception: | |||
| # self.pbar.write("Exception happens when evaluate on DataSet named `{}`.".format(key)) | |||
| self.logger.info("Exception happens when evaluate on DataSet named `{}`.".format(key)) | |||
| self.logger.error("Exception happens when evaluate on DataSet named `{}`.".format(key)) | |||
| class LRScheduler(Callback): | |||
| @@ -872,15 +871,16 @@ class TensorboardCallback(Callback): | |||
| class WarmupCallback(Callback): | |||
| """ | |||
| 按一定的周期调节Learning rate的大小。 | |||
| learning rate按照一定的速率从0上升到设置的learning rate。 | |||
| """ | |||
| def __init__(self, warmup=0.1, schedule='constant'): | |||
| """ | |||
| :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, | |||
| 如0.1, 则前10%的step是按照schedule策略调整learning rate。 | |||
| :param str schedule: 以哪种方式调整。linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后 | |||
| warmup的step下降到0; constant前warmup的step上升到指定learning rate,后面的step保持learning rate. | |||
| :param str schedule: 以哪种方式调整。 | |||
| linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后warmup的step下降到0; | |||
| constant前warmup的step上升到指定learning rate,后面的step保持learning rate. | |||
| """ | |||
| super().__init__() | |||
| self.warmup = max(warmup, 0.) | |||
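The two schedules amount to simple multiplicative factors on the base learning rate; a sketch of the math as described in the docstring, not the library's actual code:

```python
def warmup_factor(progress, warmup, schedule='constant'):
    """Illustrative only. progress and warmup are fractions of total steps in [0, 1]."""
    if progress < warmup:
        return progress / warmup          # rise from 0 to the base learning rate
    if schedule == 'constant':
        return 1.0                        # hold the base learning rate afterwards
    # 'linear': decay from the base learning rate back to 0 over the remaining steps
    return max((1.0 - progress) / (1.0 - warmup), 0.0)
```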
| @@ -935,15 +935,14 @@ class SaveModelCallback(Callback): | |||
| def __init__(self, save_dir, top=3, only_param=False, save_on_exception=False): | |||
| """ | |||
| :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 | |||
| :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型。如果save_dir不存在将自动创建 | |||
| :param int top: 保存dev表现top多少模型。-1为保存所有模型。 | |||
| :param bool only_param: 是否只保存模型d饿权重。 | |||
| :param bool only_param: 是否只保存模型的权重。 | |||
| :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. | |||
| """ | |||
| super().__init__() | |||
| if not os.path.isdir(save_dir): | |||
| raise IsADirectoryError("{} is not a directory.".format(save_dir)) | |||
| os.makedirs(save_dir, exist_ok=True) | |||
| self.save_dir = save_dir | |||
| if top < 0: | |||
| self.top = sys.maxsize | |||
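A short usage sketch of the changed constructor behavior; the path is illustrative:

```python
# 'checkpoints/' no longer needs to exist beforehand: the constructor now calls
# os.makedirs(save_dir, exist_ok=True) instead of raising on a missing directory.
callback = SaveModelCallback('checkpoints/', top=3)
```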
@@ -2,6 +2,8 @@ import unittest

 import numpy as np
 import torch
+import os
+import shutil

 from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \
     LRFinder, TensorboardCallback
@@ -13,7 +15,8 @@ from fastNLP import SGD
 from fastNLP import Trainer
 from fastNLP.models.base_model import NaiveClassifier
 from fastNLP.core.callback import EarlyStopError
+from fastNLP.core.callback import EvaluateCallback, FitlogCallback, SaveModelCallback
 from fastNLP.core.callback import WarmupCallback


 def prepare_env():
     def prepare_fake_dataset():
| @@ -113,3 +116,54 @@ class TestCallback(unittest.TestCase): | |||
| check_code_level=2) | |||
| trainer.train() | |||
| assert passed_epochs == list(range(1, total_epochs + 1)) | |||
| def test_evaluate_callback(self): | |||
| data_set, model = prepare_env() | |||
| from fastNLP import Tester | |||
| tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) | |||
| evaluate_callback = EvaluateCallback(data_set, tester) | |||
| trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, | |||
| callbacks=evaluate_callback, check_code_level=2) | |||
| trainer.train() | |||
| def test_fitlog_callback(self): | |||
| import fitlog | |||
| os.makedirs('logs/') | |||
| fitlog.set_log_dir('logs/') | |||
| data_set, model = prepare_env() | |||
| from fastNLP import Tester | |||
| tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) | |||
| fitlog_callback = FitlogCallback(data_set, tester) | |||
| trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, | |||
| callbacks=fitlog_callback, check_code_level=2) | |||
| trainer.train() | |||
| shutil.rmtree('logs/') | |||
| def test_save_model_callback(self): | |||
| data_set, model = prepare_env() | |||
| top = 3 | |||
| save_model_callback = SaveModelCallback('save_models/', top=top) | |||
| trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, | |||
| callbacks=save_model_callback, check_code_level=2) | |||
| trainer.train() | |||
| timestamp = os.listdir('save_models')[0] | |||
| self.assertEqual(len(os.listdir(os.path.join('save_models', timestamp))), top) | |||
| shutil.rmtree('save_models/') | |||
| def test_warmup_callback(self): | |||
| data_set, model = prepare_env() | |||
| warmup_callback = WarmupCallback() | |||
| trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), | |||
| batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, | |||
| metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, | |||
| callbacks=warmup_callback, check_code_level=2) | |||
| trainer.train() | |||
@@ -10,7 +10,8 @@ import torch
 from torch import nn

 from fastNLP.core.utils import _move_model_to_device, _get_model_device
 import numpy as np
-from fastNLP.core.utils import seq_len_to_mask
+from fastNLP.core.utils import seq_len_to_mask, get_seq_len
+from fastNLP.core.utils import iob2, iob2bioes


 class Model(nn.Module):
     def __init__(self):
@@ -263,4 +264,27 @@ class TestSeqLenToMask(unittest.TestCase):

         # 3. pad to the specified length
         seq_len = torch.randint(1, 10, size=(10, ))
         mask = seq_len_to_mask(seq_len, 100)
-        self.assertEqual(100, mask.size(1))
+        self.assertEqual(100, mask.size(1))
+
+
+class TestUtils(unittest.TestCase):
+    def test_get_seq_len(self):
+        seq_len = torch.randint(1, 10, size=(10, ))
+        mask = seq_len_to_mask(seq_len)
+        new_seq_len = get_seq_len(mask)
+        self.assertSequenceEqual(seq_len.tolist(), new_seq_len.tolist())
+
+    def test_iob2(self):
+        tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP']
+        convert_tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP']
+        self.assertSequenceEqual(convert_tags, iob2(tags))
+
+        tags = ['I-NP', 'O', 'I-NP', 'I-VP', 'B-NP', 'I-NP', 'O', 'I-NP', 'I-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP']
+        self.assertSequenceEqual(convert_tags, iob2(tags))
+
+    def test_iob2bioes(self):
+        tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP']
+        convert_tags = ['S-NP', 'O', 'S-NP', 'S-VP', 'B-NP', 'E-NP', 'O', 'S-NP', 'S-PP', 'B-NP', 'E-NP', 'O', 'B-NP', 'E-NP', 'S-NP', 'O', 'B-NP', 'I-NP', 'E-NP']
+        self.assertSequenceEqual(convert_tags, iob2bioes(tags))
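For readers unfamiliar with the schemes these tests exercise: iob2 repairs IOB1 tags (an I- tag that opens a span becomes B-), and iob2bioes further marks span ends with E- and single-token spans with S-. A sketch of the BIOES rule, not the library's implementation:

```python
def to_bioes(tags):
    """Illustrative IOB2 -> BIOES conversion, mirroring what test_iob2bioes expects."""
    out = []
    for i, tag in enumerate(tags):
        if tag == 'O':
            out.append(tag)
            continue
        prefix, label = tag.split('-', 1)
        nxt = tags[i + 1] if i + 1 < len(tags) else 'O'
        continues = nxt.startswith('I-')  # does the span extend to the next token?
        if prefix == 'B':
            out.append(('B-' if continues else 'S-') + label)
        else:  # 'I'
            out.append(('I-' if continues else 'E-') + label)
    return out
```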
| @@ -1,6 +1,6 @@ | |||
| import unittest | |||
| from fastNLP import Vocabulary | |||
| from fastNLP.embeddings import BertEmbedding | |||
| from fastNLP.embeddings import BertEmbedding, BertWordPieceEncoder | |||
| import torch | |||
| import os | |||
| @@ -37,3 +37,12 @@ class TestBertEmbedding(unittest.TestCase): | |||
| words = torch.LongTensor([[2, 3, 4, 0]]) | |||
| result = embed(words) | |||
| self.assertEqual(result.size(), (1, 4, 16)) | |||
| class TestBertWordPieceEncoder(unittest.TestCase): | |||
| def test_bert_word_piece_encoder(self): | |||
| embed = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1) | |||
| from fastNLP import DataSet | |||
| ds = DataSet({'words': ["this is a test . [SEP]".split()]}) | |||
| embed.index_datasets(ds, field_name='words') | |||
| self.assertTrue(ds.has_field('word_pieces')) | |||
| @@ -0,0 +1,9 @@ | |||
| import unittest | |||
| import torch | |||
| from fastNLP.modules.utils import get_dropout_mask | |||
| class TestUtil(unittest.TestCase): | |||
| def test_get_dropout_mask(self): | |||
| tensor = torch.randn(3, 4) | |||
| mask = get_dropout_mask(0.3, tensor) | |||
| self.assertSequenceEqual(mask.size(), torch.Size([3, 4])) | |||
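A sketch of what a helper with this signature typically computes (the real fastNLP implementation may differ):

```python
import torch

def dropout_mask_sketch(drop_p, tensor):
    """Illustrative only: a Bernoulli keep-mask shaped like `tensor`, with kept
    positions scaled by 1/(1-drop_p) so expected activations are unchanged."""
    keep_p = 1.0 - drop_p
    return tensor.new_empty(tensor.size()).bernoulli_(keep_p).div_(keep_p)
```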