diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 1a31e92a..76c14005 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -122,6 +122,14 @@ class BatchIter: @staticmethod def get_num_batches(num_samples, batch_size, drop_last): + """ + 计算batch的数量。 + + :param int num_samples: + :param int batch_size: + :param bool drop_last: 如果最后一个batch没有batch_size这么多,是否就丢掉。 + :return: + """ num_batches = num_samples // batch_size if not drop_last and (num_samples % batch_size > 0): num_batches += 1 @@ -134,6 +142,11 @@ class BatchIter: yield batch_x, batch_y def get_batch_indices(self): + """ + 获取当前已经输出的batch的index。 + + :return: + """ return self.cur_batch_indices def __len__(self): @@ -193,6 +206,10 @@ class DataSetIter(BatchIter): class TorchLoaderIter(BatchIter): + """ + 与DataSetIter类似,但用于pytorch的DataSet对象。通过使用TorchLoaderIter封装pytorch的DataSet,然后将其传入到Trainer中。 + + """ def __init__(self, dataset): super().__init__() assert isinstance(dataset, torch.utils.data.DataLoader) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 520ea733..985431bc 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -590,7 +590,7 @@ class FitlogCallback(Callback): try: eval_result = tester.test() if self.verbose != 0: - self.pbar.write("Evaluation on DataSet {}:".format(key)) + self.pbar.write("FitlogCallback evaluation on {}:".format(key)) self.pbar.write(tester._format_eval_results(eval_result)) fitlog.add_metric(eval_result, name=key, step=self.step, epoch=self.epoch) if better_result: @@ -609,14 +609,16 @@ class FitlogCallback(Callback): class EvaluateCallback(Callback): """ - 该callback用于扩展Trainer训练过程中只能对dev数据进行验证的问题。 + 通过使用该Callback可以使得Trainer在evaluate dev之外还可以evaluate其它数据集,比如测试集。每一次验证dev之前都会先验证EvaluateCallback + 中的数据。 """ def __init__(self, data=None, tester=None): """ - :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用多个Trainer中的metric对数据进行验证。如果需要传入多个 + :param ~fastNLP.DataSet,Dict[~fastNLP.DataSet] data: 传入DataSet对象,会使用Trainer中的metric对数据进行验证。如果需要传入多个 DataSet请通过dict的方式传入。 - :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象,将在on_valid_end时调用。 + :param ~fastNLP.Tester,Dict[~fastNLP.DataSet] tester: Tester对象, 通过使用Tester对象,可以使得验证的metric与Trainer中 + 的metric不一样。 """ super().__init__() self.datasets = {} @@ -659,13 +661,10 @@ class EvaluateCallback(Callback): for key, tester in self.testers.items(): try: eval_result = tester.test() - # self.pbar.write("Evaluation on {}:".format(key)) - self.logger.info("Evaluation on {}:".format(key)) - # self.pbar.write(tester._format_eval_results(eval_result)) + self.logger.info("EvaluateCallback evaluation on {}:".format(key)) self.logger.info(tester._format_eval_results(eval_result)) except Exception: - # self.pbar.write("Exception happens when evaluate on DataSet named `{}`.".format(key)) - self.logger.info("Exception happens when evaluate on DataSet named `{}`.".format(key)) + self.logger.error("Exception happens when evaluate on DataSet named `{}`.".format(key)) class LRScheduler(Callback): @@ -872,15 +871,16 @@ class TensorboardCallback(Callback): class WarmupCallback(Callback): """ - 按一定的周期调节Learning rate的大小。 + learning rate按照一定的速率从0上升到设置的learning rate。 """ def __init__(self, warmup=0.1, schedule='constant'): """ :param int,float warmup: 如果warmup为int,则在该step之前,learning rate根据schedule的策略变化; 如果warmup为float, 如0.1, 则前10%的step是按照schedule策略调整learning rate。 - :param str schedule: 以哪种方式调整。linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后 - warmup的step下降到0; constant前warmup的step上升到指定learning rate,后面的step保持learning rate. + :param str schedule: 以哪种方式调整。 + linear: 前warmup的step上升到指定的learning rate(从Trainer中的optimizer处获取的), 后warmup的step下降到0; + constant前warmup的step上升到指定learning rate,后面的step保持learning rate. """ super().__init__() self.warmup = max(warmup, 0.) @@ -935,15 +935,14 @@ class SaveModelCallback(Callback): def __init__(self, save_dir, top=3, only_param=False, save_on_exception=False): """ - :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型 + :param str save_dir: 将模型存放在哪个目录下,会在该目录下创建以时间戳命名的目录,并存放模型。如果save_dir不存在将自动创建 :param int top: 保存dev表现top多少模型。-1为保存所有模型。 - :param bool only_param: 是否只保存模型d饿权重。 + :param bool only_param: 是否只保存模型的权重。 :param save_on_exception: 发生exception时,是否保存一份发生exception的模型。模型名称为epoch:x_step:x_Exception:{exception_name}. """ super().__init__() - if not os.path.isdir(save_dir): - raise IsADirectoryError("{} is not a directory.".format(save_dir)) + os.makedirs(save_dir, exist_ok=True) self.save_dir = save_dir if top < 0: self.top = sys.maxsize diff --git a/test/core/test_callbacks.py b/test/core/test_callbacks.py index 98dd422d..b36beb06 100644 --- a/test/core/test_callbacks.py +++ b/test/core/test_callbacks.py @@ -2,6 +2,8 @@ import unittest import numpy as np import torch +import os +import shutil from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \ LRFinder, TensorboardCallback @@ -13,7 +15,8 @@ from fastNLP import SGD from fastNLP import Trainer from fastNLP.models.base_model import NaiveClassifier from fastNLP.core.callback import EarlyStopError - +from fastNLP.core.callback import EvaluateCallback, FitlogCallback, SaveModelCallback +from fastNLP.core.callback import WarmupCallback def prepare_env(): def prepare_fake_dataset(): @@ -113,3 +116,54 @@ class TestCallback(unittest.TestCase): check_code_level=2) trainer.train() assert passed_epochs == list(range(1, total_epochs + 1)) + + def test_evaluate_callback(self): + data_set, model = prepare_env() + from fastNLP import Tester + tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) + evaluate_callback = EvaluateCallback(data_set, tester) + + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=evaluate_callback, check_code_level=2) + trainer.train() + + def test_fitlog_callback(self): + import fitlog + os.makedirs('logs/') + fitlog.set_log_dir('logs/') + data_set, model = prepare_env() + from fastNLP import Tester + tester = Tester(data=data_set, model=model, metrics=AccuracyMetric(pred="predict", target="y")) + fitlog_callback = FitlogCallback(data_set, tester) + + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=fitlog_callback, check_code_level=2) + trainer.train() + shutil.rmtree('logs/') + + def test_save_model_callback(self): + data_set, model = prepare_env() + top = 3 + save_model_callback = SaveModelCallback('save_models/', top=top) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=save_model_callback, check_code_level=2) + trainer.train() + + timestamp = os.listdir('save_models')[0] + self.assertEqual(len(os.listdir(os.path.join('save_models', timestamp))), top) + shutil.rmtree('save_models/') + + def test_warmup_callback(self): + data_set, model = prepare_env() + warmup_callback = WarmupCallback() + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=True, + callbacks=warmup_callback, check_code_level=2) + trainer.train() diff --git a/test/core/test_utils.py b/test/core/test_utils.py index 29645fb1..0093c3e8 100644 --- a/test/core/test_utils.py +++ b/test/core/test_utils.py @@ -10,7 +10,8 @@ import torch from torch import nn from fastNLP.core.utils import _move_model_to_device, _get_model_device import numpy as np -from fastNLP.core.utils import seq_len_to_mask +from fastNLP.core.utils import seq_len_to_mask, get_seq_len +from fastNLP.core.utils import iob2, iob2bioes class Model(nn.Module): def __init__(self): @@ -263,4 +264,27 @@ class TestSeqLenToMask(unittest.TestCase): # 3. pad到指定长度 seq_len = torch.randint(1, 10, size=(10, )) mask = seq_len_to_mask(seq_len, 100) - self.assertEqual(100, mask.size(1)) \ No newline at end of file + self.assertEqual(100, mask.size(1)) + + +class TestUtils(unittest.TestCase): + def test_get_seq_len(self): + seq_len = torch.randint(1, 10, size=(10, )) + mask = seq_len_to_mask(seq_len) + new_seq_len = get_seq_len(mask) + self.assertSequenceEqual(seq_len.tolist(), new_seq_len.tolist()) + + def test_iob2(self): + tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + convert_tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + self.assertSequenceEqual(convert_tags, iob2(tags)) + + tags = ['I-NP', 'O', 'I-NP', 'I-VP', 'B-NP', 'I-NP', 'O', 'I-NP', 'I-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + self.assertSequenceEqual(convert_tags, iob2(tags)) + + def test_iob2bioes(self): + tags = ['B-NP', 'O', 'B-NP', 'B-VP', 'B-NP', 'I-NP', 'O', 'B-NP', 'B-PP', 'B-NP', 'I-NP', 'O', 'B-NP', 'I-NP', 'B-NP', 'O', 'B-NP', 'I-NP', 'I-NP'] + convert_tags = ['S-NP', 'O', 'S-NP', 'S-VP', 'B-NP', 'E-NP', 'O', 'S-NP', 'S-PP', 'B-NP', 'E-NP', 'O', 'B-NP', 'E-NP', 'S-NP', 'O', 'B-NP', 'I-NP', 'E-NP'] + + self.assertSequenceEqual(convert_tags, iob2bioes(tags)) + diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py index 71511458..2a8550c3 100644 --- a/test/embeddings/test_bert_embedding.py +++ b/test/embeddings/test_bert_embedding.py @@ -1,6 +1,6 @@ import unittest from fastNLP import Vocabulary -from fastNLP.embeddings import BertEmbedding +from fastNLP.embeddings import BertEmbedding, BertWordPieceEncoder import torch import os @@ -37,3 +37,12 @@ class TestBertEmbedding(unittest.TestCase): words = torch.LongTensor([[2, 3, 4, 0]]) result = embed(words) self.assertEqual(result.size(), (1, 4, 16)) + + +class TestBertWordPieceEncoder(unittest.TestCase): + def test_bert_word_piece_encoder(self): + embed = BertWordPieceEncoder(model_dir_or_name='test/data_for_tests/embedding/small_bert', word_dropout=0.1) + from fastNLP import DataSet + ds = DataSet({'words': ["this is a test . [SEP]".split()]}) + embed.index_datasets(ds, field_name='words') + self.assertTrue(ds.has_field('word_pieces')) diff --git a/test/modules/test_utils.py b/test/modules/test_utils.py new file mode 100644 index 00000000..73226f97 --- /dev/null +++ b/test/modules/test_utils.py @@ -0,0 +1,9 @@ +import unittest +import torch +from fastNLP.modules.utils import get_dropout_mask + +class TestUtil(unittest.TestCase): + def test_get_dropout_mask(self): + tensor = torch.randn(3, 4) + mask = get_dropout_mask(0.3, tensor) + self.assertSequenceEqual(mask.size(), torch.Size([3, 4])) \ No newline at end of file