@@ -201,19 +201,6 @@ class TorchLoaderIter(BatchIter):
        self.batch_size = dataset.batch_size


class OnlineDataGettter:
    # TODO
    pass


class OnlineDataIter(BatchIter):
    # TODO
    def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False,
                 num_workers=0, pin_memory=False, drop_last=False,
                 timeout=0, worker_init_fn=None, **kwargs):
        super().__init__()


def _to_tensor(batch, field_dtype):
    """
@@ -1,6 +1,5 @@
"""
.. todo::
    doc
r"""
Field names used throughout the fastNLP package follow a fixed naming convention, which is defined by the fastNLP.Const class.
"""
__all__ = [
@@ -50,11 +49,13 @@ class Const:
    @staticmethod
    def RAW_WORDS(i):
        """Get the name of the i-th ``RAW_WORDS`` field."""
        i = int(i) + 1
        return Const.RAW_WORD + str(i)

    @staticmethod
    def RAW_CHARS(i):
        """Get the name of the i-th ``RAW_CHARS`` field."""
        i = int(i) + 1
        return Const.RAW_CHAR + str(i)
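For illustration only (not part of the diff), a quick sketch of what the 1-based naming above yields, assuming ``Const.RAW_WORD == 'raw_words'`` and ``Const.RAW_CHAR == 'raw_chars'`` as defined earlier in this class:

```python
from fastNLP import Const

# With `i = int(i) + 1`, indices are mapped to 1-based name suffixes.
print(Const.RAW_WORDS(0))  # 'raw_words1' (assuming Const.RAW_WORD == 'raw_words')
print(Const.RAW_CHARS(1))  # 'raw_chars2' (assuming Const.RAW_CHAR == 'raw_chars')
```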
@@ -352,82 +352,3 @@ def _prepare_losser(losser):
        return losser
    else:
        raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}")


def squash(predict, truth, **kwargs):
    """To reshape tensors in order to fit loss functions in PyTorch.

    :param predict: Tensor, model output
    :param truth: Tensor, truth from dataset
    :param kwargs: extra arguments
    :return predict, truth: predict & truth after processing
    """
    return predict.view(-1, predict.size()[-1]), truth.view(-1, )


def unpad(predict, truth, **kwargs):
    """To process padded sequence output to get the true loss.

    :param predict: Tensor, [batch_size, max_len, tag_size]
    :param truth: Tensor, [batch_size, max_len]
    :param kwargs: kwargs["lens"] is a list or LongTensor of size [batch_size]; the i-th element is the true length of the i-th sequence.
    :return predict, truth: predict & truth after processing
    """
    if kwargs.get("lens") is None:
        return predict, truth
    lens = torch.LongTensor(kwargs["lens"])
    lens, idx = torch.sort(lens, descending=True)
    predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data
    truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data
    return predict, truth


def unpad_mask(predict, truth, **kwargs):
    """To process padded sequence output to get the true loss.

    :param predict: Tensor, [batch_size, max_len, tag_size]
    :param truth: Tensor, [batch_size, max_len]
    :param kwargs: kwargs["lens"] is a list or LongTensor of size [batch_size]; the i-th element is the true length of the i-th sequence.
    :return predict, truth: predict & truth after processing
    """
    if kwargs.get("lens") is None:
        return predict, truth
    mas = make_mask(kwargs["lens"], truth.size()[1])
    return mask(predict, truth, mask=mas)


def mask(predict, truth, **kwargs):
    """To select specific elements from a Tensor. This method calls ``squash()``.

    :param predict: Tensor, [batch_size, max_len, tag_size]
    :param truth: Tensor, [batch_size, max_len]
    :param kwargs: extra arguments; kwargs["mask"]: ByteTensor, [batch_size, max_len], the mask Tensor. Positions that are 1 will be selected.
    :return predict, truth: predict & truth after processing
    """
    if kwargs.get("mask") is None:
        return predict, truth
    mask = kwargs["mask"]
    predict, truth = squash(predict, truth)
    mask = mask.view(-1, )
    predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0)
    truth = torch.masked_select(truth, mask)
    return predict, truth


def make_mask(lens, tar_len):
    """To generate a mask over a sequence.

    :param lens: list or LongTensor, [batch_size]
    :param tar_len: int
    :return mask: ByteTensor
    """
    lens = torch.LongTensor(lens)
    mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
    mask = torch.stack(mask, 1)
    return mask
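The helpers removed above reshaped model output and targets so that standard PyTorch losses could be applied to (possibly padded) sequence predictions. A minimal sketch of the two common cases, illustrative only and not part of the diff; shapes and variable names are assumptions:

```python
import torch
import torch.nn.functional as F

batch_size, max_len, tag_size = 2, 4, 5
predict = torch.randn(batch_size, max_len, tag_size)        # model output
truth = torch.randint(0, tag_size, (batch_size, max_len))   # gold tags
lens = [4, 2]                                               # true sequence lengths

# squash(): flatten to [batch_size * max_len, tag_size] and [batch_size * max_len]
flat_predict = predict.view(-1, predict.size(-1))
flat_truth = truth.view(-1)
loss_all = F.cross_entropy(flat_predict, flat_truth)

# unpad(): drop padded positions via pack_padded_sequence before computing the loss
lens_t, idx = torch.sort(torch.LongTensor(lens), descending=True)
packed_predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens_t, batch_first=True).data
packed_truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens_t, batch_first=True).data
loss_unpadded = F.cross_entropy(packed_predict, packed_truth)
```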
@@ -33,8 +33,9 @@ class Optimizer(object):
    def construct_from_pytorch(self, model_params):
        raise NotImplementedError

    def _get_require_grads_param(self, params):
    @staticmethod
    def _get_require_grads_param(params):
        """
        Drop the parameters in ``params`` that do not require gradients.
@@ -43,6 +44,7 @@ class Optimizer(object):
        """
        return [param for param in params if param.requires_grad]
class NullOptimizer(Optimizer):
    """
    Pass this optimizer when you do not want the Trainer to perform optimizer updates, but make sure the parameters are updated through a callback instead.
@@ -113,7 +115,8 @@ class Adam(Optimizer):
class AdamW(TorchOptimizer):
    r"""
    An implementation of AdamW that was expected to appear in a later PyTorch release, https://github.com/pytorch/pytorch/pull/21250; added here in advance.
    An implementation of AdamW, which has been available since PyTorch 1.2.0, https://github.com/pytorch/pytorch/pull/21250.
    It is kept here to support older PyTorch versions.

    .. todo::
        Translate into Chinese
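As the updated docstring says, this class back-ports ``torch.optim.AdamW`` (merged upstream in PyTorch 1.2.0) so that older PyTorch installations still work. A minimal usage sketch, consistent with how the optimizer is constructed later in this diff:

```python
import torch
from fastNLP import AdamW  # bundled implementation, usable on PyTorch < 1.2.0 as well

model = torch.nn.Linear(10, 3)
optimizer = AdamW(params=model.parameters(), lr=2e-3, weight_decay=0.01)

# one illustrative update step
loss = model(torch.randn(4, 10)).sum()
loss.backward()
optimizer.step()
optimizer.zero_grad()
```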
@@ -51,7 +51,7 @@ class MatchingBertPipe(Pipe):
        super().__init__()
        self.lower = bool(lower)
        self.tokenizer = get_tokenizer(tokenizer=tokenizer)
        self.tokenizer = get_tokenizer(tokenize_method=tokenizer)

    def _tokenize(self, data_bundle, field_names, new_field_names):
        """
@@ -191,7 +191,7 @@ class MatchingPipe(Pipe):
        super().__init__()
        self.lower = bool(lower)
        self.tokenizer = get_tokenizer(tokenizer=tokenizer)
        self.tokenizer = get_tokenizer(tokenize_method=tokenizer)

    def _tokenize(self, data_bundle, field_names, new_field_names):
        """
@@ -65,27 +65,36 @@ def iob2bioes(tags: List[str]) -> List[str]:
    return new_tags


def get_tokenizer(tokenizer: str, lang='en'):
def get_tokenizer(tokenize_method: str, lang='en'):
    """
    :param str tokenizer: the tokenization method to use
    :param str tokenize_method: the tokenization method to use
    :param str lang: language; currently only ``en`` is supported
    :return: the tokenize function
    """
    if tokenizer == 'spacy':
    tokenizer_dict = {
        'spacy': None,
        'raw': _raw_split,
        'cn-char': _cn_char_split,
    }
    if tokenize_method == 'spacy':
        import spacy
        spacy.prefer_gpu()
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right now.")
        en = spacy.load(lang)
        tokenizer = lambda x: [w.text for w in en.tokenizer(x)]
    elif tokenizer == 'raw':
        tokenizer = _raw_split
    elif tokenize_method in tokenizer_dict:
        tokenizer = tokenizer_dict[tokenize_method]
    else:
        raise RuntimeError("Only support `spacy`, `raw` tokenizer.")
        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer


def _cn_char_split(sent):
    return [chars for chars in sent]


def _raw_split(sent):
    return sent.split()
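A short usage sketch of the refactored function, including the newly supported ``cn-char`` option (illustrative only; the import path is an assumption based on the module this hunk belongs to):

```python
from fastNLP.io.pipe.utils import get_tokenizer  # assumed module path

tokenize = get_tokenizer('raw')
print(tokenize('fastNLP is easy to use'))  # ['fastNLP', 'is', 'easy', 'to', 'use']

cn_tokenize = get_tokenizer('cn-char')
print(cn_tokenize('自然语言处理'))  # ['自', '然', '语', '言', '处', '理']
```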
@@ -8,8 +8,7 @@ from fastNLP.core.optimizer import AdamW
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, MNLIBertPipe, \
    QNLIBertPipe, QuoraBertPipe
from reproduction.matching.model.bert import BertForNLI
from fastNLP.models.bert import BertForSentenceMatching

# define hyper-parameters
@@ -65,7 +64,7 @@ print(data_bundle)  # print details in data_bundle
embed = BertEmbedding(data_bundle.vocabs[Const.INPUT], model_dir_or_name=arg.bert_model_dir_or_name)

# define model
model = BertForNLI(embed, class_num=len(data_bundle.vocabs[Const.TARGET]))
model = BertForSentenceMatching(embed, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = AdamW(lr=arg.lr, params=model.parameters())
@@ -76,11 +75,11 @@ if arg.task in ['snli']:
    # evaluate test set in every epoch if task is snli.

# define trainer
trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name], model=model,
trainer = Trainer(train_data=data_bundle.get_dataset(arg.train_dataset_name), model=model,
                  optimizer=optimizer,
                  batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
                  n_epochs=arg.n_epochs, print_every=-1,
                  dev_data=data_bundle.datasets[arg.dev_dataset_name],
                  dev_data=data_bundle.get_dataset(arg.dev_dataset_name),
                  metrics=AccuracyMetric(), metric_key='acc',
                  device=[i for i in range(torch.cuda.device_count())],
                  check_code_level=-1,
@@ -92,7 +91,7 @@ trainer.train(load_best_model=True)

# define tester
tester = Tester(
    data=data_bundle.datasets[arg.test_dataset_name],
    data=data_bundle.get_dataset(arg.test_dataset_name),
    model=model,
    metrics=AccuracyMetric(),
    batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
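The remaining edits in this script replace direct indexing of ``data_bundle.datasets`` with the ``get_dataset`` accessor. A small equivalence sketch, illustrative only; the constructor call is an assumption about the DataBundle API:

```python
from fastNLP import DataSet
from fastNLP.io import DataBundle

ds = DataSet({'words': [['a', 'b'], ['c']], 'target': [0, 1]})
data_bundle = DataBundle(datasets={'train': ds})  # assumed constructor signature

# old style: index the internal dict directly
train_old = data_bundle.datasets['train']
# new style used in the diff: go through the accessor
train_new = data_bundle.get_dataset('train')
assert train_old is train_new
```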
@@ -4,7 +4,6 @@ import torch
import torch.nn.functional as F

import fastNLP as loss
from fastNLP.core.losses import squash, unpad


class TestLoss(unittest.TestCase):
@@ -73,15 +72,3 @@ class TestLosserError(unittest.TestCase):
        with self.assertRaises(Exception):
            ans = l1({"my_predict": a}, {"truth": b, "my": a})


class TestLossUtils(unittest.TestCase):
    def test_squash(self):
        a, b = squash(torch.randn(3, 5), torch.randn(3, 5))
        self.assertEqual(tuple(a.size()), (3, 5))
        self.assertEqual(tuple(b.size()), (15,))

    def test_unpad(self):
        a, b = unpad(torch.randn(5, 8, 3), torch.randn(5, 8))
        self.assertEqual(tuple(a.size()), (5, 8, 3))
        self.assertEqual(tuple(b.size()), (5, 8))
@@ -2,7 +2,7 @@ import unittest
import torch

from fastNLP import SGD, Adam
from fastNLP import SGD, Adam, AdamW


class TestOptim(unittest.TestCase):
@@ -52,3 +52,12 @@ class TestOptim(unittest.TestCase):
        self.assertEqual(optim.__dict__["settings"]["lr"], 0.001)
        res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters())
        self.assertTrue(isinstance(res, torch.optim.Adam))

    def test_AdamW(self):
        optim = AdamW(params=torch.nn.Linear(10, 3).parameters())
        self.assertTrue('lr' in optim.defaults)
        self.assertTrue('weight_decay' in optim.defaults)
        optim = AdamW(params=torch.nn.Linear(10, 3).parameters(), lr=0.002, weight_decay=0.989)
        self.assertEqual(optim.defaults['lr'], 0.002)
        self.assertEqual(optim.defaults['weight_decay'], 0.989)
@@ -1,3 +0,0 @@
index sentence1 sentence2 label
0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. not_entailment
1 Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Bacteria is winning the war against antibiotics. entailment
@@ -1,3 +0,0 @@
index sentence1 sentence2
0 Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case. Shukla is related to Mangla.
1 Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia. Authorities in Brazil hold 200 people as hostage.
@@ -1,4 +0,0 @@
index sentence1 sentence2 label
0 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq. not_entailment
1 A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI. Pope Benedict XVI is the new leader of the Roman Catholic Church. entailment
2 Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients. Herceptin can be used to treat breast cancer. entailment