
Merge branch 'dev0.5.0' of https://github.com/fastnlp/fastNLP into dev0.5.0

tags/v0.4.10
yh committed 5 years ago · commit fdf2a02a5f
12 changed files with 42 additions and 136 deletions

  1. fastNLP/core/batch.py                   +0   -13
  2. fastNLP/core/const.py                   +4   -3
  3. fastNLP/core/losses.py                  +0   -79
  4. fastNLP/core/optimizer.py               +6   -3
  5. fastNLP/io/pipe/matching.py             +2   -2
  6. fastNLP/io/pipe/utils.py                +15  -6
  7. reproduction/matching/matching_bert.py  +5   -6
  8. test/core/test_loss.py                  +0   -13
  9. test/core/test_optimizer.py             +10  -1
  10. test/data_for_tests/io/rte/dev.tsv     +0   -3
  11. test/data_for_tests/io/rte/test.tsv    +0   -3
  12. test/data_for_tests/io/rte/train.tsv   +0   -4

fastNLP/core/batch.py   +0  -13

@@ -201,19 +201,6 @@ class TorchLoaderIter(BatchIter):
        self.batch_size = dataset.batch_size


-class OnlineDataGettter:
-    # TODO
-    pass
-
-
-class OnlineDataIter(BatchIter):
-    # TODO
-    def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False,
-                 num_workers=0, pin_memory=False, drop_last=False,
-                 timeout=0, worker_init_fn=None, **kwargs):
-        super().__init__()
-
-
def _to_tensor(batch, field_dtype):
    """



fastNLP/core/const.py   +4  -3

@@ -1,6 +1,5 @@
"""
.. todo::
doc
r"""
fastNLP包当中的field命名均符合一定的规范,该规范由fastNLP.Const类进行定义。
"""

__all__ = [
@@ -50,11 +49,13 @@ class Const:
    @staticmethod
    def RAW_WORDS(i):
        """Get the name of the i-th ``RAW_WORDS`` field."""
+        i = int(i) + 1
        return Const.RAW_WORD + str(i)

    @staticmethod
    def RAW_CHARS(i):
        """Get the name of the i-th ``RAW_CHARS`` field."""
+        i = int(i) + 1
        return Const.RAW_CHAR + str(i)
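The two added `i = int(i) + 1` lines shift the index before it is appended, so the generated field names are now numbered from 1. A minimal sketch of the new behaviour, assuming `Const.RAW_WORD` and `Const.RAW_CHAR` are the string prefixes 'raw_words' and 'raw_chars' (their actual values are not shown in this diff):

    from fastNLP.core.const import Const

    # After this change the index is incremented before being appended,
    # so the numbering of generated field names starts at 1 instead of 0.
    # The exact prefixes are assumptions; only the +1 shift appears above.
    print(Const.RAW_WORDS(0))  # e.g. 'raw_words1' rather than 'raw_words0'
    print(Const.RAW_CHARS(1))  # e.g. 'raw_chars2' rather than 'raw_chars1'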


fastNLP/core/losses.py   +0  -79

@@ -352,82 +352,3 @@ def _prepare_losser(losser):
        return losser
    else:
        raise TypeError(f"Type of loss should be `fastNLP.LossBase`, got {type(losser)}")
-
-
-def squash(predict, truth, **kwargs):
-    """To reshape tensors in order to fit loss functions in PyTorch.
-
-    :param predict: Tensor, model output
-    :param truth: Tensor, truth from dataset
-    :param kwargs: extra arguments
-    :return predict , truth: predict & truth after processing
-    """
-    return predict.view(-1, predict.size()[-1]), truth.view(-1, )
-
-
-def unpad(predict, truth, **kwargs):
-    """To process padded sequence output to get true loss.
-
-    :param predict: Tensor, [batch_size , max_len , tag_size]
-    :param truth: Tensor, [batch_size , max_len]
-    :param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence.
-
-    :return predict , truth: predict & truth after processing
-    """
-    if kwargs.get("lens") is None:
-        return predict, truth
-    lens = torch.LongTensor(kwargs["lens"])
-    lens, idx = torch.sort(lens, descending=True)
-    predict = torch.nn.utils.rnn.pack_padded_sequence(predict[idx], lens, batch_first=True).data
-    truth = torch.nn.utils.rnn.pack_padded_sequence(truth[idx], lens, batch_first=True).data
-    return predict, truth
-
-
-def unpad_mask(predict, truth, **kwargs):
-    """To process padded sequence output to get true loss.
-
-    :param predict: Tensor, [batch_size , max_len , tag_size]
-    :param truth: Tensor, [batch_size , max_len]
-    :param kwargs: kwargs["lens"] is a list or LongTensor, with size [batch_size]. The i-th element is true lengths of i-th sequence.
-
-    :return predict , truth: predict & truth after processing
-    """
-    if kwargs.get("lens") is None:
-        return predict, truth
-    mas = make_mask(kwargs["lens"], truth.size()[1])
-    return mask(predict, truth, mask=mas)
-
-
-def mask(predict, truth, **kwargs):
-    """To select specific elements from Tensor. This method calls ``squash()``.
-
-    :param predict: Tensor, [batch_size , max_len , tag_size]
-    :param truth: Tensor, [batch_size , max_len]
-    :param kwargs: extra arguments, kwargs["mask"]: ByteTensor, [batch_size , max_len], the mask Tensor. The position that is 1 will be selected.
-
-    :return predict , truth: predict & truth after processing
-    """
-    if kwargs.get("mask") is None:
-        return predict, truth
-    mask = kwargs["mask"]
-    predict, truth = squash(predict, truth)
-    mask = mask.view(-1, )
-    predict = torch.masked_select(predict.permute(1, 0), mask).view(predict.size()[-1], -1).permute(1, 0)
-    truth = torch.masked_select(truth, mask)
-    return predict, truth
-
-
-def make_mask(lens, tar_len):
-    """To generate a mask over a sequence.
-
-    :param lens: list or LongTensor, [batch_size]
-    :param tar_len: int
-    :return mask: ByteTensor
-    """
-    lens = torch.LongTensor(lens)
-    mask = [torch.ge(lens, i + 1) for i in range(tar_len)]
-    mask = torch.stack(mask, 1)
-    return mask

fastNLP/core/optimizer.py   +6  -3

@@ -33,8 +33,9 @@ class Optimizer(object):
    def construct_from_pytorch(self, model_params):
        raise NotImplementedError
-    def _get_require_grads_param(self, params):
+
+    @staticmethod
+    def _get_require_grads_param(params):
        """
        Remove the entries in ``params`` that do not require gradients.
@@ -43,6 +44,7 @@ class Optimizer(object):
"""
return [param for param in params if param.requires_grad]


class NullOptimizer(Optimizer):
"""
当不希望Trainer更新optimizer时,传入本optimizer,但请确保通过callback的方式对参数进行了更新。
@@ -113,7 +115,8 @@ class Adam(Optimizer):

class AdamW(TorchOptimizer):
    r"""
-    An implementation of AdamW that was expected to appear in a later PyTorch release (https://github.com/pytorch/pytorch/pull/21250); added here ahead of time.
+    An implementation of AdamW, which is already available in PyTorch 1.2.0 (https://github.com/pytorch/pytorch/pull/21250).
+    It is kept here to support older PyTorch versions.
    .. todo::
        Translate into Chinese
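The patched AdamW is exercised by the new test in test/core/test_optimizer.py and used in reproduction/matching/matching_bert.py below. A minimal usage sketch; the model and hyper-parameter values here are illustrative only:

    import torch
    from fastNLP import AdamW  # re-exported at the package top level, as the new test imports it

    model = torch.nn.Linear(10, 3)  # stand-in model, illustrative only
    optimizer = AdamW(params=model.parameters(), lr=2e-5, weight_decay=0.01)

    loss = model(torch.randn(8, 10)).sum()
    loss.backward()
    optimizer.step()       # behaves like a regular torch.optim optimizer
    optimizer.zero_grad()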


fastNLP/io/pipe/matching.py   +2  -2

@@ -51,7 +51,7 @@ class MatchingBertPipe(Pipe):
        super().__init__()
        self.lower = bool(lower)
-        self.tokenizer = get_tokenizer(tokenizer=tokenizer)
+        self.tokenizer = get_tokenizer(tokenize_method=tokenizer)

    def _tokenize(self, data_bundle, field_names, new_field_names):
        """
@@ -191,7 +191,7 @@ class MatchingPipe(Pipe):
        super().__init__()
        self.lower = bool(lower)
-        self.tokenizer = get_tokenizer(tokenizer=tokenizer)
+        self.tokenizer = get_tokenizer(tokenize_method=tokenizer)

    def _tokenize(self, data_bundle, field_names, new_field_names):
        """


fastNLP/io/pipe/utils.py   +15  -6

@@ -65,27 +65,36 @@ def iob2bioes(tags: List[str]) -> List[str]:
    return new_tags


-def get_tokenizer(tokenizer: str, lang='en'):
+def get_tokenizer(tokenize_method: str, lang='en'):
    """

-    :param str tokenizer: the tokenization method to use
+    :param str tokenize_method: the tokenization method to use
    :param str lang: language; currently only en is supported
    :return: the tokenize function
    """
-    if tokenizer == 'spacy':
+    tokenizer_dict = {
+        'spacy': None,
+        'raw': _raw_split,
+        'cn-char': _cn_char_split,
+    }
+    if tokenize_method == 'spacy':
        import spacy
        spacy.prefer_gpu()
        if lang != 'en':
            raise RuntimeError("Spacy only supports en right right.")
        en = spacy.load(lang)
        tokenizer = lambda x: [w.text for w in en.tokenizer(x)]
-    elif tokenizer == 'raw':
-        tokenizer = _raw_split
+    elif tokenize_method in tokenizer_dict:
+        tokenizer = tokenizer_dict[tokenize_method]
    else:
-        raise RuntimeError("Only support `spacy`, `raw` tokenizer.")
+        raise RuntimeError(f"Only support {tokenizer_dict.keys()} tokenizer.")
    return tokenizer


+def _cn_char_split(sent):
+    return [chars for chars in sent]


def _raw_split(sent):
    return sent.split()
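A small sketch of the refactored dispatch: the keyword argument is now `tokenize_method`, and the new 'cn-char' option splits a sentence into individual characters ('spacy' additionally requires the spacy package and its English model):

    from fastNLP.io.pipe.utils import get_tokenizer

    tokenize = get_tokenizer(tokenize_method='raw')        # whitespace splitting via _raw_split
    print(tokenize("fastNLP now supports cn-char splitting"))
    # -> ['fastNLP', 'now', 'supports', 'cn-char', 'splitting']

    char_tokenize = get_tokenizer(tokenize_method='cn-char')  # character splitting via _cn_char_split
    print(char_tokenize("你好"))
    # -> ['你', '好']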



reproduction/matching/matching_bert.py   +5  -6

@@ -8,8 +8,7 @@ from fastNLP.core.optimizer import AdamW
from fastNLP.embeddings import BertEmbedding
from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, MNLIBertPipe,\
    QNLIBertPipe, QuoraBertPipe
-
-from reproduction.matching.model.bert import BertForNLI
+from fastNLP.models.bert import BertForSentenceMatching


# define hyper-parameters
@@ -65,7 +64,7 @@ print(data_bundle) # print details in data_bundle
embed = BertEmbedding(data_bundle.vocabs[Const.INPUT], model_dir_or_name=arg.bert_model_dir_or_name)

# define model
-model = BertForNLI(embed, class_num=len(data_bundle.vocabs[Const.TARGET]))
+model = BertForSentenceMatching(embed, num_labels=len(data_bundle.vocabs[Const.TARGET]))

# define optimizer and callback
optimizer = AdamW(lr=arg.lr, params=model.parameters())
@@ -76,11 +75,11 @@ if arg.task in ['snli']:
    # evaluate test set in every epoch if task is snli.

# define trainer
-trainer = Trainer(train_data=data_bundle.datasets[arg.train_dataset_name], model=model,
+trainer = Trainer(train_data=data_bundle.get_dataset(arg.train_dataset_name), model=model,
                  optimizer=optimizer,
                  batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
                  n_epochs=arg.n_epochs, print_every=-1,
-                 dev_data=data_bundle.datasets[arg.dev_dataset_name],
+                 dev_data=data_bundle.get_dataset(arg.dev_dataset_name),
                  metrics=AccuracyMetric(), metric_key='acc',
                  device=[i for i in range(torch.cuda.device_count())],
                  check_code_level=-1,
@@ -92,7 +91,7 @@ trainer.train(load_best_model=True)

# define tester
tester = Tester(
-    data=data_bundle.datasets[arg.test_dataset_name],
+    data=data_bundle.get_dataset(arg.test_dataset_name),
    model=model,
    metrics=AccuracyMetric(),
    batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
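The script now goes through the DataBundle accessor instead of indexing `data_bundle.datasets` directly. A tiny sketch of the equivalence, assuming `get_dataset` simply looks the name up in the `datasets` dict (the split name 'train' and the field name are illustrative):

    from fastNLP import DataSet
    from fastNLP.io import DataBundle

    bundle = DataBundle(datasets={'train': DataSet({'raw_words': ['a b', 'c d']})})
    # both forms refer to the same DataSet object
    assert bundle.get_dataset('train') is bundle.datasets['train']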


test/core/test_loss.py   +0  -13

@@ -4,7 +4,6 @@ import torch
import torch.nn.functional as F

import fastNLP as loss
-from fastNLP.core.losses import squash, unpad


class TestLoss(unittest.TestCase):
@@ -73,15 +72,3 @@ class TestLosserError(unittest.TestCase):
        with self.assertRaises(Exception):
            ans = l1({"my_predict": a}, {"truth": b, "my": a})
-
-
-class TestLossUtils(unittest.TestCase):
-    def test_squash(self):
-        a, b = squash(torch.randn(3, 5), torch.randn(3, 5))
-        self.assertEqual(tuple(a.size()), (3, 5))
-        self.assertEqual(tuple(b.size()), (15,))
-
-    def test_unpad(self):
-        a, b = unpad(torch.randn(5, 8, 3), torch.randn(5, 8))
-        self.assertEqual(tuple(a.size()), (5, 8, 3))
-        self.assertEqual(tuple(b.size()), (5, 8))

test/core/test_optimizer.py   +10  -1

@@ -2,7 +2,7 @@ import unittest

import torch

-from fastNLP import SGD, Adam
+from fastNLP import SGD, Adam, AdamW


class TestOptim(unittest.TestCase):
@@ -52,3 +52,12 @@ class TestOptim(unittest.TestCase):
        self.assertEqual(optim.__dict__["settings"]["lr"], 0.001)
        res = optim.construct_from_pytorch(torch.nn.Linear(10, 3).parameters())
        self.assertTrue(isinstance(res, torch.optim.Adam))
+
+    def test_AdamW(self):
+        optim = AdamW(params=torch.nn.Linear(10, 3).parameters())
+        self.assertTrue('lr' in optim.defaults)
+        self.assertTrue('weight_decay' in optim.defaults)
+
+        optim = AdamW(params=torch.nn.Linear(10, 3).parameters(), lr=0.002, weight_decay=0.989)
+        self.assertEqual(optim.defaults['lr'], 0.002)
+        self.assertTrue(optim.defaults['weight_decay'], 0.989)

test/data_for_tests/io/rte/dev.tsv   +0  -3

@@ -1,3 +0,0 @@
-index sentence1 sentence2 label
-0 Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Christopher Reeve had an accident. not_entailment
-1 Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Bacteria is winning the war against antibiotics. entailment

test/data_for_tests/io/rte/test.tsv   +0  -3

@@ -1,3 +0,0 @@
-index sentence1 sentence2
-0 Mangla was summoned after Madhumita's sister Nidhi Shukla, who was the first witness in the case. Shukla is related to Mangla.
-1 Authorities in Brazil say that more than 200 people are being held hostage in a prison in the country's remote, Amazonian-jungle state of Rondonia. Authorities in Brazil hold 200 people as hostage.

test/data_for_tests/io/rte/train.tsv   +0  -4

@@ -1,4 +0,0 @@
-index sentence1 sentence2 label
-0 No Weapons of Mass Destruction Found in Iraq Yet. Weapons of Mass Destruction Found in Iraq. not_entailment
-1 A place of sorrow, after Pope John Paul II died, became a place of celebration, as Roman Catholic faithful gathered in downtown Chicago to mark the installation of new Pope Benedict XVI. Pope Benedict XVI is the new leader of the Roman Catholic Church. entailment
-2 Herceptin was already approved to treat the sickest breast cancer patients, and the company said, Monday, it will discuss with federal regulators the possibility of prescribing the drug for more breast cancer patients. Herceptin can be used to treat breast cancer. entailment
