Merge pull request #3 from fastnlp/dev0.5.0

Dev0.5.0 merge

Tag: tags/v0.4.10
Author: Danqing Wang, 6 years ago
Commit: cdcc235d4c
18 changed files with 463 additions and 208 deletions
  1. fastNLP/core/callback.py                              +1    -1
  2. fastNLP/core/dataset.py                               +4    -2
  3. fastNLP/core/losses.py                                +12   -8
  4. fastNLP/core/optimizer.py                             +17   -0
  5. fastNLP/core/predictor.py                             +3    -1
  6. fastNLP/core/tester.py                                +16   -16
  7. fastNLP/core/trainer.py                               +9    -11
  8. fastNLP/core/utils.py                                 +22   -1
  9. fastNLP/io/embed_loader.py                            +19   -14
  10. fastNLP/io/file_utils.py                             +31   -0
  11. fastNLP/modules/encoder/embedding.py                 +23   -45
  12. fastNLP/modules/encoder/lstm.py                      +1    -10
  13. fastNLP/modules/utils.py                             +2    -0
  14. reproduction/matching/data/MatchingDataLoader.py     +219  -51
  15. reproduction/matching/matching.py                    +0    -44
  16. reproduction/matching/matching_esim.py               +65   -0
  17. reproduction/matching/model/esim.py                  +18   -3
  18. setup.py                                             +1    -1

+ 1  - 1   fastNLP/core/callback.py

@@ -548,7 +548,7 @@ class LRScheduler(Callback):
         else:
             raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")

-    def on_epoch_begin(self):
+    def on_epoch_end(self):
         self.scheduler.step(self.epoch)

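For context, a minimal sketch of how this callback is typically constructed; the wrapped torch.optim.lr_scheduler instance is what `self.scheduler.step(self.epoch)` is called on, and passing the callback to Trainer through its callbacks argument is assumed here:

import torch
from fastNLP.core.callback import LRScheduler

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Wrap any torch.optim.lr_scheduler; after this change the wrapped scheduler is
# stepped at the end of each epoch (on_epoch_end) instead of at the beginning.
lr_callback = LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5))
# Typically passed as Trainer(..., callbacks=[lr_callback]).
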
+ 4  - 2   fastNLP/core/dataset.py

@@ -801,17 +801,19 @@ class DataSet(object):
         else:
             return DataSet()

-    def split(self, ratio):
+    def split(self, ratio, shuffle=True):
         """
         Split the DataSet according to ratio and return two DataSets.

         :param float ratio: 0<ratio<1; the first returned DataSet holds `(1-ratio)` of the data, the second holds `ratio`
+        :param bool shuffle: whether to shuffle before splitting
         :return: [DataSet, DataSet]
         """
         assert isinstance(ratio, float)
         assert 0 < ratio < 1
         all_indices = [_ for _ in range(len(self))]
-        np.random.shuffle(all_indices)
+        if shuffle:
+            np.random.shuffle(all_indices)
         split = int(ratio * len(self))
         dev_indices = all_indices[:split]
         train_indices = all_indices[split:]


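A quick illustration of the new flag (a minimal sketch; the dict-of-lists constructor is standard fastNLP usage, and the return order follows the docstring above, first DataSet holding 1-ratio of the rows):

from fastNLP import DataSet

ds = DataSet({'x': list(range(10))})
train, dev = ds.split(0.2, shuffle=False)   # keeps the original row order
print(len(train), len(dev))                 # 8 2
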
+ 12  - 8   fastNLP/core/losses.py

@@ -26,7 +26,7 @@ from .utils import _build_args
 from .utils import _check_arg_dict_list
 from .utils import _check_function_or_method
 from .utils import _get_func_signature
+from .utils import seq_len_to_mask


 class LossBase(object):
     """

@@ -223,7 +223,9 @@ class CrossEntropyLoss(LossBase):
     :param pred: the mapping for `pred` in the parameter map; None means the mapping is `pred` -> `pred`
     :param target: the mapping for `target` in the parameter map; None means the mapping is `target` -> `target`
-    :param padding_idx: the padding index; entries of target equal to padding_idx are ignored when computing the loss
+    :param seq_len: sentence lengths; tokens beyond the length do not contribute to the loss.
+    :param padding_idx: the padding index; entries of target equal to padding_idx are ignored when computing the loss.
+        Can be used instead of passing seq_len.

     Example::

@@ -231,16 +233,18 @@ class CrossEntropyLoss(LossBase):
     """

-    def __init__(self, pred=None, target=None, padding_idx=-100):
+    def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100):
         super(CrossEntropyLoss, self).__init__()
-        self._init_param_map(pred=pred, target=target)
+        self._init_param_map(pred=pred, target=target, seq_len=seq_len)
         self.padding_idx = padding_idx

-    def get_loss(self, pred, target):
+    def get_loss(self, pred, target, seq_len=None):
         if pred.dim()>2:
+            if pred.size()[:2]==target.size():
+                # F.cross_entropy applies log_softmax over dim 1, so a pred of shape (16, 10, 4)
+                # needs its dimensions swapped first
+                pred = pred.transpose(1, 2)
-            pred = pred.view(-1, pred.size(-1))
-            target = target.view(-1)
+        if seq_len is not None:
+            mask = seq_len_to_mask(seq_len).view(-1).eq(0)
+            target = target.masked_fill(mask, self.padding_idx)

         return F.cross_entropy(input=pred, target=target,
                                ignore_index=self.padding_idx)


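A small sketch of how the new mapping is declared (field names are the usual fastNLP conventions; masking semantics follow the docstring above):

from fastNLP import CrossEntropyLoss

# Map the DataSet field 'seq_len' into the loss; tokens beyond each sentence's
# length are rewritten to padding_idx and therefore ignored by F.cross_entropy.
loss = CrossEntropyLoss(pred='pred', target='target', seq_len='seq_len', padding_idx=-100)
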
+ 17  - 0   fastNLP/core/optimizer.py

@@ -36,6 +36,23 @@ class Optimizer(object):
         """
         return [param for param in params if param.requires_grad]


+class NullOptimizer(Optimizer):
+    """
+    Pass in this optimizer when you do not want the Trainer to update the parameters itself,
+    but make sure the parameters are then updated through a callback.
+
+    """
+    def __init__(self):
+        super().__init__(None)
+
+    def construct_from_pytorch(self, model_params):
+        pass
+
+    def __getattr__(self, item):
+        def pass_func(*args, **kwargs):
+            pass
+
+        return pass_func


 class SGD(Optimizer):
     """


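Because __getattr__ returns a no-op function for any missing attribute, every optimizer call made on this object silently does nothing; a quick check of that behaviour:

from fastNLP.core.optimizer import NullOptimizer

opt = NullOptimizer()
opt.construct_from_pytorch(None)   # defined above; builds nothing and returns None
opt.step()                         # resolved through __getattr__, does nothing
opt.zero_grad()                    # likewise a no-op
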
+ 3  - 1   fastNLP/core/predictor.py

@@ -9,7 +9,7 @@ import torch
 from . import DataSetIter
 from . import DataSet
 from . import SequentialSampler
-from .utils import _build_args
+from .utils import _build_args, _move_dict_value_to_device, _get_model_device


 class Predictor(object):

@@ -43,6 +43,7 @@ class Predictor(object):
             raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data))

         self.network.eval()
+        network_device = _get_model_device(self.network)
         batch_output = defaultdict(list)
         data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

@@ -53,6 +54,7 @@ class Predictor(object):
         with torch.no_grad():
             for batch_x, _ in data_iterator:
+                _move_dict_value_to_device(batch_x, _, device=network_device)
                 refined_batch_x = _build_args(predict_func, **batch_x)
                 prediction = predict_func(**refined_batch_x)




+ 16  - 16   fastNLP/core/tester.py

@@ -48,6 +48,7 @@ from .utils import _move_dict_value_to_device
 from .utils import _get_func_signature
 from .utils import _get_model_device
 from .utils import _move_model_to_device
+from .utils import _data_parallel_wrapper

 __all__ = [
     "Tester"

@@ -104,26 +105,25 @@ class Tester(object):
             self.data_iterator = data
         else:
             raise TypeError("data type {} not support".format(type(data)))

-        # if the model is a DataParallel, its predict() method cannot be used
-        if isinstance(self._model, nn.DataParallel):
-            if hasattr(self._model.module, 'predict') and not hasattr(self._model, 'predict'):
-                warnings.warn("Cannot use DataParallel to test your model, because your model offer predict() function,"
-                              " while DataParallel has no predict() function.")
-                self._model = self._model.module

         # check predict
-        if hasattr(self._model, 'predict'):
-            self._predict_func = self._model.predict
-            if not callable(self._predict_func):
-                _model_name = model.__class__.__name__
-                raise TypeError(f"`{_model_name}.predict` must be callable to be used "
-                                f"for evaluation, not `{type(self._predict_func)}`.")
+        if (hasattr(self._model, 'predict') and callable(self._model.predict)) or \
+                (isinstance(self._model, nn.DataParallel) and hasattr(self._model.module, 'predict') and
+                 callable(self._model.module.predict)):
+            if isinstance(self._model, nn.DataParallel):
+                self._predict_func_wrapper = _data_parallel_wrapper(self._model.module.predict, self._model.device_ids,
+                                                                    self._model.output_device)
+                self._predict_func = self._model.module.predict
+            else:
+                self._predict_func = self._model.predict
+                self._predict_func_wrapper = self._model.predict
         else:
-            if isinstance(model, nn.DataParallel):
+            if isinstance(self._model, nn.DataParallel):
+                self._predict_func_wrapper = self._model.forward
                 self._predict_func = self._model.module.forward
             else:
                 self._predict_func = self._model.forward
+                self._predict_func_wrapper = self._model.forward

     def test(self):
         """Run the evaluation and return the results.

@@ -180,7 +180,7 @@ class Tester(object):
     def _data_forward(self, func, x):
         """A forward pass of the model. """
         x = _build_args(func, **x)
-        y = func(**x)
+        y = self._predict_func_wrapper(**x)
         return y

     def _format_eval_results(self, results):


+ 9  - 11   fastNLP/core/trainer.py

@@ -452,17 +452,15 @@ class Trainer(object):
         else:
             raise TypeError("train_data type {} not support".format(type(train_data)))

-        self.model = _move_model_to_device(model, device=device)
-
         if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
-            _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data,
+            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                         metric_key=metric_key, check_level=check_code_level,
                         batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
         # _check_code is how fastNLP checks that your code is correct; if you see this comment in an error traceback, please check your code carefully
+        self.model = _move_model_to_device(model, device=device)

         self.train_data = train_data
         self.dev_data = dev_data  # If None, No validation.
-        self.model = model
         self.losser = losser
         self.metrics = metrics
         self.n_epochs = int(n_epochs)

@@ -480,16 +478,16 @@ class Trainer(object):
         if isinstance(optimizer, torch.optim.Optimizer):
             self.optimizer = optimizer
         elif isinstance(optimizer, Optimizer):
-            self.optimizer = optimizer.construct_from_pytorch(model.parameters())
+            self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())
         elif optimizer is None:
-            self.optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
+            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3)
         else:
             raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer)))

         self.use_tqdm = use_tqdm
         self.pbar = None
         self.print_every = abs(self.print_every)

         if self.dev_data is not None:
             self.tester = Tester(model=self.model,
                                  data=self.dev_data,

@@ -617,7 +615,7 @@ class Trainer(object):
                 if self.step % self.print_every == 0:
                     avg_loss = float(avg_loss) / self.print_every
                     if self.use_tqdm:
-                        print_output = "loss:{0:<6.5f}".format(avg_loss)
+                        print_output = "loss:{:<6.5f}".format(avg_loss)
                         pbar.update(self.print_every)
                     else:
                         end = time.time()

@@ -681,7 +679,7 @@ class Trainer(object):
        """Perform weight update on a model.

        """
-        if self.optimizer is not None and (self.step + 1) % self.update_every == 0:
+        if self.step % self.update_every == 0:
            self.optimizer.step()

    def _data_forward(self, network, x):

@@ -699,7 +697,7 @@ class Trainer(object):

        For PyTorch, just do "loss.backward()"
        """
-        if self.step % self.update_every == 0:
+        if (self.step - 1) % self.update_every == 0:
            self.model.zero_grad()
        loss.backward()


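The last two hunks shift the zero_grad/step bookkeeping by one because Trainer.step counts from 1; a plain-Python sketch of which steps now trigger which call (illustration only, not Trainer code):

update_every = 4
for step in range(1, 9):                          # Trainer.step starts at 1
    zero_grad = (step - 1) % update_every == 0    # model.zero_grad() at the start of each accumulation window
    do_update = step % update_every == 0          # optimizer.step() at the end of each window
    print(step, zero_grad, do_update)
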
+ 22  - 1   fastNLP/core/utils.py

@@ -16,7 +16,9 @@ from collections import Counter, namedtuple
 import numpy as np
 import torch
 import torch.nn as nn
+from torch.nn.parallel.scatter_gather import scatter_kwargs, gather
+from torch.nn.parallel.replicate import replicate
+from torch.nn.parallel.parallel_apply import parallel_apply

 _CheckRes = namedtuple('_CheckRes', ['missing', 'unused', 'duplicated', 'required', 'all_needed',
                                      'varargs'])

@@ -277,6 +279,25 @@ def _move_model_to_device(model, device):
     model = model.to(device)
     return model


+def _data_parallel_wrapper(func, device_ids, output_device):
+    """
+    A wrapper for functions that need to run on multiple GPUs, modelled on nn.DataParallel's forward.
+
+    :param func: callable
+    :param device_ids: device_ids as in nn.DataParallel
+    :param inputs:
+    :param kwargs:
+    :return:
+    """
+    def wrapper(*inputs, **kwargs):
+        inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0)
+        if len(device_ids) == 1:
+            return func(*inputs[0], **kwargs[0])
+        replicas = replicate(func, device_ids[:len(inputs)])
+        outputs = parallel_apply(replicas, inputs, kwargs)
+        return gather(outputs, output_device)
+    return wrapper


 def _get_model_device(model):
     """


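A rough usage sketch of the new wrapper (assumes at least one visible CUDA device; Tester above applies the same pattern to model.module.predict):

import torch
import torch.nn as nn
from fastNLP.core.utils import _data_parallel_wrapper

# Illustration only: with a single device the wrapper just scatters the kwargs
# and calls the function directly; with several devices it replicates and gathers.
model = nn.DataParallel(nn.Linear(8, 3).cuda(), device_ids=[0], output_device=0)
wrapped = _data_parallel_wrapper(model.module.forward, model.device_ids, model.output_device)
out = wrapped(input=torch.randn(4, 8).cuda())   # outputs gathered on output_device
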
+ 19  - 14   fastNLP/io/embed_loader.py

@@ -38,7 +38,8 @@ class EmbedLoader(BaseLoader):
         super(EmbedLoader, self).__init__()

     @staticmethod
-    def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, error='ignore'):
+    def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
+                        error='ignore', init_method=None):
         """
         Extract the embeddings of the words in vocab from the pre-trained vectors at embed_filepath. EmbedLoader automatically
         detects whether embed_filepath is in word2vec format (the first line has only two elements) or glove format.

@@ -52,6 +53,7 @@ class EmbedLoader(BaseLoader):
         :param bool normalize: whether to normalize each vector to norm 1
         :param str error: `ignore` or `strict`; with `ignore` errors are skipped automatically, with `strict` they are raised.
             Errors here mostly come from empty lines in the vocabulary file or inconsistent dimensions.
+        :param callable init_method: takes a numpy.ndarray and returns a numpy.ndarray; used to initialize the embedding
         :return numpy.ndarray: shape is [len(vocab), dimension], where dimension is determined by the pretrained embedding.
         """
         assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."

@@ -69,6 +71,8 @@ class EmbedLoader(BaseLoader):
             dim = len(parts) - 1
             f.seek(0)
             matrix = np.random.randn(len(vocab), dim).astype(dtype)
+            if init_method:
+                matrix = init_method(matrix)
             for idx, line in enumerate(f, start_idx):
                 try:
                     parts = line.strip().split()

@@ -91,14 +95,15 @@ class EmbedLoader(BaseLoader):
                     raise e
             total_hits = sum(hit_flags)
             print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab)))
-            found_vectors = matrix[hit_flags]
-            if len(found_vectors) != 0:
-                mean = np.mean(found_vectors, axis=0, keepdims=True)
-                std = np.std(found_vectors, axis=0, keepdims=True)
-                unfound_vec_num = len(vocab) - total_hits
-                r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
-                matrix[hit_flags == False] = r_vecs
+            if init_method is None:
+                found_vectors = matrix[hit_flags]
+                if len(found_vectors) != 0:
+                    mean = np.mean(found_vectors, axis=0, keepdims=True)
+                    std = np.std(found_vectors, axis=0, keepdims=True)
+                    unfound_vec_num = len(vocab) - total_hits
+                    r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
+                    matrix[hit_flags == False] = r_vecs

             if normalize:
                 matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)

@@ -157,13 +162,17 @@ class EmbedLoader(BaseLoader):
             if dim == -1:
                 raise RuntimeError("{} is an empty file.".format(embed_filepath))
             matrix = np.random.randn(len(vocab), dim).astype(dtype)
+            for key, vec in vec_dict.items():
+                index = vocab.to_index(key)
+                matrix[index] = vec
+
             if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
                 start_idx = 0
                 if padding is not None:
                     start_idx += 1
                 if unknown is not None:
                     start_idx += 1
                 mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
                 std = np.std(matrix[start_idx:], axis=0, keepdims=True)
                 if (unknown is not None and not found_unknown):

@@ -171,10 +180,6 @@ class EmbedLoader(BaseLoader):
                 if (padding is not None and not found_pad):
                     matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean

-            for key, vec in vec_dict.items():
-                index = vocab.to_index(key)
-                matrix[index] = vec
-
             if normalize:
                 matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)


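A small usage sketch for the new argument (the file path and word list are placeholders; per the hunk above, supplying init_method skips the mean/std re-draw for words missing from the pretrained file):

import numpy as np
from fastNLP import Vocabulary
from fastNLP.io.embed_loader import EmbedLoader

vocab = Vocabulary()
vocab.add_word_lst(['the', 'quick', 'brown', 'fox'])
vocab.build_vocab()

# init_method receives the freshly drawn [len(vocab), dim] matrix and returns the initialized one.
matrix = EmbedLoader.load_with_vocab('path/to/glove.6B.50d.txt', vocab,
                                     init_method=lambda m: np.zeros_like(m))
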
+ 31  - 0   fastNLP/io/file_utils.py

@@ -10,6 +10,37 @@ import shutil
 import hashlib




PRETRAINED_BERT_MODEL_DIR = {
'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

PRETRAINED_ELMO_MODEL_DIR = {
'en': 'elmo_en-d39843fe.tar.gz',
'cn': 'elmo_cn-5e9b34e2.tar.gz'
}

PRETRAIN_STATIC_FILES = {
'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
'en-fasttext': "cc.en.300.vec-d53187b2.gz",
'cn': "tencent_cn-dab24577.tar.gz",
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
}


def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path:
    """
    Given a url or a filename (either a concrete filename or a file), first check whether the file exists under cache_dir; if it does not, download it, and


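A hedged sketch of how one of the shortcut names above resolves to a cached file (the URL join mirrors the PRETRAIN_URL / model_name pattern used in embedding.py below and is an assumption here):

from fastNLP.io.file_utils import PRETRAINED_BERT_MODEL_DIR, cached_path, _get_base_url

model_name = PRETRAINED_BERT_MODEL_DIR['en-base-uncased']   # 'bert-base-uncased-3413b23c.zip'
model_url = _get_base_url('bert') + model_name              # assumed join, as in the BertEmbedding code path
model_dir = cached_path(model_url)                          # downloads once, then reuses the cached copy
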
+ 23  - 45   fastNLP/modules/encoder/embedding.py

@@ -26,6 +26,7 @@ from ...core.dataset import DataSet
 from ...core.batch import DataSetIter
 from ...core.sampler import SequentialSampler
 from ...core.utils import _move_model_to_device, _get_model_device
+from ...io.file_utils import PRETRAINED_BERT_MODEL_DIR, PRETRAINED_ELMO_MODEL_DIR, PRETRAIN_STATIC_FILES


 class Embedding(nn.Module):

@@ -179,23 +180,14 @@ class StaticEmbedding(TokenEmbedding):
        the model name. Currently supported embeddings include {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
        `en-word2vec-300` : GoogleNews-vectors-negative300}. In the second case the cache is checked for the model and it is downloaded automatically if missing.
    :param requires_grad: whether gradients are required. Defaults to True
-    :param init_method: how to initialize values that were not found. Any of the methods in torch.nn.init.* can be used. Defaults to
-        torch.nn.init.xavier_uniform_. The method is called with a tensor object.

+    :param init_method: how to initialize values that were not found. Any of the methods in torch.nn.init.* can be used. The method is called with a tensor object.
+    :param normalize: whether to normalize each vector so that its norm is 1.
    """
-    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None):
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None,
+                 normalize=False):
        super(StaticEmbedding, self).__init__(vocab)

        # Define up front which static embeddings can be downloaded; we probably need our own server for this,
-        PRETRAIN_STATIC_FILES = {
-            'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
-            'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
-            'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
-            'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
-            'en-fasttext': "cc.en.300.vec-d53187b2.gz",
-            'cn': "tencent_cn-dab24577.tar.gz",
-            'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
-        }

        # get the cache_path
        if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:

@@ -210,7 +202,8 @@ class StaticEmbedding(TokenEmbedding):
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        # read the embedding
-        embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method)
+        embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
+                                                     normalize=normalize)
        self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                      padding_idx=vocab.padding_idx,
                                      max_norm=None, norm_type=2, scale_grad_by_freq=False,

@@ -231,7 +224,7 @@ class StaticEmbedding(TokenEmbedding):
        :return:
        """
        requires_grads = set([param.requires_grad for name, param in self.named_parameters()
                              if 'words_to_words' not in name])
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:

@@ -244,8 +237,8 @@ class StaticEmbedding(TokenEmbedding):
                continue
            param.requires_grad = value

-    def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
-                         error='ignore', init_method=None):
+    def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
+                         normalize=True, error='ignore', init_method=None):
        """
        Extract the embeddings of the words in vocab from the pre-trained vectors at embed_filepath. EmbedLoader automatically
        detects whether embed_filepath is in word2vec format (the first line has only two elements) or glove format.
@@ -265,10 +258,7 @@ class StaticEmbedding(TokenEmbedding):
        assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
        if not os.path.exists(embed_filepath):
            raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
-        if init_method is None:
-            init_method = nn.init.xavier_uniform_
        with open(embed_filepath, 'r', encoding='utf-8') as f:
-            found_count = 0
            line = f.readline().strip()
            parts = line.split()
            start_idx = 0

@@ -279,7 +269,8 @@ class StaticEmbedding(TokenEmbedding):
            dim = len(parts) - 1
            f.seek(0)
            matrix = torch.zeros(len(vocab), dim)
-            init_method(matrix)
+            if init_method is not None:
+                init_method(matrix)
            hit_flags = np.zeros(len(vocab), dtype=bool)
            for idx, line in enumerate(f, start_idx):
                try:

@@ -294,7 +285,6 @@ class StaticEmbedding(TokenEmbedding):
                    if word in vocab:
                        index = vocab.to_index(word)
                        matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
-                        found_count += 1
                        hit_flags[index] = True
                except Exception as e:
                    if error == 'ignore':

@@ -302,7 +292,16 @@ class StaticEmbedding(TokenEmbedding):
                    else:
                        print("Error occurred at the {} line.".format(idx))
                        raise e
+            found_count = sum(hit_flags)
            print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab)))
+            if init_method is None:
+                if len(vocab)-found_count>0 and found_count>0:  # some words were not found
+                    found_vecs = matrix[torch.LongTensor(hit_flags.astype(int)).byte()]
+                    mean = found_vecs.mean(dim=0, keepdim=True)
+                    std = found_vecs.std(dim=0, keepdim=True)
+                    unfound_vec_num = np.sum(hit_flags==False)
+                    unfound_vecs = torch.randn(unfound_vec_num, dim)*std + mean
+                    matrix[torch.LongTensor(hit_flags.astype(int)).eq(0)] = unfound_vecs

            if normalize:
                matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12)
@@ -329,11 +328,6 @@ class ContextualEmbedding(TokenEmbedding):
        """
        Because generating contextual embeddings is slow, the embedding of each sentence can be cached so that the generation step does not have to be rerun every time.

-        Example::
-
-            >>>

        :param datasets: DataSet objects
        :param batch_size: int, the batch size used when generating the cached sentence representations
        :param device: see the device argument of :class::fastNLP.Trainer

@@ -363,7 +357,7 @@ class ContextualEmbedding(TokenEmbedding):
            seq_len = words.ne(pad_index).sum(dim=-1)
            max_len = words.size(1)
            # some inputs may contain CLS and SEP, so counting from the end is safer
-            seq_len_from_behind =(max_len - seq_len).tolist()
+            seq_len_from_behind = (max_len - seq_len).tolist()
            word_embeds = self(words).detach().cpu().numpy()
            for b in range(words.size(0)):
                length = seq_len_from_behind[b]

@@ -446,9 +440,6 @@ class ElmoEmbedding(ContextualEmbedding):
        self.layers = layers

        # check whether model_dir_or_name exists and download it if necessary
-        PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz',
-                                     'cn': 'elmo_cn-5e9b34e2.tar.gz'}
-
        if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('elmo')
            model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name]

@@ -532,21 +523,8 @@ class BertEmbedding(ContextualEmbedding):
    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
                 pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False):
        super(BertEmbedding, self).__init__(vocab)
-        # check whether model_dir_or_name exists and download it if necessary
-        PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
-                                     'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
-                                     'en-base-cased': 'bert-base-cased-f89bfe08.zip',
-                                     'en-large-uncased': 'bert-large-uncased-20939f45.zip',
-                                     'en-large-cased': 'bert-large-cased-e0cf90fc.zip',
-
-                                     'cn': 'bert-base-chinese-29d0a84a.zip',
-                                     'cn-base': 'bert-base-chinese-29d0a84a.zip',
-
-                                     'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
-                                     'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
-                                     'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
-                                     }

+        # check whether model_dir_or_name exists and download it if necessary
        if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]


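A short sketch of the updated StaticEmbedding constructor (the 'en-glove-6b-50' shortcut comes from PRETRAIN_STATIC_FILES above; the vectors are downloaded into the cache on first use):

from fastNLP import Vocabulary
from fastNLP.modules.encoder.embedding import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst(['the', 'quick', 'brown', 'fox'])
vocab.build_vocab()

# With init_method=None, words missing from the pretrained file are re-drawn from the
# mean/std of the found vectors; normalize=False keeps the raw vector norms.
embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50',
                        requires_grad=True, normalize=False)
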
+ 1  - 10   fastNLP/modules/encoder/lstm.py

@@ -73,21 +73,12 @@ class LSTM(nn.Module):
            x = x[:, sort_idx]
            x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
            output, hx = self.lstm(x, hx)  # -> [N,L,C]
-            output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first)
+            output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
            _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
            if self.batch_first:
                output = output[unsort_idx]
            else:
                output = output[:, unsort_idx]
-            # work around LSTM not being usable under DataParallel: https://github.com/pytorch/pytorch/issues/1591
-            if self.batch_first:
-                if output.size(1) < max_len:
-                    dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1))
-                    output = torch.cat([output, dummy_tensor], 1)
-            else:
-                if output.size(0) < max_len:
-                    dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1))
-                    output = torch.cat([output, dummy_tensor], 0)
        else:
            output, hx = self.lstm(x, hx)
        return output, hx

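The total_length argument replaces the manual zero-padding above; a quick standalone check of that PyTorch behaviour:

import torch
from torch.nn.utils import rnn

# total_length pads the unpacked output back to the full length, which keeps output
# shapes consistent across DataParallel replicas (https://github.com/pytorch/pytorch/issues/1591).
packed = rnn.pack_padded_sequence(torch.randn(3, 7, 5), lengths=[7, 4, 2], batch_first=True)
out, lens = rnn.pad_packed_sequence(packed, batch_first=True, total_length=10)
print(out.shape)   # torch.Size([3, 10, 5])
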
+ 2  - 0   fastNLP/modules/utils.py

@@ -82,6 +82,8 @@ def get_embeddings(init_embed):
     if isinstance(init_embed, tuple):
         res = nn.Embedding(
             num_embeddings=init_embed[0], embedding_dim=init_embed[1])
+        nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
+                         b=np.sqrt(3/res.weight.data.size(1)))
     elif isinstance(init_embed, nn.Module):
         res = init_embed
     elif isinstance(init_embed, torch.Tensor):


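A quick check of the new initialization path when a (num_embeddings, embedding_dim) tuple is passed:

from fastNLP.modules.utils import get_embeddings

emb = get_embeddings((1000, 50))   # nn.Embedding(1000, 50), weights drawn from U(-sqrt(3/50), sqrt(3/50))
print(emb.weight.data.abs().max() <= (3 / 50) ** 0.5)   # tensor(True)
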
+ 219  - 51   reproduction/matching/data/MatchingDataLoader.py

@@ -1,36 +1,63 @@

 import os

-from nltk import Tree
 from typing import Union, Dict

 from fastNLP.core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
+from fastNLP.core.dataset import DataSet
 from fastNLP.io.base_loader import DataInfo
-from fastNLP.io.dataset_loader import JsonLoader
-from fastNLP.io.file_utils import _get_base_url, cached_path
+from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader, CSVLoader
+from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
 from fastNLP.modules.encoder._bert import BertTokenizer


-class MatchingLoader(JsonLoader):
+class MatchingLoader(DataSetLoader):
    """
    Alias: :class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader`

    Reads data sets for Matching tasks
    """

-    def __init__(self, fields=None, paths: dict=None):
-        super(MatchingLoader, self).__init__(fields=fields)
+    def __init__(self, paths: dict=None):
+        """
+        :param dict paths: keys are data set names (e.g. train, dev, test), values are the corresponding file names
+        """
        self.paths = paths

    def _load(self, path):
-        return super(MatchingLoader, self)._load(path)
-
-    def process(self, paths: Union[str, Dict[str, str]], dataset_name=None,
-                to_lower=False, char_information=False, seq_len_type: str=None,
-                bert_tokenizer: str=None, get_index=True, set_input: Union[list, str, bool]=True,
+        """
+        :param str path: path of the data set file to read
+        :return: fastNLP.DataSet ds: a DataSet object that must contain 3 fields: two of them hold the raw string
+            text of the two sentences, the third holds the label
+        """
+        raise NotImplementedError
+
+    def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None,
+                to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None,
+                cut_text: int = None, get_index=True, set_input: Union[list, str, bool]=True,
                set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo:
+        """
+        :param paths: str or Dict[str, str]. If a str, it is either the folder holding the data sets or a full
+            file path: for a folder, the data set names and file names are taken from self.paths; for a Dict,
+            keys are data set names (e.g. train, dev, test) and values are the corresponding full file paths.
+        :param str dataset_name: if paths is a single full file path, dataset_name names that data set;
+            if not given it defaults to train.
+        :param bool to_lower: whether to lowercase the text automatically. Defaults to False.
+        :param str seq_len_type: the kind of seq_len to provide. ``seq_len``: a number giving the sentence length;
+            ``mask``: a 0/1 mask matrix standing in for the sentence length; ``bert``: segment_type_id (0 for the
+            first sentence, 1 for the second) plus an attention-mask matrix (a 0/1 mask). Defaults to None,
+            i.e. no seq_len is provided.
+        :param str bert_tokenizer: path of the folder holding the vocabulary used by the bert tokenizer
+        :param int cut_text: truncate anything longer than cut_text. Defaults to None, i.e. no truncation.
+        :param bool get_index: whether to convert the text to indices using the vocabulary
+        :param set_input: if True, the relevant fields (those whose names contain Const.INPUT) are automatically
+            set as input; if False no field is set as input. If a str or List[str] is passed, the named fields
+            are set as input and no other field is. Defaults to True.
+        :param set_target: set_target controls which fields may be set as target; usage matches set_input.
+            Defaults to True.
+        :param concat: whether to concatenate the two sentences. If False they are not concatenated. If True a
+            <sep> token is inserted between the two sentences. If a list of length 4 is passed, its elements are
+            the markers inserted before the first sentence, after the first sentence, before the second sentence
+            and after the second sentence. If the string ``bert`` is passed, bert-style concatenation is used,
+            equivalent to ['[CLS]', '[SEP]', '', '[SEP]'].
+        :return:
+        """
        if isinstance(set_input, str):
            set_input = [set_input]
        if isinstance(set_target, str):
@@ -59,7 +86,8 @@ class MatchingLoader(JsonLoader):
            if auto_set_input:
                data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
            if auto_set_target:
-                data_set.set_target(Const.TARGET)
+                if Const.TARGET in data_set.get_field_names():
+                    data_set.set_target(Const.TARGET)

        if to_lower:
            for data_name, data_set in data_info.datasets.items():

@@ -69,19 +97,6 @@ class MatchingLoader(JsonLoader):
                               is_input=auto_set_input)

        if bert_tokenizer is not None:
-            PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
-                                         'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
-                                         'en-base-cased': 'bert-base-cased-f89bfe08.zip',
-                                         'en-large-uncased': 'bert-large-uncased-20939f45.zip',
-                                         'en-large-cased': 'bert-large-cased-e0cf90fc.zip',
-
-                                         'cn': 'bert-base-chinese-29d0a84a.zip',
-                                         'cn-base': 'bert-base-chinese-29d0a84a.zip',
-
-                                         'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
-                                         'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
-                                         'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
-                                         }
            if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
                PRETRAIN_URL = _get_base_url('bert')
                model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]

@@ -93,6 +108,13 @@ class MatchingLoader(JsonLoader):
            else:
                raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

+            words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
+            with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f:
+                lines = f.readlines()
+            lines = [line.strip() for line in lines]
+            words_vocab.add_word_lst(lines)
+            words_vocab.build_vocab()
+
            tokenizer = BertTokenizer.from_pretrained(model_dir)

            for data_name, data_set in data_info.datasets.items():

@@ -128,14 +150,14 @@ class MatchingLoader(JsonLoader):
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: len(x[fields]),
-                                       new_field_name=fields.replace(Const.INPUT, Const.TARGET),
+                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'mask':
            for data_name, data_set in data_info.datasets.items():
                for fields in data_set.get_field_names():
                    if Const.INPUT in fields:
                        data_set.apply(lambda x: [1] * len(x[fields]),
-                                       new_field_name=fields.replace(Const.INPUT, Const.TARGET),
+                                       new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
                                       is_input=auto_set_input)
        elif seq_len_type == 'bert':
            for data_name, data_set in data_info.datasets.items():
@@ -147,18 +169,26 @@ class MatchingLoader(JsonLoader):
                    data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
                                   new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

+        if cut_text is not None:
+            for data_name, data_set in data_info.datasets.items():
+                for fields in data_set.get_field_names():
+                    if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
+                        data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields,
+                                       is_input=auto_set_input)
+
        data_set_list = [d for n, d in data_info.datasets.items()]
        assert len(data_set_list) > 0, f'There are NO data sets in data info!'

-        if bert_tokenizer is not None:
-            words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
-        else:
+        if bert_tokenizer is None:
            words_vocab = Vocabulary()
-        words_vocab = words_vocab.from_dataset(*data_set_list,
-                                               field_name=[n for n in data_set_list[0].get_field_names()
-                                                           if (Const.INPUT in n)])
+            words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
+                                                   field_name=[n for n in data_set_list[0].get_field_names()
+                                                               if (Const.INPUT in n)],
+                                                   no_create_entry_dataset=[d for n, d in data_info.datasets.items()
+                                                                            if 'train' not in n])
        target_vocab = Vocabulary(padding=None, unknown=None)
-        target_vocab = target_vocab.from_dataset(*data_set_list, field_name=Const.TARGET)
+        target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
+                                                 field_name=Const.TARGET)
        data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

        if get_index:

@@ -168,19 +198,20 @@ class MatchingLoader(JsonLoader):
                        data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields,
                                       is_input=auto_set_input)

-                data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET,
-                               is_input=auto_set_input, is_target=auto_set_target)
+                if Const.TARGET in data_set.get_field_names():
+                    data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET,
+                                   is_input=auto_set_input, is_target=auto_set_target)

        for data_name, data_set in data_info.datasets.items():
            if isinstance(set_input, list):
-                data_set.set_input(set_input)
+                data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()])
            if isinstance(set_target, list):
-                data_set.set_target(set_target)
+                data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()])

        return data_info


-class SNLILoader(MatchingLoader):
+class SNLILoader(MatchingLoader, JsonLoader):
    """
    Alias: :class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader`

@@ -195,29 +226,166 @@ class SNLILoader(MatchingLoader):

    def __init__(self, paths: dict=None):
        fields = {
-            'sentence1_parse': Const.INPUTS(0),
-            'sentence2_parse': Const.INPUTS(1),
+            'sentence1_binary_parse': Const.INPUTS(0),
+            'sentence2_binary_parse': Const.INPUTS(1),
            'gold_label': Const.TARGET,
        }
        paths = paths if paths is not None else {
            'train': 'snli_1.0_train.jsonl',
            'dev': 'snli_1.0_dev.jsonl',
            'test': 'snli_1.0_test.jsonl'}
-        super(SNLILoader, self).__init__(fields=fields, paths=paths)
+        MatchingLoader.__init__(self, paths=paths)
+        JsonLoader.__init__(self, fields=fields)

    def _load(self, path):
-        ds = super(SNLILoader, self)._load(path)
+        ds = JsonLoader._load(self, path)

-        def parse_tree(x):
-            t = Tree.fromstring(x)
-            return t.leaves()
+        parentheses_table = str.maketrans({'(': None, ')': None})

-        ds.apply(lambda ins: parse_tree(
-            ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0))
-        ds.apply(lambda ins: parse_tree(
-            ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1))
+        ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
+                 new_field_name=Const.INPUTS(0))
+        ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
+                 new_field_name=Const.INPUTS(1))
        ds.drop(lambda x: x[Const.TARGET] == '-')
        return ds




class RTELoader(MatchingLoader, CSVLoader):
"""
    Alias: :class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader`

    Reads the RTE data set; the resulting DataSet contains the fields::

        words1: list(str), the first sentence (premise)
        words2: list(str), the second sentence (hypothesis)
        target: str, the gold label

    Data source:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv' # test set has not label
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'sentence1': Const.INPUTS(0),
'sentence2': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


class QNLILoader(MatchingLoader, CSVLoader):
"""
    Alias: :class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader`

    Reads the QNLI data set; the resulting DataSet contains the fields::

        words1: list(str), the first sentence (premise)
        words2: list(str), the second sentence (hypothesis)
        target: str, the gold label

    Data source:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv' # test set has not label
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'question': Const.INPUTS(0),
'sentence': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


class MNLILoader(MatchingLoader, CSVLoader):
"""
    Alias: :class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.dataset_loader.MNLILoader`

    Reads the MNLI data set; the resulting DataSet contains the fields::

        words1: list(str), the first sentence (premise)
        words2: list(str), the second sentence (hypothesis)
        target: str, the gold label

    Data source:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev_matched': 'dev_matched.tsv',
'dev_mismatched': 'dev_mismatched.tsv',
'test_matched': 'test_matched.tsv',
'test_mismatched': 'test_mismatched.tsv',
}
MatchingLoader.__init__(self, paths=paths)
CSVLoader.__init__(self, sep='\t')
self.fields = {
'sentence1_binary_parse': Const.INPUTS(0),
'sentence2_binary_parse': Const.INPUTS(1),
'gold_label': Const.TARGET,
}

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
if k in ds.get_field_names():
ds.rename_field(k, v)

parentheses_table = str.maketrans({'(': None, ')': None})

ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(0))
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(1))
if Const.TARGET in ds.get_field_names():
ds.drop(lambda x: x[Const.TARGET] == '-')
return ds


class QuoraLoader(MatchingLoader, CSVLoader):

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
'test': 'test.tsv',
}
MatchingLoader.__init__(self, paths=paths)
CSVLoader.__init__(self, sep='\t', headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID'))


def _load(self, path):
ds = CSVLoader._load(self, path)
return ds

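The process() docstring above covers many options; a hedged sketch of a BERT-style call (paths and the tokenizer shortcut are placeholders, the option values follow the docstring):

from reproduction.matching.data.MatchingDataLoader import SNLILoader

# seq_len_type='bert' adds segment ids and an attention mask, concat='bert' joins the two
# sentences as ['[CLS]', sent1, '[SEP]', sent2, '[SEP]'], and cut_text truncates long inputs.
data_info = SNLILoader().process(
    paths='path/to/snli/data/dir',
    to_lower=True,
    seq_len_type='bert',
    bert_tokenizer='en-base-uncased',   # resolved through PRETRAINED_BERT_MODEL_DIR
    cut_text=512,
    concat='bert',
    get_index=True,
)
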
+ 0  - 44   reproduction/matching/matching.py

@@ -1,44 +0,0 @@
import os

import torch

from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const

from fastNLP.io.dataset_loader import MatchingLoader

from reproduction.matching.model.bert import BertForNLI
from reproduction.matching.model.esim import ESIMModel


bert_dirs = 'path/to/bert/dir'

# load data set
# data_info = MatchingLoader(data_format='snli', for_model='bert', bert_dir=bert_dirs).process(...
data_info = MatchingLoader(data_format='snli', for_model='esim').process(
{'train': './data/snli/snli_1.0_train.jsonl',
'dev': './data/snli/snli_1.0_dev.jsonl',
'test': './data/snli/snli_1.0_test.jsonl'},
input_field=[Const.TARGET]
)

# model = BertForNLI(bert_dir=bert_dirs)
model = ESIMModel(data_info.embeddings['elmo'],)

trainer = Trainer(train_data=data_info.datasets['train'], model=model,
optimizer=Adam(lr=1e-4, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * 24, n_epochs=20, print_every=-1,
dev_data=data_info.datasets['dev'],
metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1)
trainer.train(load_best_model=True)

tester = Tester(
data=data_info.datasets['test'],
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * 12,
device=[i for i in range(torch.cuda.device_count())],
)
tester.test()



+ 65  - 0   reproduction/matching/matching_esim.py

@@ -0,0 +1,65 @@

import argparse
import torch

from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding

from reproduction.matching.data.MatchingDataLoader import SNLILoader
from reproduction.matching.model.esim import ESIMModel

argument = argparse.ArgumentParser()
argument.add_argument('--embedding', choices=['glove', 'elmo'], default='glove')
argument.add_argument('--batch-size-per-gpu', type=int, default=128)
argument.add_argument('--n-epochs', type=int, default=100)
argument.add_argument('--lr', type=float, default=1e-4)
argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='seq_len')
argument.add_argument('--save-dir', type=str, default=None)
arg = argument.parse_args()

bert_dirs = 'path/to/bert/dir'

# load data set
data_info = SNLILoader().process(
paths='path/to/snli/data/dir', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False,
)

# load embedding
if arg.embedding == 'elmo':
embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
elif arg.embedding == 'glove':
embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
else:
raise ValueError(f'now we only support elmo or glove embedding for esim model!')

# define model
model = ESIMModel(embedding)

# define trainer
trainer = Trainer(train_data=data_info.datasets['train'], model=model,
optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
n_epochs=arg.n_epochs, print_every=-1,
dev_data=data_info.datasets['dev'],
metrics=AccuracyMetric(), metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1,
save_path=arg.save_path)

# train model
trainer.train(load_best_model=True)

# define tester
tester = Tester(
data=data_info.datasets['test'],
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
device=[i for i in range(torch.cuda.device_count())],
)

# test model
tester.test()



+ 18  - 3   reproduction/matching/model/esim.py

@@ -30,24 +30,37 @@ class ESIMModel(BaseModel):
        self.bi_attention = SoftmaxAttention()

        self.rnn_high = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
-        # self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True)
+        # self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True,)

        self.classifier = nn.Sequential(nn.Dropout(p=dropout_rate),
                                        nn.Linear(8 * hidden_size, hidden_size),
                                        nn.Tanh(),
                                        nn.Dropout(p=dropout_rate),
                                        nn.Linear(hidden_size, num_labels))

+        self.dropout_rnn = nn.Dropout(p=dropout_rate)

        nn.init.xavier_uniform_(self.classifier[1].weight.data)
        nn.init.xavier_uniform_(self.classifier[4].weight.data)

    def forward(self, words1, words2, seq_len1, seq_len2, target=None):
-        mask1 = seq_len_to_mask(seq_len1)
-        mask2 = seq_len_to_mask(seq_len2)
+        """
+        :param words1: [batch, seq_len]
+        :param words2: [batch, seq_len]
+        :param seq_len1: [batch]
+        :param seq_len2: [batch]
+        :param target:
+        :return:
+        """
+        mask1 = seq_len_to_mask(seq_len1, words1.size(1))
+        mask2 = seq_len_to_mask(seq_len2, words2.size(1))
        a0 = self.embedding(words1)  # B * len * emb_dim
        b0 = self.embedding(words2)
        a0, b0 = self.dropout_embed(a0), self.dropout_embed(b0)
        a = self.rnn(a0, mask1.byte())  # a: [B, PL, 2 * H]
        b = self.rnn(b0, mask2.byte())
+        # a = self.dropout_rnn(self.rnn(a0, seq_len1)[0])  # a: [B, PL, 2 * H]
+        # b = self.dropout_rnn(self.rnn(b0, seq_len2)[0])

        ai, bi = self.bi_attention(a, mask1, b, mask2)

@@ -58,6 +71,8 @@ class ESIMModel(BaseModel):

        a_h = self.rnn_high(a_f, mask1.byte())  # ma: [B, PL, 2 * H]
        b_h = self.rnn_high(b_f, mask2.byte())
+        # a_h = self.dropout_rnn(self.rnn_high(a_f, seq_len1)[0])  # ma: [B, PL, 2 * H]
+        # b_h = self.dropout_rnn(self.rnn_high(b_f, seq_len2)[0])

        a_avg = self.mean_pooling(a_h, mask1, dim=1)
        a_max, _ = self.max_pooling(a_h, mask1, dim=1)


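The forward change above relies on passing the padded length explicitly; a quick check of that call pattern (assuming this fastNLP version's seq_len_to_mask accepts a max_len argument, as the call in the hunk does):

import torch
from fastNLP.core.utils import seq_len_to_mask

seq_len = torch.tensor([3, 5])
mask = seq_len_to_mask(seq_len, 6)   # pad every row out to the batch's padded length
print(mask.shape)                    # torch.Size([2, 6])
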
+ 1  - 1   setup.py

@@ -13,7 +13,7 @@ with open('requirements.txt', encoding='utf-8') as f:

 setup(
     name='FastNLP',
-    version='0.4.0',
+    version='dev0.5.0',
     description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
     long_description=readme,
     long_description_content_type='text/markdown',

