From 9f5c00b17d39b14945c967b53d936a32db5d713c Mon Sep 17 00:00:00 2001 From: yunfan Date: Thu, 6 Jun 2019 01:40:46 +0800 Subject: [PATCH 01/34] change Batch to torch.DataLoader --- fastNLP/core/batch.py | 74 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 109d4fe9..9aab146c 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -12,8 +12,10 @@ from queue import Empty, Full import numpy as np import torch import torch.multiprocessing as mp +import torch.utils.data from .sampler import RandomSampler +from .dataset import DataSet _python_is_exit = False @@ -25,8 +27,78 @@ def _set_python_is_exit(): atexit.register(_set_python_is_exit) +class DataSetGetter: + def __init__(self, dataset: DataSet, as_numpy=False): + self.dataset = dataset + self.inputs = {n: f for n, f in dataset.get_all_fields().items() if f.is_input} + self.targets = {n: f for n, f in dataset.get_all_fields().items() if f.is_target} + self.as_numpy = as_numpy + + def __getitem__(self, idx: int): + inputs = {n:f.get(idx) for n, f in self.inputs.items()} + targets = {n:f.get(idx) for n, f in self.targets.items()} + return idx, inputs, targets + + def __len__(self): + return len(self.dataset) + + def collate_fn(self, batch: list): + batch_x = {n:[] for n in self.inputs.keys()} + batch_y = {n:[] for n in self.targets.keys()} + indices = [] + for idx, x, y in batch: + indices.append(idx) + for n, v in x.items(): + batch_x[n].append(v) + for n, v in y.items(): + batch_y[n].append(v) + + def pad_batch(batch_dict, field_array): + for n, vlist in batch_dict.items(): + f = field_array[n] + if f.padder is None: + batch_dict[n] = np.array(vlist) + else: + data = f.padder(vlist, field_name=n, field_ele_dtype=f.dtype) + if not self.as_numpy: + data = _to_tensor(data, f.dtype) + batch_dict[n] = data + return batch_dict + + return (indices, + pad_batch(batch_x, self.inputs), + pad_batch(batch_y, self.targets)) + + +class Batch: + def __init__(self, dataset, batch_size, sampler=None, buffer_size=0, as_numpy=False, + num_workers=0, pin_memory=False, drop_last=False, + timeout=0, worker_init_fn=None, **kwargs): + + dataset_getter = DataSetGetter(dataset, as_numpy) + self.buffer_size = buffer_size + self.cur_batch_indices = None + self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0) + shuffle = isinstance(sampler, RandomSampler) + self.dataiter = torch.utils.data.DataLoader( + dataset=dataset_getter, batch_size=batch_size, shuffle=shuffle, + collate_fn=dataset_getter.collate_fn, + num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, + timeout=timeout, worker_init_fn=worker_init_fn) + + def __iter__(self): + for indices, batch_x, batch_y in self.dataiter: + self.cur_batch_indices = indices + yield batch_x, batch_y + + def get_batch_indices(self): + return self.cur_batch_indices + + def __len__(self): + return self.num_batches + -class Batch(object): +class Batch1(object): """ 别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch` From acf18e2e89c49567959b0da271bd89baf0ba440b Mon Sep 17 00:00:00 2001 From: yh_cc Date: Thu, 13 Jun 2019 21:25:47 +0800 Subject: [PATCH 02/34] =?UTF-8?q?=E4=BF=AE=E6=94=B9DataSet=20split?= =?UTF-8?q?=E7=9A=84=E4=B8=80=E4=B8=AA=E6=B3=A8=E9=87=8A=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/dataset.py | 2 +- fastNLP/modules/encoder/embedding.py | 14 ++++++++++---- 2 files 
changed, 11 insertions(+), 5 deletions(-) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index b011d15a..4cd1ad9c 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -805,7 +805,7 @@ class DataSet(object): """ 将DataSet按照ratio的比例拆分,返回两个DataSet - :param float ratio: 0int: + return len(self) + + def __len__(self): + return len(self.embed) + @property def embed_size(self) -> int: return self._embed_size @@ -109,9 +116,8 @@ class TokenEmbedding(nn.Module): for param in self.parameters(): param.requires_grad = value - @abstractmethod - def get_original_vocab(self): - pass + def __len__(self): + return len(self._word_vocab) @property def embed_size(self) -> int: @@ -505,7 +511,7 @@ class CNNCharEmbedding(TokenEmbedding): :param embed_size: 该word embedding的大小,默认值为50. :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. - :param kernels: kernel的大小. 默认值为[5, 3, 1]. + :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. :param min_char_freq: character的最少出现次数。默认值为2. From 7564818f4b1b14660322efca1fe7c90debbd5914 Mon Sep 17 00:00:00 2001 From: yunfan Date: Sat, 15 Jun 2019 12:12:57 +0800 Subject: [PATCH 03/34] [unstable] change Batch to torch's DataLoader --- fastNLP/__init__.py | 6 +- fastNLP/core/__init__.py | 2 +- fastNLP/core/batch.py | 259 ++++++++++----------------- fastNLP/core/field.py | 7 +- fastNLP/core/predictor.py | 5 +- fastNLP/core/tester.py | 14 +- fastNLP/core/trainer.py | 28 ++- fastNLP/modules/encoder/embedding.py | 4 +- test/core/test_batch.py | 20 +-- 9 files changed, 146 insertions(+), 199 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index c67e5919..e666f65f 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -12,7 +12,11 @@ fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的 __all__ = [ "Instance", "FieldArray", - "Batch", + + "DataSetIter", + "BatchIter", + "TorchLoaderIter", + "Vocabulary", "DataSet", "Const", diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index d6ab8983..792bff66 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -14,7 +14,7 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa 介绍core 的子模块的分工,好像必要性不大 """ -from .batch import Batch +from .batch import DataSetIter, BatchIter, TorchLoaderIter from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC from .const import Const from .dataset import DataSet diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index ce1a82f4..b23f81e2 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -3,7 +3,9 @@ batch 模块实现了 fastNLP 所需的 Batch 类。 """ __all__ = [ - "Batch" + "BatchIter", + "DataSetIter", + "TorchLoaderIter", ] import atexit @@ -15,7 +17,7 @@ import torch.multiprocessing as mp import torch.utils.data from numbers import Number -from .sampler import RandomSampler +from .sampler import SequentialSampler from .dataset import DataSet _python_is_exit = False @@ -28,14 +30,18 @@ def _set_python_is_exit(): atexit.register(_set_python_is_exit) + class DataSetGetter: def __init__(self, dataset: DataSet, as_numpy=False): self.dataset = dataset self.inputs = {n: f for n, f in dataset.get_all_fields().items() if f.is_input} self.targets = {n: f for n, f in dataset.get_all_fields().items() if f.is_target} self.as_numpy = as_numpy + self.idx_list = 
list(range(len(dataset))) def __getitem__(self, idx: int): + # mapping idx to sampled idx + idx = self.idx_list[idx] inputs = {n:f.get(idx) for n, f in self.inputs.items()} targets = {n:f.get(idx) for n, f in self.targets.items()} return idx, inputs, targets @@ -60,9 +66,9 @@ class DataSetGetter: if f.padder is None: batch_dict[n] = np.array(vlist) else: - data = f.padder(vlist, field_name=n, field_ele_dtype=f.dtype) + data = f.pad(vlist) if not self.as_numpy: - data = _to_tensor(data, f.dtype) + data, flag = _to_tensor(data, f.dtype) batch_dict[n] = data return batch_dict @@ -70,24 +76,40 @@ class DataSetGetter: pad_batch(batch_x, self.inputs), pad_batch(batch_y, self.targets)) + def set_idx_list(self, idx_list): + if len(idx_list) != len(self.idx_list): + raise ValueError + self.idx_list = idx_list -class Batch: - def __init__(self, dataset, batch_size, sampler=None, buffer_size=0, as_numpy=False, - num_workers=0, pin_memory=False, drop_last=False, - timeout=0, worker_init_fn=None, **kwargs): - dataset_getter = DataSetGetter(dataset, as_numpy) - self.buffer_size = buffer_size +class SamplerAdapter(torch.utils.data.Sampler): + def __init__(self, sampler, dataset): + self.sampler = sampler + self.dataset = dataset + + def __iter__(self): + return iter(self.sampler(self.dataset)) + + +class BatchIter: + def __init__(self): + self.dataiter = None + self.num_batches = None self.cur_batch_indices = None - self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0) - shuffle = isinstance(sampler, RandomSampler) - self.dataiter = torch.utils.data.DataLoader( - dataset=dataset_getter, batch_size=batch_size, shuffle=shuffle, - collate_fn=dataset_getter.collate_fn, - num_workers=num_workers, pin_memory=pin_memory, drop_last=drop_last, - timeout=timeout, worker_init_fn=worker_init_fn) + self.batch_size = None + + def init_iter(self): + pass + + @staticmethod + def get_num_batches(num_samples, batch_size, drop_last): + num_batches = num_samples // batch_size + if not drop_last and (num_samples % batch_size > 0): + num_batches += 1 + return num_batches def __iter__(self): + self.init_iter() for indices, batch_x, batch_y in self.dataiter: self.cur_batch_indices = indices yield batch_x, batch_y @@ -98,163 +120,62 @@ class Batch: def __len__(self): return self.num_batches + @property + def dataset(self): + return self.dataiter.dataset -class Batch1(object): - """ - 别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch` - - Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出, - 组成 `x` 和 `y`:: - - batch = Batch(data_set, batch_size=16, sampler=SequentialSampler()) - num_batch = len(batch) - for batch_x, batch_y in batch: - # do stuff ... - - :param dataset: :class:`~fastNLP.DataSet` 对象, 数据集 - :param int batch_size: 取出的batch大小 - :param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.RandomSampler`. - - Default: ``None`` - :param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`. - - Default: ``False`` - :param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch. 
- - Default: ``False`` - """ - - def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False): - self.dataset = dataset + +class DataSetIter(BatchIter): + def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False, + num_workers=0, pin_memory=False, drop_last=False, + timeout=0, worker_init_fn=None): + super().__init__() + assert isinstance(dataset, DataSet) + dataset = DataSetGetter(dataset, as_numpy) + collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None + sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset) + self.dataiter = torch.utils.data.DataLoader( + dataset=dataset, batch_size=batch_size, sampler=sampler, + collate_fn=collate_fn, num_workers=num_workers, + pin_memory=pin_memory, drop_last=drop_last, + timeout=timeout, worker_init_fn=worker_init_fn) + self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last) self.batch_size = batch_size - if sampler is None: - sampler = RandomSampler() - self.sampler = sampler - self.as_numpy = as_numpy - self.idx_list = None - self.curidx = 0 - self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0) - self.cur_batch_indices = None - self.prefetch = prefetch - self.lengths = 0 - - def fetch_one(self): - if self.curidx >= len(self.idx_list): - return None - else: - endidx = min(self.curidx + self.batch_size, len(self.idx_list)) - batch_x, batch_y = {}, {} - - indices = self.idx_list[self.curidx:endidx] - self.cur_batch_indices = indices - - for field_name, field in self.dataset.get_all_fields().items(): - if field.is_target or field.is_input: - batch = field.get(indices) - if not self.as_numpy and \ - field.dtype is not None and \ - issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor): - batch = _to_tensor(batch) - if field.is_target: - batch_y[field_name] = batch - if field.is_input: - batch_x[field_name] = batch - - self.curidx = endidx - return batch_x, batch_y - - def __iter__(self): - """ - Iterate on dataset, fetch batch data. 
Fetch process don't block the iterate process - :return: - """ - if self.prefetch: - return self._run_batch_iter(self) - - def batch_iter(): - self.init_iter() - while 1: - res = self.fetch_one() - if res is None: - break - yield res - - return batch_iter() - - def init_iter(self): - self.idx_list = self.sampler(self.dataset) - self.curidx = 0 - self.lengths = self.dataset.get_length() - - def __len__(self): - return self.num_batches - - def get_batch_indices(self): - """ - 取得当前batch在DataSet中所在的index下标序列 - :return list(int) indexes: 下标序列 - """ - return self.cur_batch_indices - - @staticmethod - def _run_fetch(batch, q): - try: - global _python_is_exit - batch.init_iter() - # print('start fetch') - while 1: - res = batch.fetch_one() - # print('fetch one') - while 1: - try: - q.put(res, timeout=3) - break - except Full: - if _python_is_exit: - return - if res is None: - # print('fetch done, waiting processing') - break - # print('fetch exit') - except Exception as e: - q.put(e) - finally: - q.join() - - @staticmethod - def _run_batch_iter(batch): - q = mp.JoinableQueue(maxsize=10) - fetch_p = mp.Process(target=Batch._run_fetch, args=(batch, q)) - fetch_p.daemon = True - fetch_p.start() - # print('fork fetch process') - while 1: - try: - res = q.get(timeout=1) - q.task_done() - # print('get fetched') - if res is None: - break - elif isinstance(res, Exception): - raise res - yield res - except Empty as e: - if fetch_p.is_alive(): - continue - else: - break - fetch_p.terminate() - fetch_p.join() - # print('iter done') + +class TorchLoaderIter(BatchIter): + def __init__(self, dataset): + super().__init__() + assert isinstance(dataset, torch.utils.data.DataLoader) + self.dataiter = dataset + self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last) + self.batch_size = dataset.batch_size + + +class OnlineDataGettter: + # TODO + pass -def _to_tensor(batch): +class OnlineDataIter(BatchIter): + # TODO + def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False, + num_workers=0, pin_memory=False, drop_last=False, + timeout=0, worker_init_fn=None, **kwargs): + super().__init__() + + +def _to_tensor(batch, field_dtype): try: - if issubclass(batch.dtype.type, np.floating): - batch = torch.as_tensor(batch).float() # 默认使用float32 + if field_dtype is not None \ + and issubclass(field_dtype, Number) \ + and not isinstance(batch, torch.Tensor): + if issubclass(batch.dtype.type, np.floating): + new_batch = torch.as_tensor(batch).float() # 默认使用float32 + else: + new_batch = torch.as_tensor(batch) # 复用内存地址,避免复制 + return new_batch, True else: - batch = torch.as_tensor(batch) # 复用内存地址,避免复制 + return batch, False except: - pass - return batch + return batch, False diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index faa306f3..a8836b5a 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -176,7 +176,12 @@ class FieldArray: if self.padder is None or pad is False: return np.array(contents) else: - return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) + return self.pad(contents) + + def pad(self, contents): + if self.padder is None: + raise RuntimeError + return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim) def set_padder(self, padder): """ diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 4f37e105..06e586c6 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -6,7 +6,7 @@ from collections import 
defaultdict import torch -from . import Batch +from . import DataSetIter from . import DataSet from . import SequentialSampler from .utils import _build_args @@ -44,8 +44,7 @@ class Predictor(object): self.network.eval() batch_output = defaultdict(list) - data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False, - prefetch=False) + data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) if hasattr(self.network, "predict"): predict_func = self.network.predict diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 883e0d01..398afe6b 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -37,7 +37,7 @@ import warnings import torch import torch.nn as nn -from .batch import Batch +from .batch import BatchIter, DataSetIter from .dataset import DataSet from .metrics import _prepare_metrics from .sampler import SequentialSampler @@ -82,7 +82,7 @@ class Tester(object): :param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。 """ - def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1): + def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1): super(Tester, self).__init__() if not isinstance(data, DataSet): @@ -96,6 +96,14 @@ class Tester(object): self._model = _move_model_to_device(model, device=device) self.batch_size = batch_size self.verbose = verbose + + if isinstance(data, DataSet): + self.data_iterator = DataSetIter( + dataset=data, batch_size=batch_size, num_workers=num_workers) + elif isinstance(data, BatchIter): + self.data_iterator = data + else: + raise TypeError("data type {} not support".format(type(data))) # 如果是DataParallel将没有办法使用predict方法 if isinstance(self._model, nn.DataParallel): @@ -124,7 +132,7 @@ class Tester(object): self._model_device = _get_model_device(self._model) network = self._model self._mode(network, is_test=True) - data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False) + data_iterator = self.data_iterator eval_results = {} try: with torch.no_grad(): diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index d7694e00..a882dbeb 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -312,7 +312,7 @@ try: except: from .utils import _pseudo_tqdm as tqdm -from .batch import Batch +from .batch import DataSetIter, BatchIter from .callback import CallbackManager, CallbackException from .dataset import DataSet from .losses import _prepare_losser @@ -394,7 +394,7 @@ class Trainer(object): """ def __init__(self, train_data, model, optimizer=None, loss=None, - batch_size=32, sampler=None, update_every=1, + batch_size=32, sampler=None, update_every=1, num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, validate_every=-1, save_path=None, @@ -439,9 +439,19 @@ class Trainer(object): # sampler check if sampler is not None and not isinstance(sampler, Sampler): raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) + + if isinstance(train_data, DataSet): + self.data_iterator = DataSetIter( + dataset=train_data, batch_size=batch_size, num_workers=num_workers) + elif isinstance(train_data, BatchIter): + self.data_iterator = train_data + else: + raise TypeError("train_data type {} not support".format(type(train_data))) - if check_code_level > -1: - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, + if check_code_level > -1 and 
isinstance(self.data_iterator, DataSetIter): + # TODO 考虑不同的dataset类型怎么check + _check_code(data_iterator=self.data_iterator, + model=model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level, batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 @@ -493,7 +503,7 @@ class Trainer(object): self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks) - + def train(self, load_best_model=True, on_exception='auto'): """ 使用该函数使Trainer开始训练。 @@ -572,8 +582,7 @@ class Trainer(object): with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar avg_loss = 0 - data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False, - prefetch=self.prefetch) + data_iterator = self.data_iterator self.batch_per_epoch = data_iterator.num_batches for epoch in range(1, self.n_epochs + 1): self.epoch = epoch @@ -786,13 +795,14 @@ def _get_value_info(_dict): return strs -def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, +def _check_code(data_iterator, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None, metric_key=None, check_level=0): # check get_loss 方法 model_devcie = model.parameters().__next__().device - batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler()) + batch = data_iterator + dataset = data_iterator.dataset for batch_count, (batch_x, batch_y) in enumerate(batch): _move_dict_value_to_device(batch_x, batch_y, device=model_devcie) # forward check diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index e54c1980..6e7406b2 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -15,7 +15,7 @@ from ...io.file_utils import cached_path, _get_base_url from ._bert import _WordBertModel from typing import List -from ... import DataSet, Batch, SequentialSampler +from ... 
import DataSet, DataSetIter, SequentialSampler from ...core.utils import _move_model_to_device, _get_model_device @@ -226,7 +226,7 @@ class ContextualEmbedding(TokenEmbedding): with torch.no_grad(): for index, dataset in enumerate(datasets): try: - batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), prefetch=False) + batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_x, batch_y in batch: words = batch_x['words'].to(device) words_list = words.tolist() diff --git a/test/core/test_batch.py b/test/core/test_batch.py index d1f93b9c..aa9808ee 100644 --- a/test/core/test_batch.py +++ b/test/core/test_batch.py @@ -3,7 +3,7 @@ import unittest import numpy as np import torch -from fastNLP import Batch +from fastNLP import DataSetIter from fastNLP import DataSet from fastNLP import Instance from fastNLP import SequentialSampler @@ -57,7 +57,7 @@ class TestCase1(unittest.TestCase): dataset = construct_dataset( [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)]) dataset.set_target() - batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True) + batch = DataSetIter(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True) cnt = 0 for _, _ in batch: @@ -68,7 +68,7 @@ class TestCase1(unittest.TestCase): ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40}) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) for x, y in iter: self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray)) self.assertEqual(len(x["x"]), 4) @@ -81,7 +81,7 @@ class TestCase1(unittest.TestCase): "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10}) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) for x, y in iter: self.assertEqual(x["x"].shape, (4, 4)) self.assertEqual(y["y"].shape, (4, 4)) @@ -91,7 +91,7 @@ class TestCase1(unittest.TestCase): "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)}) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True) for x, y in iter: self.assertEqual(x["x"].shape, (4, 4)) self.assertEqual(y["y"].shape, (4, 4)) @@ -101,7 +101,7 @@ class TestCase1(unittest.TestCase): "y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10}) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) for x, y in iter: self.assertTrue(isinstance(x["x"], torch.Tensor)) self.assertEqual(tuple(x["x"].shape), (4, 4)) @@ -113,7 +113,7 @@ class TestCase1(unittest.TestCase): "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)}) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) for x, y in iter: self.assertTrue(isinstance(x["x"], torch.Tensor)) self.assertEqual(tuple(x["x"].shape), (4, 4)) @@ -125,7 +125,7 @@ class TestCase1(unittest.TestCase): [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)]) ds.set_input("x") ds.set_target("y") - 
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) for x, y in iter: self.assertTrue(isinstance(x["x"], torch.Tensor)) self.assertEqual(tuple(x["x"].shape), (4, 4)) @@ -137,7 +137,7 @@ class TestCase1(unittest.TestCase): [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)]) ds.set_input("x") ds.set_target("y") - iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) + iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False) for x, y in iter: print(x, y) @@ -146,7 +146,7 @@ class TestCase1(unittest.TestCase): num_samples = 1000 dataset = generate_fake_dataset(num_samples) - batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler()) + batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) for batch_x, batch_y in batch: pass From 17b5fd0066e525469faff3eeb65044c8e3b40fc7 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 15 Jun 2019 13:10:28 +0800 Subject: [PATCH 04/34] =?UTF-8?q?1.=20=E5=88=A0=E9=99=A4Trainer=E4=B8=AD?= =?UTF-8?q?=E5=AF=B9train=5Fdata=E5=BF=85=E9=A1=BB=E4=B8=BADataSet?= =?UTF-8?q?=E7=9A=84assert=202.=20=E5=88=A0=E9=99=A4Trainer=E7=9A=84prefet?= =?UTF-8?q?ch=E5=8F=82=E6=95=B0;=20=E5=9C=A8=E6=B3=A8=E9=87=8A=E4=B8=AD?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0num=5Fworkers=E5=8F=82=E6=95=B0=203.=20Traine?= =?UTF-8?q?r=E4=B8=AD=E9=BB=98=E8=AE=A4sampler=E4=B8=BARandomSampler?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 22 ++-- reproduction/Biaffine_parser/run.py | 7 +- reproduction/POS_tagging/train_pos_tag.py | 10 +- reproduction/Star_transformer/train.py | 12 +-- reproduction/matching/snli.py | 17 +-- .../cws/train_shift_relay.py | 10 +- test/core/test_callbacks.py | 100 +++++------------- test/core/test_trainer.py | 59 ++--------- test/models/model_runner.py | 7 +- test/test_tutorials.py | 30 ++---- 10 files changed, 77 insertions(+), 197 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a882dbeb..3b989ec0 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -351,6 +351,8 @@ class Trainer(object): :param int batch_size: 训练和验证的时候的batch大小。 :param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward` :param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler` + :param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch + :param num_workers: int, 有多少个线程来进行数据pad处理。 :param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128 会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。 :param int n_epochs: 需要优化迭代多少次。 @@ -367,7 +369,6 @@ class Trainer(object): :param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。 :param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。 保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。 - :param prefetch: bool, 是否使用额外的进程对产生batch数据。理论上会使得Batch迭代更快。 :param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。 :param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型 的计算位置进行管理。支持以下的输入: @@ -394,16 +395,12 @@ class Trainer(object): """ def __init__(self, train_data, model, optimizer=None, loss=None, - batch_size=32, sampler=None, update_every=1, num_workers=0, - n_epochs=10, print_every=5, + batch_size=32, sampler=None, 
drop_last=False,update_every=1, + num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, - validate_every=-1, save_path=None, - prefetch=False, use_tqdm=True, device=None, - callbacks=None, - check_code_level=0): + validate_every=-1, save_path=None, use_tqdm=True, device=None, + callbacks=None, check_code_level=0): super(Trainer, self).__init__() - if not isinstance(train_data, DataSet): - raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.") if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") @@ -440,9 +437,12 @@ class Trainer(object): if sampler is not None and not isinstance(sampler, Sampler): raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler))) + if sampler is None: + sampler = RandomSampler() + if isinstance(train_data, DataSet): self.data_iterator = DataSetIter( - dataset=train_data, batch_size=batch_size, num_workers=num_workers) + dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last) elif isinstance(train_data, BatchIter): self.data_iterator = train_data else: @@ -470,8 +470,6 @@ class Trainer(object): self.best_dev_epoch = None self.best_dev_step = None self.best_dev_perf = None - self.sampler = sampler if sampler is not None else RandomSampler() - self.prefetch = prefetch self.n_steps = (len(self.train_data) // self.batch_size + int( len(self.train_data) % self.batch_size != 0)) * self.n_epochs diff --git a/reproduction/Biaffine_parser/run.py b/reproduction/Biaffine_parser/run.py index a69d3d58..13c79b83 100644 --- a/reproduction/Biaffine_parser/run.py +++ b/reproduction/Biaffine_parser/run.py @@ -184,11 +184,8 @@ def train(path): m.weight.requires_grad = True # Trainer - trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data, - loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS', - **train_args.data, - optimizer=fastNLP.Adam(**optim_args.data), - save_path=path, + trainer = Trainer(train_data=train_data, model=model, optimizer=fastNLP.Adam(**optim_args.data), loss=ParserLoss(), + dev_data=dev_data, metrics=ParserMetric(), metric_key='UAS', save_path=path, callbacks=[MyCallback()]) # Start training diff --git a/reproduction/POS_tagging/train_pos_tag.py b/reproduction/POS_tagging/train_pos_tag.py index ccf7aa1e..a71531a4 100644 --- a/reproduction/POS_tagging/train_pos_tag.py +++ b/reproduction/POS_tagging/train_pos_tag.py @@ -89,11 +89,11 @@ def train(train_data_path, dev_data_path, checkpoint=None, save=None): model = torch.load(checkpoint) # call trainer to train - trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict", - target="truth", - seq_lens="word_seq_origin_len"), - dev_data=dev_data, metric_key="f", - use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save) + trainer = Trainer(dataset, model, loss=None, n_epochs=20, print_every=10, dev_data=dev_data, + metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict", + target="truth", + seq_lens="word_seq_origin_len"), metric_key="f", save_path=save, + use_tqdm=True) trainer.train(load_best_model=True) # save model & pipeline diff --git a/reproduction/Star_transformer/train.py b/reproduction/Star_transformer/train.py index dee85c38..6fb58daf 100644 --- a/reproduction/Star_transformer/train.py +++ b/reproduction/Star_transformer/train.py @@ -149,14 +149,10 @@ def train(): ) if x.requires_grad and x.size(0) 
!= len(word_v)] optim_cfg = [{'params': model.enc.embedding.parameters(), 'lr': g_args.lr*0.1}, {'params': ex_param, 'lr': g_args.lr, 'weight_decay': g_args.w_decay}, ] - trainer = FN.Trainer(model=model, train_data=train_data, dev_data=dev_data, - loss=loss, metrics=metric, metric_key=metric_key, - optimizer=torch.optim.Adam(optim_cfg), - n_epochs=g_args.ep, batch_size=g_args.bsz, print_every=10, validate_every=3000, - device=device, - use_tqdm=False, prefetch=False, - save_path=g_args.log, - callbacks=[MyCallback()]) + trainer = FN.Trainer(train_data=train_data, model=model, optimizer=torch.optim.Adam(optim_cfg), loss=loss, + batch_size=g_args.bsz, n_epochs=g_args.ep, print_every=10, dev_data=dev_data, metrics=metric, + metric_key=metric_key, validate_every=3000, save_path=g_args.log, use_tqdm=False, + device=device, callbacks=[MyCallback()]) trainer.train() tester = FN.Tester(data=test_data, model=model, metrics=metric, diff --git a/reproduction/matching/snli.py b/reproduction/matching/snli.py index b389aa11..d7f392bd 100644 --- a/reproduction/matching/snli.py +++ b/reproduction/matching/snli.py @@ -70,19 +70,10 @@ test_data = preprocess_data(test_data, bert_dirs) model = BertForNLI(bert_dir=bert_dirs) -trainer = Trainer( - train_data=train_data, - model=model, - optimizer=Adam(lr=2e-5, model_params=model.parameters()), - batch_size=torch.cuda.device_count() * 12, - n_epochs=4, - print_every=-1, - dev_data=dev_data, - metrics=AccuracyMetric(), - metric_key='acc', - device=[i for i in range(torch.cuda.device_count())], - check_code_level=-1 -) +trainer = Trainer(train_data=train_data, model=model, optimizer=Adam(lr=2e-5, model_params=model.parameters()), + batch_size=torch.cuda.device_count() * 12, n_epochs=4, print_every=-1, dev_data=dev_data, + metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], + check_code_level=-1) trainer.train(load_best_model=True) tester = Tester( diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py index 805521e7..55576575 100644 --- a/reproduction/seqence_labelling/cws/train_shift_relay.py +++ b/reproduction/seqence_labelling/cws/train_shift_relay.py @@ -57,12 +57,8 @@ callbacks = [clipper] # if pretrain: # fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until) # callbacks.append(fixer) -trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, - batch_size=32, sampler=sampler, update_every=5, - n_epochs=3, print_every=5, - dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f', - validate_every=-1, save_path=None, - prefetch=True, use_tqdm=True, device=device, - callbacks=callbacks, +trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler, + update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(), + metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks, check_code_level=0) trainer.train() \ No newline at end of file diff --git a/test/core/test_callbacks.py b/test/core/test_callbacks.py index 71a5565d..909295c0 100644 --- a/test/core/test_callbacks.py +++ b/test/core/test_callbacks.py @@ -40,89 +40,50 @@ class TestCallback(unittest.TestCase): def test_gradient_clip(self): data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=20, - batch_size=32, - print_every=50, 
- optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=False, - dev_data=data_set, - metrics=AccuracyMetric(pred="predict", target="y"), - callbacks=[GradientClipCallback(model.parameters(), clip_value=2)]) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=20, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=[GradientClipCallback(model.parameters(), clip_value=2)], check_code_level=2) trainer.train() def test_early_stop(self): data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=20, - batch_size=32, - print_every=50, - optimizer=SGD(lr=0.01), - check_code_level=2, - use_tqdm=False, - dev_data=data_set, - metrics=AccuracyMetric(pred="predict", target="y"), - callbacks=[EarlyStopCallback(5)]) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.01), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=20, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=[EarlyStopCallback(5)], check_code_level=2) trainer.train() def test_lr_scheduler(self): data_set, model = prepare_env() optimizer = torch.optim.SGD(model.parameters(), lr=0.01) - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=5, - batch_size=32, - print_every=50, - optimizer=optimizer, - check_code_level=2, - use_tqdm=False, - dev_data=data_set, - metrics=AccuracyMetric(pred="predict", target="y"), - callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))]) + trainer = Trainer(data_set, model, optimizer=optimizer, loss=BCELoss(pred="predict", target="y"), batch_size=32, + n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))], + check_code_level=2) trainer.train() def test_KeyBoardInterrupt(self): data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=5, - batch_size=32, - print_every=50, - optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=False, - callbacks=[ControlC(False)]) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, use_tqdm=False, callbacks=[ControlC(False)], + check_code_level=2) trainer.train() def test_LRFinder(self): data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=5, - batch_size=32, - print_every=50, - optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=False, - callbacks=[LRFinder(len(data_set) // 32)]) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, use_tqdm=False, + callbacks=[LRFinder(len(data_set) // 32)], check_code_level=2) trainer.train() def test_TensorboardCallback(self): data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=5, - batch_size=32, - print_every=50, - optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=False, - dev_data=data_set, - metrics=AccuracyMetric(pred="predict", target="y"), - callbacks=[TensorboardCallback("loss", "metric")]) + trainer = 
Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=5, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, + callbacks=[TensorboardCallback("loss", "metric")], check_code_level=2) trainer.train() def test_readonly_property(self): @@ -141,16 +102,9 @@ class TestCallback(unittest.TestCase): print(self.optimizer) data_set, model = prepare_env() - trainer = Trainer(data_set, model, - loss=BCELoss(pred="predict", target="y"), - n_epochs=total_epochs, - batch_size=32, - print_every=50, - optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=False, - dev_data=data_set, - metrics=AccuracyMetric(pred="predict", target="y"), - callbacks=[MyCallback()]) + trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=total_epochs, print_every=50, dev_data=data_set, + metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, callbacks=[MyCallback()], + check_code_level=2) trainer.train() assert passed_epochs == list(range(1, total_epochs + 1)) diff --git a/test/core/test_trainer.py b/test/core/test_trainer.py index f559eac5..dc1a531a 100644 --- a/test/core/test_trainer.py +++ b/test/core/test_trainer.py @@ -46,18 +46,10 @@ class TrainerTestGround(unittest.TestCase): model = NaiveClassifier(2, 1) - trainer = Trainer(train_set, model, - loss=BCELoss(pred="predict", target="y"), - metrics=AccuracyMetric(pred="predict", target="y"), - n_epochs=10, - batch_size=32, - print_every=50, - validate_every=-1, - dev_data=dev_set, - optimizer=SGD(lr=0.1), - check_code_level=2, - use_tqdm=True, - save_path=None) + trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"), + batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set, + metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, + use_tqdm=True, check_code_level=2) trainer.train() """ # 应该正确运行 @@ -83,10 +75,7 @@ class TrainerTestGround(unittest.TestCase): model = Model() with self.assertRaises(RuntimeError): - trainer = Trainer( - train_data=dataset, - model=model - ) + trainer = Trainer(train_data=dataset, model=model) """ # 应该获取到的报错提示 NameError: @@ -116,12 +105,7 @@ class TrainerTestGround(unittest.TestCase): return {'loss': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) + trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False) trainer.train() """ # 应该正确运行 @@ -147,12 +131,7 @@ class TrainerTestGround(unittest.TestCase): model = Model() with self.assertRaises(NameError): - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) + trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False) trainer.train() def test_trainer_suggestion4(self): @@ -175,12 +154,7 @@ class TrainerTestGround(unittest.TestCase): model = Model() with self.assertRaises(NameError): - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) + trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False) def test_trainer_suggestion5(self): # 检查报错提示能否正确提醒用户 @@ -203,12 +177,7 @@ class TrainerTestGround(unittest.TestCase): return {'loss': loss} model = Model() - trainer = Trainer( - train_data=dataset, - model=model, - use_tqdm=False, - print_every=2 - ) + trainer = Trainer(train_data=dataset, model=model, 
print_every=2, use_tqdm=False) def test_trainer_suggestion6(self): # 检查报错提示能否正确提醒用户 @@ -233,14 +202,8 @@ class TrainerTestGround(unittest.TestCase): model = Model() with self.assertRaises(NameError): - trainer = Trainer( - train_data=dataset, - model=model, - dev_data=dataset, - loss=CrossEntropyLoss(), - metrics=AccuracyMetric(), - use_tqdm=False, - print_every=2) + trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(), print_every=2, dev_data=dataset, + metrics=AccuracyMetric(), use_tqdm=False) """ def test_trainer_multiprocess(self): diff --git a/test/models/model_runner.py b/test/models/model_runner.py index 405aa7d6..ae589470 100644 --- a/test/models/model_runner.py +++ b/test/models/model_runner.py @@ -130,11 +130,8 @@ class ModelRunner(): tester = Tester(data=data, model=model, metrics=metrics, batch_size=BATCH_SIZE, verbose=0) before_train = tester.test() - trainer = Trainer(model=model, train_data=data, dev_data=None, - n_epochs=N_EPOCHS, batch_size=BATCH_SIZE, - loss=loss, - save_path=None, - use_tqdm=False) + trainer = Trainer(train_data=data, model=model, loss=loss, batch_size=BATCH_SIZE, n_epochs=N_EPOCHS, + dev_data=None, save_path=None, use_tqdm=False) trainer.train(load_best_model=False) after_train = tester.test() for metric_name, v1 in before_train.items(): diff --git a/test/test_tutorials.py b/test/test_tutorials.py index a38d5ae1..428d584d 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -60,10 +60,10 @@ class TestTutorial(unittest.TestCase): print(test_data[0]) # 如果你们需要做强化学习或者GAN之类的项目,你们也可以使用这些数据预处理的工具 - from fastNLP.core.batch import Batch + from fastNLP.core.batch import DataSetIter from fastNLP.core.sampler import RandomSampler - batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler()) + batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler()) for batch_x, batch_y in batch_iterator: print("batch_x has: ", batch_x) print("batch_y has: ", batch_y) @@ -85,21 +85,14 @@ class TestTutorial(unittest.TestCase): # 实例化Trainer,传入模型和数据,进行训练 # 先在test_data拟合(确保模型的实现是正确的) copy_model = deepcopy(model) - overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data, - loss=loss, - metrics=metric, - save_path=None, - batch_size=32, - n_epochs=5) + overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5, + dev_data=test_data, metrics=metric, save_path=None) overfit_trainer.train() # 用train_data训练,在test_data验证 - trainer = Trainer(model=model, train_data=train_data, dev_data=test_data, - loss=CrossEntropyLoss(pred="output", target="label_seq"), - metrics=AccuracyMetric(pred="predict", target="label_seq"), - save_path=None, - batch_size=32, - n_epochs=5) + trainer = Trainer(train_data=train_data, model=model, loss=CrossEntropyLoss(pred="output", target="label_seq"), + batch_size=32, n_epochs=5, dev_data=test_data, + metrics=AccuracyMetric(pred="predict", target="label_seq"), save_path=None) trainer.train() print('Train finished!') @@ -147,13 +140,8 @@ class TestTutorial(unittest.TestCase): from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam - trainer = Trainer(model=model, - train_data=train_data, - dev_data=dev_data, - loss=CrossEntropyLoss(), - optimizer= Adam(), - metrics=AccuracyMetric(target='target') - ) + trainer = Trainer(train_data=train_data, model=model, optimizer=Adam(), loss=CrossEntropyLoss(), + dev_data=dev_data, metrics=AccuracyMetric(target='target')) trainer.train() print('Train finished!') 
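For reference, a minimal usage sketch of the DataSetIter interface introduced in PATCH 01-03 above (this note is not part of the patch series itself; it assumes fastNLP at this revision plus PyTorch are installed, and uses a toy two-field DataSet modelled on test/core/test_batch.py):

    # Illustrative only: mirrors the updated tests in test/core/test_batch.py.
    from fastNLP import DataSet, DataSetIter, SequentialSampler

    ds = DataSet({"x": [[1, 2, 3, 4]] * 8, "y": [[5, 6]] * 8})
    ds.set_input("x")
    ds.set_target("y")

    # DataSetIter wraps torch.utils.data.DataLoader around a DataSetGetter, so each
    # batch is padded by the fields' padders and converted to tensors unless as_numpy=True.
    batch_iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for batch_x, batch_y in batch_iter:
        print(batch_x["x"].shape, batch_y["y"].shape)  # torch.Size([4, 4]) torch.Size([4, 2])

    # Indices of the most recently yielded batch (here the second one).
    print(batch_iter.get_batch_indices())

    # Trainer and Tester (PATCH 03-05) now build such an iterator internally when given a
    # DataSet, accept a BatchIter directly, and take num_workers; the old prefetch flag is
    # deprecated in favour of num_workers.
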
From 4b5113cbeaf7e5af9db9e2776048994d865eff54 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sat, 15 Jun 2019 14:17:48 +0800 Subject: [PATCH 05/34] =?UTF-8?q?prefecth=E5=8F=98=E6=9B=B4=E4=B8=BAdeprec?= =?UTF-8?q?ated=20warning;?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/trainer.py | 11 ++++++++--- reproduction/utils.py | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 3b989ec0..41f760e3 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -311,6 +311,7 @@ try: from tqdm.auto import tqdm except: from .utils import _pseudo_tqdm as tqdm +import warnings from .batch import DataSetIter, BatchIter from .callback import CallbackManager, CallbackException @@ -320,7 +321,6 @@ from .metrics import _prepare_metrics from .optimizer import Optimizer from .sampler import Sampler from .sampler import RandomSampler -from .sampler import SequentialSampler from .tester import Tester from .utils import _CheckError from .utils import _build_args @@ -395,11 +395,16 @@ class Trainer(object): """ def __init__(self, train_data, model, optimizer=None, loss=None, - batch_size=32, sampler=None, drop_last=False,update_every=1, + batch_size=32, sampler=None, drop_last=False, update_every=1, num_workers=0, n_epochs=10, print_every=5, dev_data=None, metrics=None, metric_key=None, - validate_every=-1, save_path=None, use_tqdm=True, device=None, + validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False, callbacks=None, check_code_level=0): + if prefetch and num_workers==0: + num_workers = 1 + if prefetch: + warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.") + super(Trainer, self).__init__() if not isinstance(model, nn.Module): raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.") diff --git a/reproduction/utils.py b/reproduction/utils.py index 58883b43..bbfed4dd 100644 --- a/reproduction/utils.py +++ b/reproduction/utils.py @@ -13,7 +13,8 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: } 如果paths为不合法的,将直接进行raise相应的错误 - :param paths: 路径 + :param paths: 路径. 可以为一个文件路径(则认为该文件就是train的文件); 可以为一个文件目录,将在该目录下寻找train.txt, + test.txt, dev.txt; 可以为一个dict, 则key是用户自定义的某个文件的名称,value是这个文件的路径。 :return: """ if isinstance(paths, str): From 30b012ac201614e240c152207bae80fb62a29425 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 17 Jun 2019 13:27:52 +0800 Subject: [PATCH 06/34] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dmetric=E5=92=8Closs?= =?UTF-8?q?=E5=9C=A8=E6=98=A0=E5=B0=84=E6=97=B6=E5=87=BA=E7=8E=B0=E9=87=8D?= =?UTF-8?q?=E5=A4=8D=E5=90=8C=E5=90=8D=E8=BE=93=E5=85=A5=E6=97=B6=E4=BC=9A?= =?UTF-8?q?=E8=A6=86=E7=9B=96=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 27 +++++++-------------- fastNLP/core/metrics.py | 52 ++++++++++++++++++++++++++++++----------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 9dc02f3d..8b17f75a 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -128,29 +128,21 @@ class LossBase(object): self.param_map[arg] = arg # This param does not need mapping. self._evaluate_args = func_args self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()} - - # need to wrap inputs in dict. 
+ mapped_pred_dict = {} mapped_target_dict = {} - duplicated = [] - for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())): - not_duplicate_flag = 0 - if input_arg in self._reverse_param_map: - mapped_arg = self._reverse_param_map[input_arg] - not_duplicate_flag += 1 - else: - mapped_arg = input_arg + for input_arg, mapped_arg in self._reverse_param_map.items(): if input_arg in pred_dict: mapped_pred_dict[mapped_arg] = pred_dict[input_arg] - not_duplicate_flag += 1 if input_arg in target_dict: mapped_target_dict[mapped_arg] = target_dict[input_arg] - not_duplicate_flag += 1 - if not_duplicate_flag == 3: - duplicated.append(input_arg) # missing if not self._checked: + duplicated = [] + for input_arg, mapped_arg in self._reverse_param_map.items(): + if input_arg in pred_dict and input_arg in target_dict: + duplicated.append(input_arg) check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict]) # replace missing. missing = check_res.missing @@ -204,15 +196,12 @@ class LossFunc(LossBase): super(LossFunc, self).__init__() _check_function_or_method(func) + self.get_loss = func if key_map is not None: if not isinstance(key_map, dict): raise RuntimeError(f"Loss error: key_map except a {type({})} but got a {type(key_map)}") - self.param_map = key_map - if len(kwargs) > 0: - for key, val in kwargs.items(): - self.param_map.update({key: val}) + self._init_param_map(key_map, **kwargs) - self.get_loss = func class CrossEntropyLoss(LossBase): diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index 19c33c86..37a94a08 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -223,25 +223,18 @@ class MetricBase(object): # need to wrap inputs in dict. mapped_pred_dict = {} mapped_target_dict = {} - duplicated = [] - for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())): - not_duplicate_flag = 0 - if input_arg in self._reverse_param_map: - mapped_arg = self._reverse_param_map[input_arg] - not_duplicate_flag += 1 - else: - mapped_arg = input_arg + for input_arg, mapped_arg in self._reverse_param_map.items(): if input_arg in pred_dict: mapped_pred_dict[mapped_arg] = pred_dict[input_arg] - not_duplicate_flag += 1 if input_arg in target_dict: mapped_target_dict[mapped_arg] = target_dict[input_arg] - not_duplicate_flag += 1 - if not_duplicate_flag == 3: - duplicated.append(input_arg) # missing if not self._checked: + duplicated = [] + for input_arg, mapped_arg in self._reverse_param_map.items(): + if input_arg in pred_dict and input_arg in target_dict: + duplicated.append(input_arg) check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict]) # only check missing. # replace missing. @@ -411,6 +404,37 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None): ] +def _bioes_tag_to_spans(tags, ignore_labels=None): + """ + 给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'E-singer', 'O', 'O']。 + 返回[('singer', (1, 4))] (左闭右开区间) + + :param tags: List[str], + :param ignore_labels: List[str], 在该list中的label将被忽略 + :return: List[Tuple[str, List[int, int]]]. 
[(label,[start, end])] + """ + ignore_labels = set(ignore_labels) if ignore_labels else set() + + spans = [] + prev_bmes_tag = None + for idx, tag in enumerate(tags): + tag = tag.lower() + bmes_tag, label = tag[:1], tag[2:] + if bmes_tag in ('b', 's'): + spans.append((label, [idx, idx])) + elif bmes_tag in ('i', 'e') and prev_bmes_tag in ('b', 'i') and label == spans[-1][0]: + spans[-1][1][1] = idx + elif bmes_tag == 'o': + pass + else: + spans.append((label, [idx, idx])) + prev_bmes_tag = bmes_tag + return [(span[0], (span[1][0], span[1][1] + 1)) + for span in spans + if span[0] not in ignore_labels + ] + + def _bio_tag_to_spans(tags, ignore_labels=None): """ 给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'I-singer', 'O', 'O']。 @@ -471,7 +495,7 @@ class SpanFPreRecMetric(MetricBase): :param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据 :param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据 :param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_len'取数据。 - :param str encoding_type: 目前支持bio, bmes + :param str encoding_type: 目前支持bio, bmes, bmeso, bioes :param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这 个label :param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个 @@ -499,6 +523,8 @@ class SpanFPreRecMetric(MetricBase): self.tag_to_span_func = _bio_tag_to_spans elif self.encoding_type == 'bmeso': self.tag_to_span_func = _bmeso_tag_to_spans + elif self.encoding_type == 'bioes': + self.tag_to_span_func = _bioes_tag_to_spans else: raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.") From 839d712467b83a6bea1aab0b90e95f1432fc3ba6 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 17 Jun 2019 16:46:39 +0800 Subject: [PATCH 07/34] =?UTF-8?q?=E5=A2=9E=E5=BC=BAfield=E4=B8=AD=E7=9A=84?= =?UTF-8?q?value=5Fcount=E6=94=AF=E6=8C=81=E5=AF=B9nested=E7=9A=84field?= =?UTF-8?q?=E7=9A=84=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/field.py | 9 ++++++++- fastNLP/modules/encoder/embedding.py | 1 - reproduction/utils.py | 2 +- test/test_tutorials.py | 6 +++--- 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index faa306f3..b0a36765 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -350,8 +350,15 @@ class FieldArray: :return: Counter, key是label,value是出现次数 """ count = Counter() + + def cum(cell): + if _is_iterable(cell) and not isinstance(cell, str): + for cell_ in cell: + cum(cell_) + else: + count[cell] += 1 for cell in self.content: - count[cell] += 1 + cum(cell) return count def _after_process(self, new_contents, inplace): diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 631f57e9..5f0b6c3b 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -326,7 +326,6 @@ class ElmoEmbedding(ContextualEmbedding): # 根据model_dir_or_name检查是否存在并下载 PRETRAIN_URL = _get_base_url('elmo') - # TODO 把baidu云上的加上去 PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz', 'cn': 'elmo_cn-5e9b34e2.tar.gz'} diff --git a/reproduction/utils.py b/reproduction/utils.py index 58883b43..0d06c99c 100644 --- a/reproduction/utils.py +++ b/reproduction/utils.py @@ -24,7 +24,7 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: if not os.path.isfile(train_fp): raise FileNotFoundError(f"train.txt is not found in folder 
{paths}.") files = {'train': train_fp} - for filename in ['test.txt', 'dev.txt']: + for filename in ['dev.txt', 'test.txt']: fp = os.path.join(paths, filename) if os.path.isfile(fp): files[filename.split('.')[0]] = fp diff --git a/test/test_tutorials.py b/test/test_tutorials.py index a38d5ae1..2e971a4f 100644 --- a/test/test_tutorials.py +++ b/test/test_tutorials.py @@ -80,7 +80,7 @@ class TestTutorial(unittest.TestCase): test_data.rename_field('label', 'label_seq') loss = CrossEntropyLoss(pred="output", target="label_seq") - metric = AccuracyMetric(pred="predict", target="label_seq") + metric = AccuracyMetric(target="label_seq") # 实例化Trainer,传入模型和数据,进行训练 # 先在test_data拟合(确保模型的实现是正确的) @@ -96,7 +96,7 @@ class TestTutorial(unittest.TestCase): # 用train_data训练,在test_data验证 trainer = Trainer(model=model, train_data=train_data, dev_data=test_data, loss=CrossEntropyLoss(pred="output", target="label_seq"), - metrics=AccuracyMetric(pred="predict", target="label_seq"), + metrics=AccuracyMetric(target="label_seq"), save_path=None, batch_size=32, n_epochs=5) @@ -106,7 +106,7 @@ class TestTutorial(unittest.TestCase): # 调用Tester在test_data上评价效果 from fastNLP import Tester - tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"), + tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"), batch_size=4) acc = tester.test() print(acc) From 39388567ad7e0fd39fa39a993e8ddeaa6e5f4ff7 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 17 Jun 2019 21:48:18 +0800 Subject: [PATCH 08/34] update matching.py --- fastNLP/io/__init__.py | 5 +- fastNLP/io/dataset_loader.py | 163 ++++++++++++++++++++++++++- fastNLP/modules/encoder/__init__.py | 9 +- fastNLP/modules/encoder/_bert.py | 2 +- fastNLP/modules/encoder/embedding.py | 14 ++- reproduction/matching/matching.py | 44 ++++++++ reproduction/matching/snli.py | 88 --------------- 7 files changed, 229 insertions(+), 96 deletions(-) create mode 100644 reproduction/matching/matching.py delete mode 100644 reproduction/matching/snli.py diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index c8d6a441..83425ff7 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -16,6 +16,7 @@ __all__ = [ 'CSVLoader', 'JsonLoader', 'ConllLoader', + 'MatchingLoader', 'SNLILoader', 'SSTLoader', 'PeopleDailyCorpusLoader', @@ -26,6 +27,6 @@ __all__ = [ ] from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, MatchingLoader,\ + SNLILoader, SSTLoader, PeopleDailyCorpusLoader, Conll2003Loader from .model_io import ModelLoader, ModelSaver diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index e366c6ea..0595ad46 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -16,19 +16,24 @@ __all__ = [ 'CSVLoader', 'JsonLoader', 'ConllLoader', + 'MatchingLoader', 'SNLILoader', 'SSTLoader', 'PeopleDailyCorpusLoader', 'Conll2003Loader', ] +import os from nltk import Tree +from typing import Union, Dict +from ..core.vocabulary import Vocabulary from ..core.dataset import DataSet from ..core.instance import Instance from .file_reader import _read_csv, _read_json, _read_conll -from .base_loader import DataSetLoader +from .base_loader import DataSetLoader, DataInfo from .data_loader.sst import SSTLoader from ..core.const import Const +from 
..modules.encoder._bert import BertTokenizer class PeopleDailyCorpusLoader(DataSetLoader): @@ -244,6 +249,162 @@ class JsonLoader(DataSetLoader): return ds +class MatchingLoader(DataSetLoader): + """ + 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` + + 读取Matching数据集,根据数据集做预处理并返回DataInfo。 + + 数据来源: + SNLI: https://nlp.stanford.edu/projects/snli/snli_1.0.zip + """ + + def __init__(self, data_format: str='snli', for_model: str='esim', bert_dir=None): + super(MatchingLoader, self).__init__() + self.data_format = data_format.lower() + self.for_model = for_model.lower() + self.bert_dir = bert_dir + + def _load(self, path: str) -> DataSet: + raise NotImplementedError + + def process(self, paths: Union[str, Dict[str, str]], **options) -> DataInfo: + if isinstance(paths, str): + paths = {'train': paths} + + data_set = {} + for n, p in paths.items(): + if self.data_format == 'snli': + data = self._load_snli(p) + else: + raise RuntimeError(f'Your data format is {self.data_format}, ' + f'Please choose data format from [snli]') + + if self.for_model == 'esim': + data = self._for_esim(data) + elif self.for_model == 'bert': + data = self._for_bert(data, self.bert_dir) + else: + raise RuntimeError(f'Your model is {self.data_format}, ' + f'Please choose from [esim, bert]') + + data_set[n] = data + print(f'successfully load {n} set!') + + if not hasattr(self, 'vocab'): + raise RuntimeError(f'There is NOT vocab attribute built!') + if not hasattr(self, 'label_vocab'): + raise RuntimeError(f'There is NOT label vocab attribute built!') + + if self.for_model != 'bert': + from fastNLP.modules.encoder.embedding import StaticEmbedding + embedding = StaticEmbedding(self.vocab, model_dir_or_name='en') + + data_info = DataInfo(vocabs={'vocab': self.vocab, 'target_vocab': self.label_vocab}, + embeddings={'glove': embedding} if self.for_model != 'bert' else None, + datasets=data_set) + + return data_info + + @staticmethod + def _load_snli(path: str) -> DataSet: + """ + 读取SNLI数据集 + + 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip + :param str path: 数据集路径 + :return: + """ + raw_ds = JsonLoader( + fields={ + 'sentence1_parse': Const.INPUTS(0), + 'sentence2_parse': Const.INPUTS(1), + 'gold_label': Const.TARGET, + } + )._load(path) + return raw_ds + + def _for_esim(self, raw_ds: DataSet): + if self.data_format == 'snli' or self.data_format == 'mnli': + def parse_tree(x): + t = Tree.fromstring(x) + return t.leaves() + + raw_ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) + raw_ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) + raw_ds.drop(lambda x: x[Const.TARGET] == '-') + + if not hasattr(self, 'vocab'): + self.vocab = Vocabulary().from_dataset(raw_ds, [Const.INPUTS(0), Const.INPUTS(1)]) + if not hasattr(self, 'label_vocab'): + self.label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(raw_ds, field_name=Const.TARGET) + + raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0)) + raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1)) + raw_ds.apply(lambda ins: self.label_vocab.to_index(Const.TARGET), new_field_name=Const.TARGET) + + raw_ds.set_input(Const.INPUTS(0), Const.INPUTS(1)) + raw_ds.set_target(Const.TARGET) + + return raw_ds + + def _for_bert(self, raw_ds: DataSet, bert_dir: str): + if self.data_format == 'snli' or self.data_format == 'mnli': + def 
parse_tree(x): + t = Tree.fromstring(x) + return t.leaves() + + raw_ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) + raw_ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) + raw_ds.drop(lambda x: x[Const.TARGET] == '-') + + tokenizer = BertTokenizer.from_pretrained(bert_dir) + + vocab = Vocabulary(padding=None, unknown=None) + with open(os.path.join(bert_dir, 'vocab.txt')) as f: + lines = f.readlines() + vocab_list = [] + for line in lines: + vocab_list.append(line.strip()) + vocab.add_word_lst(vocab_list) + vocab.build_vocab() + vocab.padding = '[PAD]' + vocab.unknown = '[UNK]' + + if not hasattr(self, 'vocab'): + self.vocab = vocab + else: + for w, idx in self.vocab: + if vocab[w] != idx: + raise AttributeError(f"{self.__class__.__name__} has ") + + for i in range(2): + raw_ds.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])), new_field_name=Const.INPUTS(i)) + raw_ds.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'], + new_field_name=Const.INPUT) + raw_ds.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), + new_field_name=Const.INPUT_LENS(0)) + raw_ds.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1)) + + max_len = 512 + raw_ds.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT) + raw_ds.apply(lambda x: [self.vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT) + raw_ds.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0)) + raw_ds.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1)) + + if not hasattr(self, 'label_vocab'): + self.label_vocab = Vocabulary(padding=None, unknown=None) + self.label_vocab.from_dataset(raw_ds, field_name=Const.TARGET) + raw_ds.apply(lambda x: self.label_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET) + + raw_ds.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1)) + raw_ds.set_target(Const.TARGET) + + class SNLILoader(JsonLoader): """ 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index bdc4cbf3..4be75f20 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -7,6 +7,12 @@ __all__ = [ "ConvMaxpool", "Embedding", + "StaticEmbedding", + "ElmoEmbedding", + "BertEmbedding", + "StackEmbedding", + "LSTMCharEmbedding", + "CNNCharEmbedding", "LSTM", @@ -21,7 +27,8 @@ __all__ = [ from .bert import BertModel from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool -from .embedding import Embedding +from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \ + StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding from .lstm import LSTM from .star_transformer import StarTransformer from .transformer import TransformerEncoder diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py index fc62ea9c..1423f333 100644 --- a/fastNLP/modules/encoder/_bert.py +++ b/fastNLP/modules/encoder/_bert.py @@ -9,7 +9,7 @@ import torch from torch import nn -from ... 
import Vocabulary +from ...core.vocabulary import Vocabulary import collections import os diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 7279a372..f956aae7 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -1,10 +1,16 @@ __all__ = [ - "Embedding" + "Embedding", + "StaticEmbedding", + "ElmoEmbedding", + "BertEmbedding", + "StackEmbedding", + "LSTMCharEmbedding", + "CNNCharEmbedding", ] import torch.nn as nn from ..utils import get_embeddings from .lstm import LSTM -from ... import Vocabulary +from ...core.vocabulary import Vocabulary from abc import abstractmethod import torch from ...io import EmbedLoader @@ -15,7 +21,9 @@ from ...io.file_utils import cached_path, _get_base_url from ._bert import _WordBertModel from typing import List -from ... import DataSet, DataSetIter, SequentialSampler +from ...core.dataset import DataSet +from ...core.batch import DataSetIter +from ...core.sampler import SequentialSampler from ...core.utils import _move_model_to_device, _get_model_device diff --git a/reproduction/matching/matching.py b/reproduction/matching/matching.py new file mode 100644 index 00000000..52c1c3b5 --- /dev/null +++ b/reproduction/matching/matching.py @@ -0,0 +1,44 @@ +import os + +import torch + +from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric + +from fastNLP.io.dataset_loader import MatchingLoader + +from reproduction.matching.model.bert import BertForNLI + + +# bert_dirs = 'path/to/bert/dir' +bert_dirs = '/remote-home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12' + +# load data set +data_info = MatchingLoader(data_format='snli', for_model='bert', bert_dir=bert_dirs).process( + {#'train': './data/snli/snli_1.0_train.jsonl', + 'dev': './data/snli/snli_1.0_dev.jsonl', + 'test': './data/snli/snli_1.0_test.jsonl'} +) + +print('successfully load data sets!') + + +model = BertForNLI(bert_dir=bert_dirs) + +trainer = Trainer(train_data=data_info.datasets['dev'], model=model, + optimizer=Adam(lr=2e-5, model_params=model.parameters()), + batch_size=torch.cuda.device_count() * 12, n_epochs=4, print_every=-1, + dev_data=data_info.datasets['dev'], + metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], + check_code_level=-1) +trainer.train(load_best_model=True) + +tester = Tester( + data=data_info.datasets['test'], + model=model, + metrics=AccuracyMetric(), + batch_size=torch.cuda.device_count() * 12, + device=[i for i in range(torch.cuda.device_count())], +) +tester.test() + + diff --git a/reproduction/matching/snli.py b/reproduction/matching/snli.py deleted file mode 100644 index d7f392bd..00000000 --- a/reproduction/matching/snli.py +++ /dev/null @@ -1,88 +0,0 @@ -import os - -import torch - -from fastNLP.core import Vocabulary, DataSet, Trainer, Tester, Const, Adam, AccuracyMetric - -from reproduction.matching.data.SNLIDataLoader import SNLILoader -from legacy.component.bert_tokenizer import BertTokenizer -from reproduction.matching.model.bert import BertForNLI - - -def preprocess_data(data: DataSet, bert_dir): - """ - preprocess data set to bert-need data set. 
- :param data: - :param bert_dir: - :return: - """ - tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt')) - - vocab = Vocabulary(padding=None, unknown=None) - with open(os.path.join(bert_dir, 'vocab.txt')) as f: - lines = f.readlines() - vocab_list = [] - for line in lines: - vocab_list.append(line.strip()) - vocab.add_word_lst(vocab_list) - vocab.build_vocab() - vocab.padding = '[PAD]' - vocab.unknown = '[UNK]' - - for i in range(2): - data.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])), - new_field_name=Const.INPUTS(i)) - data.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'], - new_field_name=Const.INPUT) - data.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0)) - data.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1)) - - max_len = 512 - data.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT) - data.apply(lambda x: [vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT) - data.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0)) - data.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1)) - - target_vocab = Vocabulary(padding=None, unknown=None) - target_vocab.add_word_lst(['neutral', 'contradiction', 'entailment']) - target_vocab.build_vocab() - data.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET) - - data.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET) - data.set_target(Const.TARGET) - - return data - - -bert_dirs = 'path/to/bert/dir' - -# load raw data set -train_data = SNLILoader().load('./data/snli/snli_1.0_train.jsonl') -dev_data = SNLILoader().load('./data/snli/snli_1.0_dev.jsonl') -test_data = SNLILoader().load('./data/snli/snli_1.0_test.jsonl') - -print('successfully load data sets!') - -train_data = preprocess_data(train_data, bert_dirs) -dev_data = preprocess_data(dev_data, bert_dirs) -test_data = preprocess_data(test_data, bert_dirs) - -model = BertForNLI(bert_dir=bert_dirs) - -trainer = Trainer(train_data=train_data, model=model, optimizer=Adam(lr=2e-5, model_params=model.parameters()), - batch_size=torch.cuda.device_count() * 12, n_epochs=4, print_every=-1, dev_data=dev_data, - metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], - check_code_level=-1) -trainer.train(load_best_model=True) - -tester = Tester( - data=test_data, - model=model, - metrics=AccuracyMetric(), - batch_size=torch.cuda.device_count() * 12, - device=[i for i in range(torch.cuda.device_count())], -) -tester.test() - - From 93620e76edf0162f8b9d8f844728dfd1c203e58d Mon Sep 17 00:00:00 2001 From: xuyige Date: Tue, 18 Jun 2019 02:04:53 +0800 Subject: [PATCH 09/34] update framework of matching --- fastNLP/io/dataset_loader.py | 25 ++-- fastNLP/modules/encoder/bert.py | 4 +- fastNLP/modules/encoder/embedding.py | 11 +- reproduction/matching/matching.py | 26 ++-- reproduction/matching/model/esim.py | 182 +++++++++++++++++++++++++++ 5 files changed, 221 insertions(+), 27 deletions(-) create mode 100644 reproduction/matching/model/esim.py diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index c63ff2f4..b0bf2e60 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -269,7 +269,7 @@ class MatchingLoader(DataSetLoader): def _load(self, path: str) -> 
DataSet: raise NotImplementedError - def process(self, paths: Union[str, Dict[str, str]], **options) -> DataInfo: + def process(self, paths: Union[str, Dict[str, str]], input_field=None) -> DataInfo: if isinstance(paths, str): paths = {'train': paths} @@ -289,6 +289,13 @@ class MatchingLoader(DataSetLoader): raise RuntimeError(f'Your model is {self.data_format}, ' f'Please choose from [esim, bert]') + if input_field is not None: + if isinstance(input_field, str): + data.set_input(input_field) + elif isinstance(input_field, list): + for field in input_field: + data.set_input(field) + data_set[n] = data print(f'successfully load {n} set!') @@ -298,11 +305,11 @@ class MatchingLoader(DataSetLoader): raise RuntimeError(f'There is NOT label vocab attribute built!') if self.for_model != 'bert': - from fastNLP.modules.encoder.embedding import StaticEmbedding - embedding = StaticEmbedding(self.vocab, model_dir_or_name='en') + from fastNLP.modules.encoder.embedding import ElmoEmbedding + embedding = ElmoEmbedding(self.vocab, model_dir_or_name='en', requires_grad=True, layers='2') data_info = DataInfo(vocabs={'vocab': self.vocab, 'target_vocab': self.label_vocab}, - embeddings={'glove': embedding} if self.for_model != 'bert' else None, + embeddings={'elmo': embedding} if self.for_model != 'bert' else None, datasets=data_set) return data_info @@ -338,15 +345,17 @@ class MatchingLoader(DataSetLoader): raw_ds.drop(lambda x: x[Const.TARGET] == '-') if not hasattr(self, 'vocab'): - self.vocab = Vocabulary().from_dataset(raw_ds, [Const.INPUTS(0), Const.INPUTS(1)]) + self.vocab = Vocabulary().from_dataset(raw_ds, field_name=[Const.INPUTS(0), Const.INPUTS(1)]) if not hasattr(self, 'label_vocab'): self.label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(raw_ds, field_name=Const.TARGET) raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0)) raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1)) - raw_ds.apply(lambda ins: self.label_vocab.to_index(Const.TARGET), new_field_name=Const.TARGET) + raw_ds.apply(lambda ins: self.label_vocab.to_index(ins[Const.TARGET]), new_field_name=Const.TARGET) + raw_ds.apply(lambda ins: len(ins[Const.INPUTS(0)]), new_field_name=Const.INPUT_LENS(0)) + raw_ds.apply(lambda ins: len(ins[Const.INPUTS(1)]), new_field_name=Const.INPUT_LENS(1)) - raw_ds.set_input(Const.INPUTS(0), Const.INPUTS(1)) + raw_ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)) raw_ds.set_target(Const.TARGET) return raw_ds @@ -405,6 +414,8 @@ class MatchingLoader(DataSetLoader): raw_ds.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1)) raw_ds.set_target(Const.TARGET) + return raw_ds + class SNLILoader(JsonLoader): """ diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index e9739c28..4948d022 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -2,9 +2,9 @@ import os from torch import nn import torch -from ...core import Vocabulary +from ...core.vocabulary import Vocabulary from ...io.file_utils import _get_base_url, cached_path -from ._bert import _WordPieceBertModel +from ._bert import _WordPieceBertModel, BertModel class BertWordPieceEncoder(nn.Module): diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 7fd85578..9c1bf35f 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -152,6 +152,8 
@@ class StaticEmbedding(TokenEmbedding): Example:: + >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50') + :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。 :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding @@ -311,8 +313,7 @@ class ElmoEmbedding(ContextualEmbedding): Example:: - >>> - >>> + >>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True) :param vocab: 词表 :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称, @@ -403,7 +404,7 @@ class BertEmbedding(ContextualEmbedding): Example:: - >>> + >>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') :param fastNLP.Vocabulary vocab: 词表 @@ -513,7 +514,7 @@ class CNNCharEmbedding(TokenEmbedding): Example:: - >>> + >>> cnn_char_embed = CNNCharEmbedding(vocab) :param vocab: 词表 @@ -647,7 +648,7 @@ class LSTMCharEmbedding(TokenEmbedding): Example:: - >>> + >>> lstm_char_embed = LSTMCharEmbedding(vocab) :param vocab: 词表 :param embed_size: embedding的大小。默认值为50. diff --git a/reproduction/matching/matching.py b/reproduction/matching/matching.py index 52c1c3b5..8251b3bc 100644 --- a/reproduction/matching/matching.py +++ b/reproduction/matching/matching.py @@ -2,31 +2,31 @@ import os import torch -from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric +from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const from fastNLP.io.dataset_loader import MatchingLoader from reproduction.matching.model.bert import BertForNLI +from reproduction.matching.model.esim import ESIMModel -# bert_dirs = 'path/to/bert/dir' -bert_dirs = '/remote-home/ygxu/BERT/BERT_English_uncased_L-12_H-768_A_12' +bert_dirs = 'path/to/bert/dir' # load data set -data_info = MatchingLoader(data_format='snli', for_model='bert', bert_dir=bert_dirs).process( - {#'train': './data/snli/snli_1.0_train.jsonl', +# data_info = MatchingLoader(data_format='snli', for_model='bert', bert_dir=bert_dirs).process(... 
+data_info = MatchingLoader(data_format='snli', for_model='esim').process( + {'train': './data/snli/snli_1.0_train.jsonl', 'dev': './data/snli/snli_1.0_dev.jsonl', - 'test': './data/snli/snli_1.0_test.jsonl'} + 'test': './data/snli/snli_1.0_test.jsonl'}, + input_field=[Const.TARGET] ) -print('successfully load data sets!') +# model = BertForNLI(bert_dir=bert_dirs) +model = ESIMModel(data_info.embeddings['elmo'],) - -model = BertForNLI(bert_dir=bert_dirs) - -trainer = Trainer(train_data=data_info.datasets['dev'], model=model, - optimizer=Adam(lr=2e-5, model_params=model.parameters()), - batch_size=torch.cuda.device_count() * 12, n_epochs=4, print_every=-1, +trainer = Trainer(train_data=data_info.datasets['train'], model=model, + optimizer=Adam(lr=1e-4, model_params=model.parameters()), + batch_size=torch.cuda.device_count() * 24, n_epochs=20, print_every=-1, dev_data=data_info.datasets['dev'], metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], check_code_level=-1) diff --git a/reproduction/matching/model/esim.py b/reproduction/matching/model/esim.py new file mode 100644 index 00000000..0551bbdb --- /dev/null +++ b/reproduction/matching/model/esim.py @@ -0,0 +1,182 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from torch.nn import CrossEntropyLoss + +from fastNLP.models import BaseModel +from fastNLP.modules.encoder.embedding import TokenEmbedding +from fastNLP.modules.encoder.lstm import LSTM +from fastNLP.core.const import Const +from fastNLP.core.utils import seq_len_to_mask + + +class ESIMModel(BaseModel): + def __init__(self, init_embedding: TokenEmbedding, hidden_size=None, num_labels=3, dropout_rate=0.3, + dropout_embed=0.1): + super(ESIMModel, self).__init__() + + self.embedding = init_embedding + self.dropout_embed = EmbedDropout(p=dropout_embed) + if hidden_size is None: + hidden_size = self.embedding.embed_size + self.rnn = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate) + # self.rnn = LSTM(self.embedding.embed_size, hidden_size, dropout=dropout_rate, bidirectional=True) + + self.interfere = nn.Sequential(nn.Dropout(p=dropout_rate), + nn.Linear(8 * hidden_size, hidden_size), + nn.ReLU()) + nn.init.xavier_uniform_(self.interfere[1].weight.data) + self.bi_attention = SoftmaxAttention() + + self.rnn_high = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate) + # self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True) + + self.classifier = nn.Sequential(nn.Dropout(p=dropout_rate), + nn.Linear(8 * hidden_size, hidden_size), + nn.Tanh(), + nn.Dropout(p=dropout_rate), + nn.Linear(hidden_size, num_labels)) + nn.init.xavier_uniform_(self.classifier[1].weight.data) + nn.init.xavier_uniform_(self.classifier[4].weight.data) + + def forward(self, words1, words2, seq_len1, seq_len2, target=None): + mask1 = seq_len_to_mask(seq_len1) + mask2 = seq_len_to_mask(seq_len2) + a0 = self.embedding(words1) # B * len * emb_dim + b0 = self.embedding(words2) + a0, b0 = self.dropout_embed(a0), self.dropout_embed(b0) + a = self.rnn(a0, mask1.byte()) # a: [B, PL, 2 * H] + b = self.rnn(b0, mask2.byte()) + + ai, bi = self.bi_attention(a, mask1, b, mask2) + + a_ = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 8 * H] + b_ = torch.cat((b, bi, b - bi, b * bi), dim=2) + a_f = self.interfere(a_) + b_f = self.interfere(b_) + + a_h = self.rnn_high(a_f, mask1.byte()) # ma: [B, PL, 2 * H] + b_h = self.rnn_high(b_f, mask2.byte()) + + a_avg = 
self.mean_pooling(a_h, mask1, dim=1) + a_max, _ = self.max_pooling(a_h, mask1, dim=1) + b_avg = self.mean_pooling(b_h, mask2, dim=1) + b_max, _ = self.max_pooling(b_h, mask2, dim=1) + + out = torch.cat((a_avg, a_max, b_avg, b_max), dim=1) # v: [B, 8 * H] + logits = torch.tanh(self.classifier(out)) + + if target is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits, target) + + return {Const.LOSS: loss, Const.OUTPUT: logits} + else: + return {Const.OUTPUT: logits} + + def predict(self, **kwargs): + return self.forward(**kwargs) + + # input [batch_size, len , hidden] + # mask [batch_size, len] (111...00) + @staticmethod + def mean_pooling(input, mask, dim=1): + masks = mask.view(mask.size(0), mask.size(1), -1).float() + return torch.sum(input * masks, dim=dim) / torch.sum(masks, dim=1) + + @staticmethod + def max_pooling(input, mask, dim=1): + my_inf = 10e12 + masks = mask.view(mask.size(0), mask.size(1), -1) + masks = masks.expand(-1, -1, input.size(2)).float() + return torch.max(input + masks.le(0.5).float() * -my_inf, dim=dim) + + +class EmbedDropout(nn.Dropout): + + def forward(self, sequences_batch): + ones = sequences_batch.data.new_ones(sequences_batch.shape[0], sequences_batch.shape[-1]) + dropout_mask = nn.functional.dropout(ones, self.p, self.training, inplace=False) + return dropout_mask.unsqueeze(1) * sequences_batch + + +class BiRNN(nn.Module): + def __init__(self, input_size, hidden_size, dropout_rate=0.3): + super(BiRNN, self).__init__() + self.dropout_rate = dropout_rate + self.rnn = nn.LSTM(input_size, hidden_size, + num_layers=1, + bidirectional=True, + batch_first=True) + + def forward(self, x, x_mask): + # Sort x + lengths = x_mask.data.eq(1).long().sum(1).squeeze() + _, idx_sort = torch.sort(lengths, dim=0, descending=True) + _, idx_unsort = torch.sort(idx_sort, dim=0) + lengths = list(lengths[idx_sort]) + + x = x.index_select(0, idx_sort) + # Pack it up + rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True) + # Apply dropout to input + if self.dropout_rate > 0: + dropout_input = F.dropout(rnn_input.data, p=self.dropout_rate, training=self.training) + rnn_input = nn.utils.rnn.PackedSequence(dropout_input, rnn_input.batch_sizes) + output = self.rnn(rnn_input)[0] + # Unpack everything + output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0] + output = output.index_select(0, idx_unsort) + if output.size(1) != x_mask.size(1): + padding = torch.zeros(output.size(0), + x_mask.size(1) - output.size(1), + output.size(2)).type(output.data.type()) + output = torch.cat([output, padding], 1) + return output + + +def masked_softmax(tensor, mask): + tensor_shape = tensor.size() + reshaped_tensor = tensor.view(-1, tensor_shape[-1]) + + # Reshape the mask so it matches the size of the input tensor. + while mask.dim() < tensor.dim(): + mask = mask.unsqueeze(1) + mask = mask.expand_as(tensor).contiguous().float() + reshaped_mask = mask.view(-1, mask.size()[-1]) + result = F.softmax(reshaped_tensor * reshaped_mask, dim=-1) + result = result * reshaped_mask + # 1e-13 is added to avoid divisions by zero. 
+ result = result / (result.sum(dim=-1, keepdim=True) + 1e-13) + return result.view(*tensor_shape) + + +def weighted_sum(tensor, weights, mask): + w_sum = weights.bmm(tensor) + while mask.dim() < w_sum.dim(): + mask = mask.unsqueeze(1) + mask = mask.transpose(-1, -2) + mask = mask.expand_as(w_sum).contiguous().float() + return w_sum * mask + + +class SoftmaxAttention(nn.Module): + + def forward(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask): + similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1) + .contiguous()) + + prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask) + hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2) + .contiguous(), + premise_mask) + + attended_premises = weighted_sum(hypothesis_batch, + prem_hyp_attn, + premise_mask) + attended_hypotheses = weighted_sum(premise_batch, + hyp_prem_attn, + hypothesis_mask) + + return attended_premises, attended_hypotheses \ No newline at end of file From 9a8fe42cd4a322d0639fdd64d05574e70de55013 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Tue, 18 Jun 2019 10:02:24 +0800 Subject: [PATCH 10/34] =?UTF-8?q?=E6=96=B0=E5=A2=9ENER=E7=9A=84=E6=95=B0?= =?UTF-8?q?=E6=8D=AE=E5=8A=A0=E8=BD=BD=E4=B8=8E=E6=A8=A1=E5=9E=8B=E4=BB=A3?= =?UTF-8?q?=E7=A0=81;=20=20=E4=BF=AE=E6=94=B9metric=E4=B8=AD=E7=9A=84typo;?= =?UTF-8?q?=20=E4=BF=AE=E6=94=B9LSTM=E4=B8=AD=E7=9A=84=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E5=B0=86forget=20gate=E8=AE=BE?= =?UTF-8?q?=E7=BD=AE=E4=B8=BA1.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/metrics.py | 10 +- fastNLP/modules/encoder/embedding.py | 19 +-- fastNLP/modules/encoder/lstm.py | 10 +- .../seqence_labelling/ner/__init__.py | 0 .../ner/data/Conll2003Loader.py | 92 +++++++++++++ .../ner/data/OntoNoteLoader.py | 130 ++++++++++++++++++ .../seqence_labelling/ner/data/utils.py | 49 +++++++ .../ner/model/lstm_cnn_crf.py | 62 +++++++++ .../seqence_labelling/ner/test/__init__.py | 0 .../seqence_labelling/ner/test/test.py | 33 +++++ .../ner/train_cnn_lstm_crf_conll2003.py | 42 ++++++ .../seqence_labelling/ner/train_ontonote.py | 39 ++++++ 12 files changed, 469 insertions(+), 17 deletions(-) create mode 100644 reproduction/seqence_labelling/ner/__init__.py create mode 100644 reproduction/seqence_labelling/ner/data/Conll2003Loader.py create mode 100644 reproduction/seqence_labelling/ner/data/OntoNoteLoader.py create mode 100644 reproduction/seqence_labelling/ner/data/utils.py create mode 100644 reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py create mode 100644 reproduction/seqence_labelling/ner/test/__init__.py create mode 100644 reproduction/seqence_labelling/ner/test/test.py create mode 100644 reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py create mode 100644 reproduction/seqence_labelling/ner/train_ontonote.py diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py index cfcb9039..d54bf8ec 100644 --- a/fastNLP/core/metrics.py +++ b/fastNLP/core/metrics.py @@ -428,16 +428,16 @@ def _bioes_tag_to_spans(tags, ignore_labels=None): prev_bioes_tag = None for idx, tag in enumerate(tags): tag = tag.lower() - bieso_tag, label = tag[:1], tag[2:] - if bieso_tag in ('b', 's'): + bioes_tag, label = tag[:1], tag[2:] + if bioes_tag in ('b', 's'): spans.append((label, [idx, idx])) - elif bieso_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]: + elif bioes_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]: 
spans[-1][1][1] = idx - elif bieso_tag == 'o': + elif bioes_tag == 'o': pass else: spans.append((label, [idx, idx])) - prev_bioes_tag = bieso_tag + prev_bioes_tag = bioes_tag return [(span[0], (span[1][0], span[1][1] + 1)) for span in spans if span[0] not in ignore_labels diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index e8fe903b..121bc950 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -500,8 +500,8 @@ class CNNCharEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.modules.CNNCharEmbedding` :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding` - 使用CNN生成character embedding。CNN的结果为, CNN(x) -> activation(x) -> pool -> fc. 不同的kernel大小的fitler结果是 - concat起来的。 + 使用CNN生成character embedding。CNN的结果为, embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool + -> fc. 不同的kernel大小的fitler结果是concat起来的。 Example:: @@ -511,13 +511,14 @@ class CNNCharEmbedding(TokenEmbedding): :param vocab: 词表 :param embed_size: 该word embedding的大小,默认值为50. :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. + :param dropout: 以多大的概率drop :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. :param min_char_freq: character的最少出现次数。默认值为2. """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, + def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max', activation='relu', min_char_freq: int=2): super(CNNCharEmbedding, self).__init__(vocab) @@ -526,6 +527,7 @@ class CNNCharEmbedding(TokenEmbedding): assert kernel % 2 == 1, "Only odd kernel is allowed." assert pool_method in ('max', 'avg') + self.dropout = nn.Dropout(dropout, inplace=True) self.pool_method = pool_method # activation function if isinstance(activation, str): @@ -583,7 +585,7 @@ class CNNCharEmbedding(TokenEmbedding): # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size - + chars = self.dropout(chars) reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) @@ -635,7 +637,7 @@ class LSTMCharEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.modules.LSTMCharEmbedding` :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding` - 使用LSTM的方式对character进行encode. + 使用LSTM的方式对character进行encode. embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool Example:: @@ -644,13 +646,14 @@ class LSTMCharEmbedding(TokenEmbedding): :param vocab: 词表 :param embed_size: embedding的大小。默认值为50. :param char_emb_size: character的embedding的大小。默认值为50. + :param dropout: 以多大概率drop :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. :param pool_method: 支持'max', 'avg' :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. :param min_char_freq: character的最小出现次数。默认值为2. 
:param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50, + def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5, hidden_size=50, pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True): super(LSTMCharEmbedding, self).__init__(vocab) @@ -658,7 +661,7 @@ class LSTMCharEmbedding(TokenEmbedding): assert pool_method in ('max', 'avg') self.pool_method = pool_method - + self.dropout = nn.Dropout(dropout, inplace=True) # activation function if isinstance(activation, str): if activation.lower() == 'relu': @@ -715,7 +718,7 @@ class LSTMCharEmbedding(TokenEmbedding): # 为mask的地方为1 chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size - + chars = self.dropout(chars) reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 3b97f4a7..537a446d 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -40,12 +40,14 @@ class LSTM(nn.Module): def init_param(self): for name, param in self.named_parameters(): - if 'bias_i' in name: - param.data.fill_(1) - elif 'bias_h' in name: + if 'bias' in name: + # based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871 param.data.fill_(0) + n = param.size(0) + start, end = n // 4, n // 2 + param.data[start:end].fill_(1) else: - nn.init.xavier_normal_(param) + nn.init.xavier_uniform_(param) def forward(self, x, seq_len=None, h0=None, c0=None): """ diff --git a/reproduction/seqence_labelling/ner/__init__.py b/reproduction/seqence_labelling/ner/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py new file mode 100644 index 00000000..65ed7ab8 --- /dev/null +++ b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py @@ -0,0 +1,92 @@ + +from fastNLP.core.vocabulary import VocabularyOption +from fastNLP.io.base_loader import DataSetLoader, DataInfo +from typing import Union, Dict +from fastNLP import Vocabulary +from fastNLP import Const +from reproduction.utils import check_dataloader_paths + +from fastNLP.io.dataset_loader import ConllLoader +from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 + + +class Conll2003DataLoader(DataSetLoader): + def __init__(self, task:str='ner', encoding_type:str='bioes'): + """ + 加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos + 时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回 + 的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但 + 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的中该值 + ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。 + + :param task: 指定需要标注任务。可选ner, pos, chunk + """ + assert task in ('ner', 'pos', 'chunk') + index = {'ner':3, 'pos':1, 'chunk':2}[task] + self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) + self._tag_converters = None + if task in ('ner', 'chunk'): + self._tag_converters = [iob2] + if 
encoding_type == 'bioes': + self._tag_converters.append(iob2bioes) + + def load(self, path: str): + dataset = self._loader.load(path) + def convert_tag_schema(tags): + for converter in self._tag_converters: + tags = converter(tags) + return tags + if self._tag_converters: + dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) + return dataset + + def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=True): + """ + 读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略 + + :param paths: + :param word_vocab_opt: vocabulary的初始化值 + :param lower: 是否将所有字母转为小写 + :return: + """ + # 读取数据 + paths = check_dataloader_paths(paths) + data = DataInfo() + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET, Const.INPUT_LEN] + for name, path in paths.items(): + dataset = self.load(path) + dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) + if lower: + dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT, + new_field_name=Const.INPUT) + data.datasets[name] = dataset + + # 对construct vocab + word_vocab = Vocabulary(min_freq=3) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) + word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) + word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) + data.vocabs[Const.INPUT] = word_vocab + + # cap words + cap_word_vocab = Vocabulary() + cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') + cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') + input_fields.append('cap_words') + data.vocabs['cap_words'] = cap_word_vocab + + # 对target建vocab + target_vocab = Vocabulary(unknown=None, padding=None) + target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) + target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) + data.vocabs[Const.TARGET] = target_vocab + + for name, dataset in data.datasets.items(): + dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) + dataset.set_input(*input_fields) + dataset.set_target(*target_fields) + + return data + +if __name__ == '__main__': + pass \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py new file mode 100644 index 00000000..bf1ab71e --- /dev/null +++ b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py @@ -0,0 +1,130 @@ +from fastNLP.core.vocabulary import VocabularyOption +from fastNLP.io.base_loader import DataSetLoader, DataInfo +from typing import Union, Dict +from fastNLP import DataSet +from fastNLP import Vocabulary +from fastNLP import Const +from reproduction.utils import check_dataloader_paths + +from fastNLP.io.dataset_loader import ConllLoader +from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 + +class OntoNoteNERDataLoader(DataSetLoader): + """ + 用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。 + + """ + def __init__(self, encoding_type:str='bioes'): + assert encoding_type in ('bioes', 'bio') + self.encoding_type = encoding_type + if encoding_type=='bioes': + self.encoding_method = iob2bioes + else: + self.encoding_method = iob2 + + def load(self, path:str)->DataSet: + """ + 给定一个文件路径,读取数据。返回的DataSet包含以下的field + raw_words: List[str] + target: List[str] 
+ + :param path: + :return: + """ + dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path) + def convert_to_bio(tags): + bio_tags = [] + flag = None + for tag in tags: + label = tag.strip("()*") + if '(' in tag: + bio_label = 'B-' + label + flag = label + elif flag: + bio_label = 'I-' + flag + else: + bio_label = 'O' + if ')' in tag: + flag = None + bio_tags.append(bio_label) + return self.encoding_method(bio_tags) + + dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') + + return dataset + + def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, + lower:bool=True)->DataInfo: + """ + 读取并处理数据。返回的DataInfo包含以下的内容 + vocabs: + word: Vocabulary + target: Vocabulary + datasets: + train: DataSet + words: List[int], 被设置为input + target: int. label,被同时设置为input和target + seq_len: int. 句子的长度,被同时设置为input和target + raw_words: List[str] + xxx(根据传入的paths可能有所变化) + + :param paths: + :param word_vocab_opt: vocabulary的初始化值 + :param lower: 是否使用小写 + :return: + """ + paths = check_dataloader_paths(paths) + data = DataInfo() + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] + target_fields = [Const.TARGET, Const.INPUT_LEN] + for name, path in paths.items(): + dataset = self.load(path) + dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) + if lower: + dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT, + new_field_name=Const.INPUT) + data.datasets[name] = dataset + + # 对construct vocab + word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) + word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') + word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT) + data.vocabs[Const.INPUT] = word_vocab + + # cap words + cap_word_vocab = Vocabulary() + cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') + cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') + input_fields.append('cap_words') + data.vocabs['cap_words'] = cap_word_vocab + + # 对target建vocab + target_vocab = Vocabulary(unknown=None, padding=None) + target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) + target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) + data.vocabs[Const.TARGET] = target_vocab + + for name, dataset in data.datasets.items(): + dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) + dataset.set_input(*input_fields) + dataset.set_target(*target_fields) + + return data + + +if __name__ == '__main__': + loader = OntoNoteNERDataLoader() + dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt') + print(dataset.target.value_count()) + print(dataset[:4]) + + +""" +train 115812 2200752 +development 15680 304684 +test 12217 230111 + +train 92403 1901772 +valid 13606 279180 +test 10258 204135 +""" \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/utils.py b/reproduction/seqence_labelling/ner/data/utils.py new file mode 100644 index 00000000..8f7af792 --- /dev/null +++ b/reproduction/seqence_labelling/ner/data/utils.py @@ -0,0 +1,49 @@ +from typing import List + +def iob2(tags:List[str])->List[str]: + """ + 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。 + + :param tags: 需要转换的tags + """ + for i, tag in enumerate(tags): + if tag == "O": + continue + split = tag.split("-") + if len(split) != 2 or 
split[0] not in ["I", "B"]: + raise TypeError("The encoding schema is not a valid IOB type.") + if split[0] == "B": + continue + elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + elif tags[i - 1][1:] == tag[1:]: + continue + else: # conversion IOB1 to IOB2 + tags[i] = "B" + tag[1:] + return tags + +def iob2bioes(tags:List[str])->List[str]: + """ + 将iob的tag转换为bmeso编码 + :param tags: + :return: + """ + new_tags = [] + for i, tag in enumerate(tags): + if tag == 'O': + new_tags.append(tag) + else: + split = tag.split('-')[0] + if split == 'B': + if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': + new_tags.append(tag) + else: + new_tags.append(tag.replace('B-', 'S-')) + elif split == 'I': + if i + 11: + nn.init.xavier_normal_(param) + else: + nn.init.constant_(param, 0) + if 'crf' in name: + nn.init.zeros_(param) + + def _forward(self, words, cap_words, seq_len, target=None): + words = self.embedding(words) + chars = self.char_embedding(cap_words) + words = torch.cat([words, chars], dim=-1) + outputs, _ = self.lstm(words, seq_len) + self.dropout(outputs) + forwards, backwards = outputs.chunk(2, dim=-1) + + # forward_logits = F.log_softmax(self.forward_fc(forwards), dim=-1) + # backward_logits = F.log_softmax(self.backward_fc(backwards), dim=-1) + + logits = self.forward_fc(forwards) + self.backward_fc(backwards) + self.dropout(logits) + + if target is not None: + loss = self.crf(logits, target, seq_len_to_mask(seq_len)) + return {Const.LOSS: loss} + else: + pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len)) + return {Const.OUTPUT: pred} + + def forward(self, words, cap_words, seq_len, target): + return self._forward(words, cap_words, seq_len, target) + + def predict(self, words, cap_words, seq_len): + return self._forward(words, cap_words, seq_len, None) diff --git a/reproduction/seqence_labelling/ner/test/__init__.py b/reproduction/seqence_labelling/ner/test/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/reproduction/seqence_labelling/ner/test/test.py b/reproduction/seqence_labelling/ner/test/test.py new file mode 100644 index 00000000..09d0f468 --- /dev/null +++ b/reproduction/seqence_labelling/ner/test/test.py @@ -0,0 +1,33 @@ + +from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader +from reproduction.seqence_labelling.ner.data.Conll2003Loader import iob2, iob2bioes +import unittest + +class TestTagSchemaConverter(unittest.TestCase): + def test_iob2(self): + tags = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] + golden = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] + self.assertListEqual(golden, iob2(tags)) + + tags = ['I-ORG', 'O'] + golden = ['B-ORG', 'O'] + self.assertListEqual(golden, iob2(tags)) + + tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O'] + golden = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] + self.assertListEqual(golden, iob2(tags)) + + def test_iob2bemso(self): + tags = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O'] + golden = ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O'] + self.assertListEqual(golden, iob2bioes(tags)) + + +def test_conll2003_loader(): + path = '/hdd/fudanNLP/fastNLP/others/data/conll2003/train.txt' + loader = Conll2003DataLoader().load(path) + print(loader[:3]) + + +if __name__ == '__main__': + test_conll2003_loader() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py new 
file mode 100644 index 00000000..278ff42f --- /dev/null +++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -0,0 +1,42 @@ + + +from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding +from fastNLP.core.vocabulary import VocabularyOption + +from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF +from fastNLP import Trainer +from fastNLP import SpanFPreRecMetric +from fastNLP import BucketSampler +from fastNLP import Const +from torch.optim import SGD, Adam +from fastNLP import GradientClipCallback +from fastNLP.core.callback import FitlogCallback +import fitlog +fitlog.debug() + +from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader + +encoding_type = 'bioes' + +data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003', + word_vocab_opt=VocabularyOption(min_freq=3)) +print(data) +char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3]) +word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], + model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', + requires_grad=True) +word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() + +model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], + encoding_type=encoding_type) + +optimizer = Adam(model.parameters(), lr=0.001) + +callbacks = [GradientClipCallback(clip_type='value'), FitlogCallback({'test':data.datasets['test']}, verbose=1)] + +trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), + device=0, dev_data=data.datasets['dev'], batch_size=32, + metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), + callbacks=callbacks, num_workers=1, n_epochs=100) +trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py new file mode 100644 index 00000000..6f443dfd --- /dev/null +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -0,0 +1,39 @@ + + +from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding + +from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF +from fastNLP import Trainer +from fastNLP import SpanFPreRecMetric +from fastNLP import BucketSampler +from fastNLP import Const +from torch.optim import SGD, Adam +from fastNLP import GradientClipCallback +from fastNLP.core.callback import FitlogCallback +import fitlog +fitlog.debug() + +from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader + +encoding_type = 'bioes' + +data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english') +print(data) +char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], + kernel_sizes=[3]) +word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], + model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', + requires_grad=True) + +model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], + encoding_type=encoding_type) + +optimizer = Adam(model.parameters(), lr=0.001) + +callbacks = [GradientClipCallback(), 
FitlogCallback(data.datasets['test'], verbose=1)] + +trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), + device=1, dev_data=data.datasets['dev'], batch_size=32, + metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), + callbacks=callbacks, num_workers=1, n_epochs=100) +trainer.train() \ No newline at end of file From 4533427ea369851781f9c97b4b3fc5ac29d769a5 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Wed, 19 Jun 2019 11:14:41 +0800 Subject: [PATCH 11/34] =?UTF-8?q?sequence=20labeling=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ner/data/Conll2003Loader.py | 6 ++++-- .../ner/data/OntoNoteLoader.py | 3 ++- .../ner/model/lstm_cnn_crf.py | 18 ++++++------------ .../ner/train_cnn_lstm_crf_conll2003.py | 16 ++++++++++------ .../seqence_labelling/ner/train_ontonote.py | 4 ++-- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py index 65ed7ab8..037d6081 100644 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py @@ -63,8 +63,10 @@ class Conll2003DataLoader(DataSetLoader): data.datasets[name] = dataset # 对construct vocab - word_vocab = Vocabulary(min_freq=3) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) + word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) + # word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) + # TODO 这样感觉不规范呐 + word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py index bf1ab71e..5abfe7c5 100644 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py @@ -87,7 +87,8 @@ class OntoNoteNERDataLoader(DataSetLoader): # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') + # word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') + word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py index 79fa7a76..36d86651 100644 --- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py +++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py @@ -4,7 +4,7 @@ from torch import nn from fastNLP import seq_len_to_mask from fastNLP.modules import Embedding from fastNLP.modules import LSTM -from fastNLP.modules import ConditionalRandomField, allowed_transitions, TimestepDropout +from fastNLP.modules import ConditionalRandomField, allowed_transitions import torch.nn.functional as F from fastNLP import Const @@ -17,13 +17,12 @@ class CNNBiLSTMCRF(nn.Module): self.lstm = 
LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim, hidden_size=hidden_size//2, num_layers=num_layers, bidirectional=True, batch_first=True, dropout=dropout) - self.forward_fc = nn.Linear(hidden_size//2, len(tag_vocab)) - self.backward_fc = nn.Linear(hidden_size//2, len(tag_vocab)) + self.fc = nn.Linear(hidden_size, len(tag_vocab)) - transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=False) - self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=False, allowed_transitions=transitions) + transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True) + self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True, allowed_transitions=transitions) - self.dropout = TimestepDropout(dropout, inplace=True) + self.dropout = nn.Dropout(dropout, inplace=True) for name, param in self.named_parameters(): if 'ward_fc' in name: @@ -40,13 +39,8 @@ class CNNBiLSTMCRF(nn.Module): words = torch.cat([words, chars], dim=-1) outputs, _ = self.lstm(words, seq_len) self.dropout(outputs) - forwards, backwards = outputs.chunk(2, dim=-1) - # forward_logits = F.log_softmax(self.forward_fc(forwards), dim=-1) - # backward_logits = F.log_softmax(self.backward_fc(backwards), dim=-1) - - logits = self.forward_fc(forwards) + self.backward_fc(backwards) - self.dropout(logits) + logits = F.log_softmax(self.fc(outputs), dim=-1) if target is not None: loss = self.crf(logits, target, seq_len_to_mask(seq_len)) diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py index 278ff42f..507be4f6 100644 --- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py +++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -10,7 +10,8 @@ from fastNLP import BucketSampler from fastNLP import Const from torch.optim import SGD, Adam from fastNLP import GradientClipCallback -from fastNLP.core.callback import FitlogCallback +from fastNLP.core.callback import FitlogCallback, LRScheduler +from torch.optim.lr_scheduler import LambdaLR import fitlog fitlog.debug() @@ -19,7 +20,7 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003Dat encoding_type = 'bioes' data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003', - word_vocab_opt=VocabularyOption(min_freq=3)) + word_vocab_opt=VocabularyOption(min_freq=2)) print(data) char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], kernel_sizes=[3]) @@ -28,15 +29,18 @@ word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], requires_grad=True) word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() -model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], +model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) -optimizer = Adam(model.parameters(), lr=0.001) +optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) +scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) -callbacks = [GradientClipCallback(clip_type='value'), FitlogCallback({'test':data.datasets['test']}, verbose=1)] +callbacks = [GradientClipCallback(clip_type='value', clip_value=5), 
FitlogCallback({'test':data.datasets['test'], + 'train':data.datasets['train']}, verbose=1), + scheduler] trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), - device=0, dev_data=data.datasets['dev'], batch_size=32, + device=0, dev_data=data.datasets['dev'], batch_size=10, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), callbacks=callbacks, num_workers=1, n_epochs=100) trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py index 6f443dfd..e2a4158a 100644 --- a/reproduction/seqence_labelling/ner/train_ontonote.py +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -25,10 +25,10 @@ word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', requires_grad=True) -model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], +model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) -optimizer = Adam(model.parameters(), lr=0.001) +optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9) callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)] From 15c7c073beec5a7e5d5f2f1408bb7ba84150477e Mon Sep 17 00:00:00 2001 From: yunfan Date: Wed, 19 Jun 2019 17:04:25 +0800 Subject: [PATCH 12/34] fix embed_loader --- fastNLP/io/embed_loader.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 5237a8a7..e046f1df 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -72,7 +72,8 @@ class EmbedLoader(BaseLoader): for idx, line in enumerate(f, start_idx): try: parts = line.strip().split() - word = parts[0] + word = ''.join(parts[:-dim]) + nums = parts[-dim:] # 对齐unk与pad if word==padding and vocab.padding is not None: word = vocab.padding @@ -80,7 +81,7 @@ class EmbedLoader(BaseLoader): word = vocab.unknown if word in vocab: index = vocab.to_index(word) - matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) + matrix[index] = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim) hit_flags[index] = True except Exception as e: if error == 'ignore': @@ -135,10 +136,11 @@ class EmbedLoader(BaseLoader): for idx, line in enumerate(f, start=start): try: parts = line.strip().split() - word = parts[0] if dim == -1: dim = len(parts) - 1 - vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) + word = ''.join(parts[:-dim]) + nums = parts[-dim:] + vec = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim) vec_dict[word] = vec vocab.add_word(word) if unknown is not None and unknown == word: From a137038eb2cc840581adacdcfb76e685a2eed63b Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 19:43:53 +0800 Subject: [PATCH 13/34] =?UTF-8?q?=E4=BF=AE=E5=A4=8DELMO=E4=B8=8ELSTM?= =?UTF-8?q?=E6=97=A0=E6=B3=95=E4=BD=BF=E7=94=A8nn.DataParallel=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/file_utils.py | 4 ++-- fastNLP/modules/encoder/_bert.py | 2 +- fastNLP/modules/encoder/_elmo.py | 6 +++++- fastNLP/modules/encoder/lstm.py | 9 ++++++++- .../seqence_labelling/ner/data/Conll2003Loader.py | 4 +--- 
.../seqence_labelling/ner/data/OntoNoteLoader.py | 3 +-- 6 files changed, 18 insertions(+), 10 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 11c7ab64..d178626b 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -13,7 +13,7 @@ import hashlib def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: """ 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 - 将文件放入到 + 将文件放入到cache_dir中 """ if cache_dir is None: dataset_cache = Path(get_defalt_path()) @@ -88,7 +88,7 @@ def split_filename_suffix(filepath): def get_from_cache(url: str, cache_dir: Path = None) -> Path: """ 尝试在cache_dir中寻找url定义的资源; 如果没有找到。则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。 - 如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径 + 如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径。 """ cache_dir.mkdir(parents=True, exist_ok=True) diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py index 317b78d8..a860054d 100644 --- a/fastNLP/modules/encoder/_bert.py +++ b/fastNLP/modules/encoder/_bert.py @@ -791,7 +791,7 @@ class _WordBertModel(nn.Module): # +2是由于需要加入[CLS]与[SEP] word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) word_pieces[:, 0].fill_(self._cls_index) - word_pieces[:, word_pieces_lengths+1] = self._sep_index + word_pieces[torch.arange(batch_size).to(words), word_pieces_lengths+1] = self._sep_index attn_masks = torch.zeros_like(word_pieces) # 1. 获取words的word_pieces的id,以及对应的span范围 word_indexes = words.tolist() diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index 1f400f1d..7fa29201 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -16,6 +16,7 @@ import json from ..utils import get_dropout_mask import codecs +from torch import autograd class LstmCellWithProjection(torch.nn.Module): """ @@ -760,7 +761,10 @@ class _ElmoModel(nn.Module): token_embedding = self.token_embedder(expanded_words, chars) if self.config['encoder']['name'] == 'elmo': encoder_output = self.encoder(token_embedding, seq_len) - sz = encoder_output.size() + if encoder_output.size(2) < max_len: + dummy_tensor = autograd.Variable(torch.zeros(batch_size, max_len - encoder_output.size(2), encoder_output.size(-1))) + encoder_output = torch.cat([encoder_output, dummy_tensor], 1) + sz = encoder_output.size() # batch_size, max_len, hidden_size token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat([token_embedding, encoder_output], dim=0) elif self.config['encoder']['name'] == 'lstm': diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 537a446d..0118d6d7 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -11,13 +11,15 @@ import torch.nn as nn import torch.nn.utils.rnn as rnn from ..utils import initial_parameter +from torch import autograd class LSTM(nn.Module): """ 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM` - LSTM 模块, 轻量封装的Pytorch LSTM + LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 + 为1; 且可以应对DataParallel中LSTM的使用问题 :param input_size: 输入 `x` 的特征维度 :param hidden_size: 隐状态 `h` 的特征维度. @@ -59,6 +61,7 @@ class LSTM(nn.Module): :return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列 和 [batch, hidden_size*num_direction] 最后时刻隐状态. 
""" + batch_size, max_len, _ = x.size() if h0 is not None and c0 is not None: hx = (h0, c0) else: @@ -77,6 +80,10 @@ class LSTM(nn.Module): output = output[unsort_idx] else: output = output[:, unsort_idx] + # 解决LSTM无法在DataParallel下使用的问题问题https://github.com/pytorch/pytorch/issues/1591 + if output.size(1) < max_len: + dummy_tensor = autograd.Variable(torch.zeros(batch_size, max_len - output.size(1), output.size(-1))) + output = torch.cat([output, dummy_tensor], 1) else: output, hx = self.lstm(x, hx) return output, hx diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py index 037d6081..3140af18 100644 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py @@ -16,7 +16,7 @@ class Conll2003DataLoader(DataSetLoader): 加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos 时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回 的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但 - 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的中该值 + 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的-DOCTSTART-开头的行 ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。 :param task: 指定需要标注任务。可选ner, pos, chunk @@ -64,8 +64,6 @@ class Conll2003DataLoader(DataSetLoader): # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - # word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT) - # TODO 这样感觉不规范呐 word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py index 5abfe7c5..fe0236ad 100644 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py @@ -87,14 +87,13 @@ class OntoNoteNERDataLoader(DataSetLoader): # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - # word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab # cap words cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words') + cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') input_fields.append('cap_words') data.vocabs['cap_words'] = cap_word_vocab From c4e131a0c551af3d5d22b3d53b673167eba7b613 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 22:19:41 +0800 Subject: [PATCH 14/34] =?UTF-8?q?=E9=87=8D=E6=96=B0=E4=BF=AE=E6=94=B9ELMO?= =?UTF-8?q?=E4=B8=8ELSTM=20DataParallel=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/modules/encoder/_elmo.py | 5 +++-- fastNLP/modules/encoder/lstm.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index 
7fa29201..11feead6 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -762,8 +762,9 @@ class _ElmoModel(nn.Module): if self.config['encoder']['name'] == 'elmo': encoder_output = self.encoder(token_embedding, seq_len) if encoder_output.size(2) < max_len: - dummy_tensor = autograd.Variable(torch.zeros(batch_size, max_len - encoder_output.size(2), encoder_output.size(-1))) - encoder_output = torch.cat([encoder_output, dummy_tensor], 1) + dummy_tensor = encoder_output.new_zeros(encoder_output.size(0), batch_size, + max_len - encoder_output.size(2), encoder_output.size(-1)) + encoder_output = torch.cat([encoder_output, dummy_tensor], 2) sz = encoder_output.size() # batch_size, max_len, hidden_size token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat([token_embedding, encoder_output], dim=0) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 0118d6d7..2966426a 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -82,7 +82,7 @@ class LSTM(nn.Module): output = output[:, unsort_idx] # 解决LSTM无法在DataParallel下使用的问题问题https://github.com/pytorch/pytorch/issues/1591 if output.size(1) < max_len: - dummy_tensor = autograd.Variable(torch.zeros(batch_size, max_len - output.size(1), output.size(-1))) + dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1)) output = torch.cat([output, dummy_tensor], 1) else: output, hx = self.lstm(x, hx) From 1167d3b58788aa675e914aed5980f7956a67e713 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 22:35:44 +0800 Subject: [PATCH 15/34] =?UTF-8?q?=E5=86=8D=E6=AC=A1=E4=BF=AE=E6=94=B9elmo?= =?UTF-8?q?=E7=9A=84dataparallel=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/modules/encoder/_elmo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index 11feead6..ab43d32f 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -761,11 +761,11 @@ class _ElmoModel(nn.Module): token_embedding = self.token_embedder(expanded_words, chars) if self.config['encoder']['name'] == 'elmo': encoder_output = self.encoder(token_embedding, seq_len) - if encoder_output.size(2) < max_len: + if encoder_output.size(2) < max_len+2: dummy_tensor = encoder_output.new_zeros(encoder_output.size(0), batch_size, - max_len - encoder_output.size(2), encoder_output.size(-1)) + max_len + 2 - encoder_output.size(2), encoder_output.size(-1)) encoder_output = torch.cat([encoder_output, dummy_tensor], 2) - sz = encoder_output.size() # batch_size, max_len, hidden_size + sz = encoder_output.size() # 2, batch_size, max_len, hidden_size token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat([token_embedding, encoder_output], dim=0) elif self.config['encoder']['name'] == 'lstm': From 76e2330a2ee54db35457bbe65fdc2db2c9680bb3 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 23:13:53 +0800 Subject: [PATCH 16/34] =?UTF-8?q?=E5=A2=9E=E5=8A=A0seq=5Flen=5Fto=5Fmask?= =?UTF-8?q?=E5=AF=B9=E5=A4=9A=E5=8D=A1=E5=9C=BA=E6=99=AF=E7=9A=84=E6=94=AF?= =?UTF-8?q?=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/fastNLP/core/utils.py b/fastNLP/core/utils.py index 1eb2b70e..df3c45cb 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -643,7 +643,7 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level): warnings.warn(message=_unused_warn) -def seq_len_to_mask(seq_len): +def seq_len_to_mask(seq_len, max_len=None): """ 将一个表示sequence length的一维数组转换为二维的mask,不包含的位置为0。 @@ -661,18 +661,20 @@ def seq_len_to_mask(seq_len): (14, 15) :param np.ndarray,torch.LongTensor seq_len: shape将是(B,) + :param int max_len: 将长度pad到这个长度. 默认使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有 + 区别,所以需要传入一个max_len使得mask的长度是pad到该长度。 :return: np.ndarray or torch.Tensor, shape将是(B, max_length)。 元素类似为bool或torch.uint8 """ if isinstance(seq_len, np.ndarray): assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." - max_len = int(seq_len.max()) + max_len = max(max_len, int(seq_len.max())) if max_len else int(seq_len.max()) broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) mask = broad_cast_seq_len < seq_len.reshape(-1, 1) elif isinstance(seq_len, torch.Tensor): assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}." batch_size = seq_len.size(0) - max_len = seq_len.max().long() + max_len = max(max_len, seq_len.max().long()) if max_len else seq_len.max().long() broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len) mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) else: From 8a766f070b1def33cb59c5193d3ebe69a1715316 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 23:20:12 +0800 Subject: [PATCH 17/34] =?UTF-8?q?seq=5Flen=5Fto=5Fmask=E4=BF=AE=E6=94=B9?= =?UTF-8?q?=E4=B8=BA=E7=9B=B4=E6=8E=A5=E4=BD=BF=E7=94=A8max=5Flen=E8=80=8C?= =?UTF-8?q?=E4=B8=8D=E5=86=8D=E5=92=8C=E5=8F=A5=E4=B8=AD=E6=9C=80=E5=A4=A7?= =?UTF-8?q?=E9=95=BF=E5=BA=A6=E5=AF=B9=E6=AF=94?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/utils.py | 10 +++++++--- test/core/test_utils.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index df3c45cb..d26df966 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -659,22 +659,26 @@ def seq_len_to_mask(seq_len, max_len=None): >>> mask = seq_len_to_mask(seq_len) >>> print(mask.shape) (14, 15) + >>> seq_len = torch.arange(2, 16) + >>> mask = seq_len_to_mask(seq_len, max_len=100) + >>>print(mask.size()) + torch.Size([14, 100]) :param np.ndarray,torch.LongTensor seq_len: shape将是(B,) - :param int max_len: 将长度pad到这个长度. 默认使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有 + :param int max_len: 将长度pad到这个长度。默认(None)使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有 区别,所以需要传入一个max_len使得mask的长度是pad到该长度。 :return: np.ndarray or torch.Tensor, shape将是(B, max_length)。 元素类似为bool或torch.uint8 """ if isinstance(seq_len, np.ndarray): assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}." - max_len = max(max_len, int(seq_len.max())) if max_len else int(seq_len.max()) + max_len = int(max_len) if max_len else int(seq_len.max()) broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1)) mask = broad_cast_seq_len < seq_len.reshape(-1, 1) elif isinstance(seq_len, torch.Tensor): assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}." 
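# (Illustrative aside, not part of this patch: a hedged sketch of why the new
#  ``max_len`` argument matters. Under nn.DataParallel each replica only sees a
#  slice of the batch, so a mask built from the local seq_len alone can end up
#  narrower than the padded input; passing the padded width explicitly keeps
#  all replicas aligned. One plausible pattern inside a model's forward, with
#  names chosen only for illustration:
#
#      from fastNLP import seq_len_to_mask
#
#      def forward(self, words, seq_len):
#          # words: [batch, max_len] -> pad the mask to the same width as words
#          mask = seq_len_to_mask(seq_len, max_len=words.size(1))
#          ...
#  )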
batch_size = seq_len.size(0) - max_len = max(max_len, seq_len.max().long()) if max_len else seq_len.max().long() + max_len = int(max_len) if max_len else seq_len.max().long() broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len) mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1)) else: diff --git a/test/core/test_utils.py b/test/core/test_utils.py index e3e019c6..a3e8bdf6 100644 --- a/test/core/test_utils.py +++ b/test/core/test_utils.py @@ -237,6 +237,11 @@ class TestSeqLenToMask(unittest.TestCase): with self.assertRaises(AssertionError): mask = seq_len_to_mask(seq_len) + # 3. pad到指定长度 + seq_len = np.random.randint(1, 10, size=(10,)) + mask = seq_len_to_mask(seq_len, 100) + self.assertEqual(100, mask.size(1)) + def test_pytorch_seq_len(self): # 1. 随机测试 @@ -250,3 +255,8 @@ class TestSeqLenToMask(unittest.TestCase): seq_len = torch.randn(3, 4) with self.assertRaises(AssertionError): mask = seq_len_to_mask(seq_len) + + # 3. pad到指定长度 + seq_len = torch.randint(1, 10, size=(10, )) + mask = seq_len_to_mask(seq_len, 100) + self.assertEqual(100, mask.size(1)) \ No newline at end of file From 6b9bc007ee5fbd759591a7c704a33ae732939afe Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 23:49:01 +0800 Subject: [PATCH 18/34] =?UTF-8?q?LSTM=E4=B8=AD=E4=BF=AE=E5=A4=8D=E6=BD=9C?= =?UTF-8?q?=E5=9C=A8=E7=9A=84DataParallel=E5=8F=AF=E8=83=BD=E5=AD=98?= =?UTF-8?q?=E5=9C=A8=E7=9A=84=E9=97=AE=E9=A2=98,=20=E5=B9=B6=E4=B8=94?= =?UTF-8?q?=E5=88=A0=E9=99=A4init=5Fmethod=E5=8F=82=E6=95=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/modules/encoder/lstm.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 2966426a..1cc0dec1 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -19,7 +19,7 @@ class LSTM(nn.Module): 别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM` LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下,将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化 - 为1; 且可以应对DataParallel中LSTM的使用问题 + 为1; 且可以应对DataParallel中LSTM的使用问题。 :param input_size: 输入 `x` 的特征维度 :param hidden_size: 隐状态 `h` 的特征维度. 
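A minimal sketch (illustrative only, not part of the patches; tensor shapes and device ids are arbitrary) of the nn.DataParallel scenario these LSTM changes target: the batch is scattered across devices, each replica may see a shorter local maximum length than the padded input, and pack_padded_sequence would otherwise return an output narrower than the input, so the wrapper pads its output back to the incoming max_len before results are gathered.

    import torch
    from torch import nn
    from fastNLP.modules import LSTM

    lstm = LSTM(input_size=100, hidden_size=200, bidirectional=True)
    model = nn.DataParallel(lstm, device_ids=[0, 1])

    x = torch.randn(8, 30, 100)                                 # batch of 8, padded to length 30
    seq_len = torch.LongTensor([30, 28, 25, 20, 18, 12, 9, 5])  # true lengths per sample
    output, _ = model(x, seq_len)                               # each replica pads back to width 30, so gather succeeds
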
@@ -32,13 +32,12 @@ class LSTM(nn.Module): """ def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, - bidirectional=False, bias=True, initial_method=None): + bidirectional=False, bias=True): super(LSTM, self).__init__() self.batch_first = batch_first self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first, dropout=dropout, bidirectional=bidirectional) self.init_param() - initial_parameter(self, initial_method) def init_param(self): for name, param in self.named_parameters(): @@ -81,9 +80,14 @@ class LSTM(nn.Module): else: output = output[:, unsort_idx] # 解决LSTM无法在DataParallel下使用的问题问题https://github.com/pytorch/pytorch/issues/1591 - if output.size(1) < max_len: - dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1)) - output = torch.cat([output, dummy_tensor], 1) + if self.batch_first: + if output.size(1) < max_len: + dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1)) + output = torch.cat([output, dummy_tensor], 0) + else: + if output.size(0) < max_len: + dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1)) + output = torch.cat([output, dummy_tensor], 1) else: output, hx = self.lstm(x, hx) return output, hx From 0f4cf3030130af85d29d14b44c0c2d0ef832f9de Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 19 Jun 2019 23:59:40 +0800 Subject: [PATCH 19/34] =?UTF-8?q?LSTM=E4=BF=AE=E6=94=B9=E9=94=99=E8=AF=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/modules/encoder/lstm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 1cc0dec1..10d0e339 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -82,12 +82,12 @@ class LSTM(nn.Module): # 解决LSTM无法在DataParallel下使用的问题问题https://github.com/pytorch/pytorch/issues/1591 if self.batch_first: if output.size(1) < max_len: - dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1)) - output = torch.cat([output, dummy_tensor], 0) - else: - if output.size(0) < max_len: dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1)) output = torch.cat([output, dummy_tensor], 1) + else: + if output.size(0) < max_len: + dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1)) + output = torch.cat([output, dummy_tensor], 0) else: output, hx = self.lstm(x, hx) return output, hx From 8f7ed074410a098aa14b6fbda5a610d7a3951db6 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Fri, 21 Jun 2019 11:06:35 +0800 Subject: [PATCH 20/34] =?UTF-8?q?1.=20=E5=9C=A8vocabulary=E7=9A=84from=5Fd?= =?UTF-8?q?ataset=E4=B8=AD=E5=A2=9E=E5=8A=A0no=5Fcreate=5Fentry=5Fdataset?= =?UTF-8?q?=E9=80=89=E9=A1=B9=EF=BC=8C=E7=94=A8=E4=BA=8E=E4=BC=A0=E9=80=92?= =?UTF-8?q?dev=E5=92=8Ctest=202.=20=E8=B0=83=E6=95=B4=E5=90=84=E7=A7=8DEmb?= =?UTF-8?q?edding=E7=9A=84=E5=AE=9E=E7=8E=B0=EF=BC=8C=E4=BD=BF=E5=BE=97?= =?UTF-8?q?=E7=A1=AE=E4=BF=9D=E6=9D=A5=E8=87=AAdev=E5=92=8Ctest=E7=9A=84?= =?UTF-8?q?=E6=9C=AA=E5=8F=91=E7=8E=B0=E8=AF=8D=E4=BD=BF=E7=94=A8unk?= =?UTF-8?q?=E7=9A=84=E8=A1=A8=E7=A4=BA=203.=20=E5=9C=A8Embedding=E4=B8=AD?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0dropout=5Fword=E7=9A=84=E9=80=89=E9=A1=B9?= =?UTF-8?q?=EF=BC=8C=E4=BD=BF=E5=BE=97=E5=8F=AF=E4=BB=A5=E9=9A=8F=E6=9C=BA?= =?UTF-8?q?drop=E6=8E=89=E8=AF=8D=E8=AF=AD=204.=20=E4=BB=A5=E5=8F=8A?= 
=?UTF-8?q?=E5=85=B6=E5=AE=83=E8=8B=A5=E5=B9=B2=E5=B0=8F=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/field.py | 10 +- fastNLP/core/vocabulary.py | 70 +++++++-- fastNLP/modules/encoder/_bert.py | 122 +++++++++------ fastNLP/modules/encoder/_elmo.py | 11 +- fastNLP/modules/encoder/bert.py | 28 ++-- fastNLP/modules/encoder/embedding.py | 145 ++++++++++++++++-- fastNLP/modules/encoder/lstm.py | 4 +- .../ner/data/Conll2003Loader.py | 9 +- .../ner/data/OntoNoteLoader.py | 30 +++- test/core/test_vocabulary.py | 18 +++ 10 files changed, 340 insertions(+), 107 deletions(-) diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 7dc29ba3..1c0ad235 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -242,7 +242,7 @@ class FieldArray: new_contents.append(cell.split(sep)) except Exception as e: print(f"Exception happens when process value in index {index}.") - print(e) + raise e return self._after_process(new_contents, inplace=inplace) def int(self, inplace:bool=True): @@ -282,7 +282,7 @@ class FieldArray: new_contents.append(float(cell)) except Exception as e: print(f"Exception happens when process value in index {index}.") - print(e) + raise e return self._after_process(new_contents, inplace=inplace) def bool(self, inplace=True): @@ -302,7 +302,7 @@ class FieldArray: new_contents.append(bool(cell)) except Exception as e: print(f"Exception happens when process value in index {index}.") - print(e) + raise e return self._after_process(new_contents, inplace=inplace) @@ -323,7 +323,7 @@ class FieldArray: new_contents.append(cell.lower()) except Exception as e: print(f"Exception happens when process value in index {index}.") - print(e) + raise e return self._after_process(new_contents, inplace=inplace) def upper(self, inplace=True): @@ -343,7 +343,7 @@ class FieldArray: new_contents.append(cell.upper()) except Exception as e: print(f"Exception happens when process value in index {index}.") - print(e) + raise e return self._after_process(new_contents, inplace=inplace) def value_count(self): diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 1d5d6f32..66aabd3d 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -4,10 +4,11 @@ __all__ = [ ] from functools import wraps -from collections import Counter +from collections import Counter, defaultdict from .dataset import DataSet from .utils import Option - +from functools import partial +import numpy as np class VocabularyOption(Option): def __init__(self, @@ -89,7 +90,9 @@ class Vocabulary(object): self.word2idx = None self.idx2word = None self.rebuild = True - + # 用于承载不需要单独创建entry的词语,具体见from_dataset()方法 + self._no_create_word = defaultdict(int) + @_check_build_status def update(self, word_lst): """依次增加序列中词在词典中的出现频率 @@ -240,8 +243,12 @@ class Vocabulary(object): raise e else: raise RuntimeError("Only DataSet type is allowed.") - - def from_dataset(self, *datasets, field_name): + + @property + def _no_create_word_length(self): + return len(self._no_create_word) + + def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None): """ 使用dataset的对应field中词构建词典:: @@ -253,6 +260,13 @@ class Vocabulary(object): 构建词典所使用的 field(s), 支持一个或多个field 若有多个 DataSet, 每个DataSet都必须有这些field. 
目前仅支持的field结构: ``str`` , ``list(str)`` , ``list(list(str))`` + :param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain + 的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev + 中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。 + 如果一个词出现在了train中,但是没在预训练模型中,embedding会为它用unk初始化,但它是单独的一个vector,如果 + finetune embedding的话,这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector, + 而应该让它指向unk这个vector的值。所以只位于no_create_entry_dataset中的token,将首先从预训练的词表中寻找它的表示, + 如果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。 :return self: """ if isinstance(field_name, str): @@ -260,19 +274,28 @@ class Vocabulary(object): elif not isinstance(field_name, list): raise TypeError('invalid argument field_name: {}'.format(field_name)) - def construct_vocab(ins): + def construct_vocab(ins, no_create_entry=False): for fn in field_name: field = ins[fn] if isinstance(field, str): + if no_create_entry and field not in self.word_count: + self._no_create_word[field] += 1 self.add_word(field) - elif isinstance(field, list): - if not isinstance(field[0], list): - self.add_word_lst(field) + elif isinstance(field, (list, np.ndarray)): + if not isinstance(field[0], (list, np.ndarray)): + for word in field: + if no_create_entry and word not in self.word_count: + self._no_create_word[word] += 1 + self.add_word(word) else: - if isinstance(field[0][0], list): + if isinstance(field[0][0], (list, np.ndarray)): raise RuntimeError("Only support field with 2 dimensions.") - [self.add_word_lst(w) for w in field] - + for words in field: + for word in words: + if no_create_entry and word not in self.word_count: + self._no_create_word[word] += 1 + self.add_word(word) + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: @@ -281,9 +304,27 @@ class Vocabulary(object): print("When processing the `{}` dataset, the following error occurred.".format(idx)) raise e else: - raise RuntimeError("Only DataSet type is allowed.") + raise TypeError("Only DataSet type is allowed.") + + if no_create_entry_dataset is not None: + partial_construct_vocab = partial(construct_vocab, no_create_entry=True) + if isinstance(no_create_entry_dataset, DataSet): + no_create_entry_dataset.apply(partial_construct_vocab) + elif isinstance(no_create_entry_dataset, list): + for dataset in no_create_entry_dataset: + if not isinstance(dataset, DataSet): + raise TypeError("Only DataSet type is allowed.") + dataset.apply(partial_construct_vocab) return self - + + def _is_word_no_create_entry(self, word): + """ + 判断当前的word是否是不需要创建entry的,具体参见from_dataset的说明 + :param word: str + :return: bool + """ + return word in self._no_create_word + def to_index(self, w): """ 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 @@ -338,6 +379,7 @@ class Vocabulary(object): self.word2idx = None self.idx2word = None self.rebuild = True + self._no_create_word.clear() def __getstate__(self): """Use to prepare data for pickle. diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py index a860054d..a0353279 100644 --- a/fastNLP/modules/encoder/_bert.py +++ b/fastNLP/modules/encoder/_bert.py @@ -21,6 +21,7 @@ import os import torch from torch import nn +import glob CONFIG_FILE = 'bert_config.json' MODEL_WEIGHTS = 'pytorch_model.bin' @@ -346,7 +347,12 @@ class BertModel(nn.Module): # Instantiate model. 
model = cls(*inputs, **config, **kwargs) if state_dict is None: - weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS) + files = glob.glob(os.path.join(pretrained_model_dir, '*.bin')) + if len(files)==0: + raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}") + elif len(files)>1: + raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}") + weights_path = files[0] state_dict = torch.load(weights_path) old_keys = [] @@ -390,16 +396,6 @@ class BertModel(nn.Module): return model - - - - - - - - - - def whitespace_tokenize(text): """Runs basic whitespace cleaning and splitting on a piece of text.""" text = text.strip() @@ -671,6 +667,16 @@ class BertTokenizer(object): self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) self.max_len = max_len if max_len is not None else int(1e12) + def _reinit_on_new_vocab(self, vocab): + """ + 在load bert之后,可能会对vocab进行重新排列。重新排列之后调用这个函数重新初始化与vocab相关的性质 + + :param vocab: + :return: + """ + self.vocab = vocab + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + def tokenize(self, text): split_tokens = [] if self.do_basic_tokenize: @@ -706,6 +712,8 @@ class BertTokenizer(object): index = 0 if os.path.isdir(vocab_path): vocab_file = os.path.join(vocab_path, VOCAB_NAME) + else: + vocab_file = vocab_path with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: @@ -751,11 +759,44 @@ class _WordBertModel(nn.Module): assert pool_method in ('avg', 'max', 'first', 'last') self.pool_method = pool_method - self.include_cls_sep = include_cls_sep # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] print("Start to generating word pieces for word.") + # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 + word_piece_dict = {'[CLS]':1, '[SEP]':1} # 用到的word_piece以及新增的 + found_count = 0 + for word, index in vocab: + if index == vocab.padding_idx: # pad是个特殊的符号 + word = '[PAD]' + elif index == vocab.unknown_idx: + word = '[UNK]' + word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) + if len(word_pieces)==1: + if not vocab._is_word_no_create_entry(word): # 如果是train中的值, 但是却没有找到 + if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # 说明这个词不在原始的word里面 + word_piece_dict[word] = 1 # 新增一个值 + continue + for word_piece in word_pieces: + word_piece_dict[word_piece] = 1 + found_count += 1 + original_embed = self.encoder.embeddings.word_embeddings.weight.data + # 特殊词汇要特殊处理 + embed = nn.Embedding(len(word_piece_dict), original_embed.size(1)) # 新的embed + new_word_piece_vocab = collections.OrderedDict() + for index, token in enumerate(['[PAD]', '[UNK]']): + word_piece_dict.pop(token, None) + embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]] + new_word_piece_vocab[token] = index + for token in word_piece_dict.keys(): + if token in self.tokenzier.vocab: + embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]] + else: + embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']] + new_word_piece_vocab[token] = len(new_word_piece_vocab) + self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) + self.encoder.embeddings.word_embeddings = embed + word_to_wordpieces = [] word_pieces_lengths = [] for word, index in vocab: @@ -767,12 +808,11 @@ class _WordBertModel(nn.Module): word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) word_to_wordpieces.append(word_pieces) 
word_pieces_lengths.append(len(word_pieces)) - self._cls_index = len(vocab) - self._sep_index = len(vocab) + 1 + print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab))) + self._cls_index = self.tokenzier.vocab['[CLS]'] + self._sep_index = self.tokenzier.vocab['[SEP]'] self._pad_index = vocab.padding_idx - self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # 需要用于生成word_piece - word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]'])) - word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]'])) + self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) print("Successfully generate word pieces.") @@ -850,7 +890,7 @@ class _WordPieceBertModel(nn.Module): 这个模块用于直接计算word_piece的结果. """ - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'): + def __init__(self, model_dir:str, layers:str='-1'): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) @@ -866,44 +906,34 @@ class _WordPieceBertModel(nn.Module): assert layer 0 and vocab._no_create_word_length > 0: # 需要映射,使得来自于dev, test的idx指向unk + words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) + for word, idx in vocab: + if vocab._is_word_no_create_entry(word): + words_to_words[idx] = vocab.unknown_idx + setattr(self.token_embedder, 'words_to_words', words_to_words) self.output_dim = config['encoder']['projection_dim'] if config['encoder']['name'].lower() == 'elmo': diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 4948d022..2ddb37ff 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -2,21 +2,19 @@ import os from torch import nn import torch -from ...core.vocabulary import Vocabulary from ...io.file_utils import _get_base_url, cached_path from ._bert import _WordPieceBertModel, BertModel - class BertWordPieceEncoder(nn.Module): """ - 可以通过读取vocabulary使用的Bert的Encoder。传入vocab,然后调用index_datasets方法在vocabulary中生成word piece的表示。 + 读取bert模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 :param fastNLP.Vocabulary vocab: 词表 :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1', + def __init__(self, model_dir_or_name:str='en-base-uncased', layers:str='-1', requires_grad:bool=False): super().__init__() PRETRAIN_URL = _get_base_url('bert') @@ -44,7 +42,7 @@ class BertWordPieceEncoder(nn.Module): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers) + self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers) self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad @@ -69,27 +67,27 @@ class BertWordPieceEncoder(nn.Module): def embed_size(self): return self._embed_size - def index_datasets(self, *datasets): + def index_datasets(self, *datasets, field_name): """ - 根据datasets中的'words'列对datasets进行word piece的index。 - - Example:: + 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 + [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 - :param datasets: + :param datasets: 
DataSet对象 + :param field_name: str基于哪一列index :return: """ - self.model.index_dataset(*datasets) + self.model.index_dataset(*datasets, field_name=field_name) + - def forward(self, words, token_type_ids=None): + def forward(self, word_pieces, token_type_ids=None): """ - 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 - 删除这两个表示。 + 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 :param words: batch_size x max_len :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) """ - outputs = self.model(words, token_type_ids) + outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) return outputs \ No newline at end of file diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 637026e5..46e393b1 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -13,7 +13,7 @@ from .lstm import LSTM from ...core.vocabulary import Vocabulary from abc import abstractmethod import torch -from ...io import EmbedLoader +import numpy as np import torch.nn.functional as F import os from ._elmo import _ElmoModel @@ -21,6 +21,7 @@ from ...io.file_utils import cached_path, _get_base_url from ._bert import _WordBertModel from typing import List +import warnings from ...core.dataset import DataSet from ...core.batch import DataSetIter from ...core.sampler import SequentialSampler @@ -33,13 +34,15 @@ class Embedding(nn.Module): Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" - def __init__(self, init_embed, dropout=0.0): + def __init__(self, init_embed, dropout=0.0, dropout_word=0, unk_index=None): """ :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding; 也可以传入TokenEmbedding对象 :param float dropout: 对Embedding的输出的dropout。 + :param float dropout_word: 按照一定比例随机将word设置为unk的idx,这样可以使得unk这个token得到足够的训练 + :param int unk_index: drop word时替换为的index,如果init_embed为TokenEmbedding不需要传入该值。 """ super(Embedding, self).__init__() @@ -48,20 +51,32 @@ class Embedding(nn.Module): self.dropout = nn.Dropout(dropout) if not isinstance(self.embed, TokenEmbedding): self._embed_size = self.embed.weight.size(1) + if dropout_word>0 and isinstance(unk_index, int): + raise ValueError("When drop word is set, you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size - + unk_index = self.embed.get_word_vocab().unknown_idx + self.unk_index = unk_index + self.dropout_word = dropout_word + def forward(self, x): """ :param torch.LongTensor x: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ + if self.dropout_word>0 and self.training: + mask = torch.ones_like(x).float() * self.dropout_word + mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + x = x.masked_fill(mask, self.unk_index) x = self.embed(x) return self.dropout(x) @property def num_embedding(self)->int: - return len(self) + if isinstance(self.embed, nn.Embedding): + return self.embed.weight.size(0) + else: + return self.embed.num_embedding def __len__(self): return len(self.embed) @@ -95,7 +110,7 @@ class Embedding(nn.Module): @property def size(self): if isinstance(self.embed, TokenEmbedding): - return torch.Size(self.embed._word_vocab, self.embed.embed_size) + return self.embed.size else: return self.embed.weight.size() @@ -131,6 +146,10 @@ 
class TokenEmbedding(nn.Module): def embed_size(self) -> int: return self._embed_size + @property + def num_embedding(self) -> int: + return len(self._word_vocab) + def get_word_vocab(self): """ 返回embedding的词典。 @@ -141,7 +160,7 @@ class TokenEmbedding(nn.Module): @property def size(self): - return torch.Size(self.embed._word_vocab, self._embed_size) + return torch.Size(self.num_embedding, self._embed_size) class StaticEmbedding(TokenEmbedding): @@ -159,11 +178,12 @@ class StaticEmbedding(TokenEmbedding): :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding 的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, `en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 - :param requires_grad: 是否需要gradient + :param requires_grad: 是否需要gradient. 默认为True + :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用torch.nn.init.xavier_uniform_ + 。调用该方法时传入一个tensor对象。 """ - - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False): + def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None): super(StaticEmbedding, self).__init__(vocab) # 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server, @@ -190,15 +210,105 @@ class StaticEmbedding(TokenEmbedding): raise ValueError(f"Cannot recognize {model_dir_or_name}.") # 读取embedding - embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab) - embedding = torch.tensor(embedding) + embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], padding_idx=vocab.padding_idx, max_norm=None, norm_type=2, scale_grad_by_freq=False, sparse=False, _weight=embedding) + if vocab._no_create_word_length > 0: # 需要映射,使得来自于dev, test的idx指向unk + words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False) + for word, idx in vocab: + if vocab._is_word_no_create_entry(word) and not hit_flags[idx]: + words_to_words[idx] = vocab.unknown_idx + self.words_to_words = words_to_words self._embed_size = self.embedding.weight.size(1) self.requires_grad = requires_grad + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + requires_grads = set([param.requires_grad for name, param in self.named_parameters() + if 'words_to_words' not in name]) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for name, param in self.named_parameters(): + if 'words_to_words' in name: + continue + param.requires_grad = value + + def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', unknown='', normalize=True, + error='ignore', init_method=None): + """ + 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 + word2vec(第一行只有两个元素)还是glove格式的数据。 + + :param str embed_filepath: 预训练的embedding的路径。 + :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 + 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 + :param dtype: 读出的embedding的类型 + :param str padding: 词表中padding的token + :param str unknown: 词表中unknown的token + :param bool normalize: 是否将每个vector归一化到norm为1 + :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 + 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 + :param 
init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用torch.nn.init.zeros_ + :return torch.tensor: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 + """ + assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." + if not os.path.exists(embed_filepath): + raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) + if init_method is None: + init_method = nn.init.xavier_uniform_ + with open(embed_filepath, 'r', encoding='utf-8') as f: + found_count = 0 + line = f.readline().strip() + parts = line.split() + start_idx = 0 + if len(parts) == 2: + dim = int(parts[1]) + start_idx += 1 + else: + dim = len(parts) - 1 + f.seek(0) + matrix = torch.zeros(len(vocab), dim) + init_method(matrix) + hit_flags = np.zeros(len(vocab), dtype=bool) + for idx, line in enumerate(f, start_idx): + try: + parts = line.strip().split() + word = ''.join(parts[:-dim]) + nums = parts[-dim:] + # 对齐unk与pad + if word == padding and vocab.padding is not None: + word = vocab.padding + elif word == unknown and vocab.unknown is not None: + word = vocab.unknown + if word in vocab: + index = vocab.to_index(word) + matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) + found_count += 1 + hit_flags[index] = True + except Exception as e: + if error == 'ignore': + warnings.warn("Error occurred at the {} line.".format(idx)) + else: + print("Error occurred at the {} line.".format(idx)) + raise e + print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + + if normalize: + matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12) + + return matrix, hit_flags + def forward(self, words): """ 传入words的index @@ -206,6 +316,8 @@ class StaticEmbedding(TokenEmbedding): :param words: torch.LongTensor, [batch_size, max_len] :return: torch.FloatTensor, [batch_size, max_len, embed_size] """ + if hasattr(self, 'words_to_words'): + words = self.words_to_words[words] return self.embedding(words) @@ -382,7 +494,7 @@ class ElmoEmbedding(ContextualEmbedding): :return: """ requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_chars_embedding' not in name]) + if 'words_to_chars_embedding' not in name and 'words_to_words' not in name]) if len(requires_grads) == 1: return requires_grads.pop() else: @@ -391,7 +503,7 @@ class ElmoEmbedding(ContextualEmbedding): @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name: # 这个不能加入到requires_grad中 + if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中 continue param.requires_grad = value @@ -501,7 +613,8 @@ def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): """ char_vocab = Vocabulary(min_freq=min_freq) for word, index in vocab: - char_vocab.add_word_lst(list(word)) + if not vocab._is_word_no_create_entry(word): + char_vocab.add_word_lst(list(word)) return char_vocab @@ -566,7 +679,7 @@ class CNNCharEmbedding(TokenEmbedding): requires_grad=False) self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) for word, index in vocab: - # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。 修改为不区分pad, 这样所有的也是同一个embed + # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的也是同一个embed self.words_to_chars_embedding[index, :len(word)] = \ torch.LongTensor([self.char_vocab.to_index(c) for c in word]) self.word_lengths[index] = len(word) @@ -638,7 +751,7 @@ class 
CNNCharEmbedding(TokenEmbedding): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue if param.data.dim()>1: - nn.init.xavier_normal_(param, 1) + nn.init.xavier_uniform_(param, 1) else: nn.init.uniform_(param, -1, 1) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index 10d0e339..c48f2683 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -55,8 +55,8 @@ class LSTM(nn.Module): :param x: [batch, seq_len, input_size] 输入序列 :param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None`` - :param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全1向量. Default: ``None`` - :param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全1向量. Default: ``None`` + :param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None`` + :param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None`` :return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列 和 [batch, hidden_size*num_direction] 最后时刻隐状态. """ diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py index 3140af18..577987c6 100644 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py @@ -58,19 +58,20 @@ class Conll2003DataLoader(DataSetLoader): dataset = self.load(path) dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) if lower: - dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT, - new_field_name=Const.INPUT) + dataset.words.lower() data.datasets[name] = dataset # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) + word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab # cap words cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') + cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words', + no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') input_fields.append('cap_words') data.vocabs['cap_words'] = cap_word_vocab diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py index fe0236ad..8a2c567d 100644 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py @@ -49,6 +49,28 @@ class OntoNoteNERDataLoader(DataSetLoader): bio_tags.append(bio_label) return self.encoding_method(bio_tags) + def convert_word(words): + converted_words = [] + for word in words: + word = word.replace('/.', '.') # 有些结尾的.是/.形式的 + if not word.startswith('-'): + converted_words.append(word) + continue + # 以下是由于这些符号被转义了,再转回来 + tfrs = {'-LRB-':'(', + '-RRB-': ')', + '-LSB-': '[', + '-RSB-': ']', + '-LCB-': '{', + '-RCB-': '}' + } + if word in tfrs: + converted_words.append(tfrs[word]) + else: + converted_words.append(word) 
+ return converted_words + + dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words') dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') return dataset @@ -81,14 +103,14 @@ class OntoNoteNERDataLoader(DataSetLoader): dataset = self.load(path) dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) if lower: - dataset.apply_field(lambda words:[word.lower() for word in words], field_name=Const.INPUT, - new_field_name=Const.INPUT) + dataset.words.lower() data.datasets[name] = dataset # 对construct vocab word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(*data.datasets.values(), field_name=Const.INPUT) - word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name=Const.INPUT) + word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, + no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) + word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) data.vocabs[Const.INPUT] = word_vocab # cap words diff --git a/test/core/test_vocabulary.py b/test/core/test_vocabulary.py index c161ad9d..b3326f6a 100644 --- a/test/core/test_vocabulary.py +++ b/test/core/test_vocabulary.py @@ -70,6 +70,24 @@ class TestAdd(unittest.TestCase): self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2) vocab.index_dataset(dataset, field_name='char') + def test_from_dataset_no_entry(self): + # 测试能否正确将no_create_entry正确设置 + dataset = DataSet() + start_char = 65 + num_samples = 10 + test_dataset = DataSet() + for i in range(num_samples): + char = [chr(start_char + i)] * 6 + ins = Instance(char=char) + dataset.append(ins) + ins = Instance(char=[c+c for c in char]) + test_dataset.append(ins) + vocab = Vocabulary() + vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset) + vocab.index_dataset(dataset, field_name='char') + for i in range(num_samples): + self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i)+chr(start_char + i))) + class TestIndexing(unittest.TestCase): def test_len(self): From e57b8e4fd3f8b1a56d8011761c778611559da39b Mon Sep 17 00:00:00 2001 From: yh_cc Date: Fri, 21 Jun 2019 11:24:42 +0800 Subject: [PATCH 21/34] =?UTF-8?q?seq=5Flen=5Fto=5Fmask=E4=BF=AE=E5=A4=8D?= =?UTF-8?q?=E6=B5=8B=E8=AF=95=E5=A4=B1=E8=B4=A5=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- test/core/test_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/core/test_utils.py b/test/core/test_utils.py index a3e8bdf6..363d5fa1 100644 --- a/test/core/test_utils.py +++ b/test/core/test_utils.py @@ -240,8 +240,7 @@ class TestSeqLenToMask(unittest.TestCase): # 3. pad到指定长度 seq_len = np.random.randint(1, 10, size=(10,)) mask = seq_len_to_mask(seq_len, 100) - self.assertEqual(100, mask.size(1)) - + self.assertEqual(100, mask.shape[1]) def test_pytorch_seq_len(self): # 1. 
随机测试 From d34c739a39be53685d03984579fd240c574600aa Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 22 Jun 2019 22:53:52 +0800 Subject: [PATCH 22/34] Update embedding.py --- fastNLP/modules/encoder/embedding.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 637026e5..0462f2e0 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -408,10 +408,10 @@ class BertEmbedding(ContextualEmbedding): :param fastNLP.Vocabulary vocab: 词表 - :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` + :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``. :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces - 中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``. + 中计算得到他对应的表示。支持 ``last``, ``first``, ``avg``, ``max``. :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。 :param bool requires_grad: 是否需要gradient。 @@ -459,7 +459,7 @@ class BertEmbedding(ContextualEmbedding): 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 删除这两个token的表示。 - :param words: batch_size x max_len + :param torch.LongTensor words: [batch_size, max_len] :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) """ outputs = self._get_sent_reprs(words) @@ -777,7 +777,8 @@ class StackEmbedding(TokenEmbedding): Example:: - >>> + >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 From 4b0c26d33817cd76670a4ef680835f008e1ca6ae Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 23 Jun 2019 15:02:34 +0800 Subject: [PATCH 23/34] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8DBertEmbedding?= =?UTF-8?q?=E4=B8=AD=E7=9A=84bug;=202.=20=E4=BF=AE=E5=A4=8DBatch,=20Field?= =?UTF-8?q?=E5=9C=A8=E8=BF=9B=E8=A1=8C=E7=B1=BB=E5=9E=8B=E8=BD=AC=E6=8D=A2?= =?UTF-8?q?=E6=97=B6=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 16 ++++++--- fastNLP/core/field.py | 49 ++++++++++++++-------------- fastNLP/core/tester.py | 2 +- fastNLP/modules/encoder/_bert.py | 7 ++-- fastNLP/modules/encoder/bert.py | 3 +- fastNLP/modules/encoder/embedding.py | 13 ++++---- 6 files changed, 49 insertions(+), 41 deletions(-) diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 89b55a25..ca48a8e1 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -68,7 +68,11 @@ class DataSetGetter: else: data = f.pad(vlist) if not self.as_numpy: - data, flag = _to_tensor(data, f.dtype) + try: + data, flag = _to_tensor(data, f.dtype) + except TypeError as e: + print(f"Field {n} cannot be converted to torch.tensor.") + raise e batch_dict[n] = data return batch_dict @@ -173,15 +177,17 @@ class OnlineDataIter(BatchIter): def _to_tensor(batch, field_dtype): try: - if field_dtype is not None \ + if field_dtype is not None and isinstance(field_dtype, type)\ and issubclass(field_dtype, Number) \ and not isinstance(batch, torch.Tensor): if issubclass(batch.dtype.type, np.floating): new_batch = torch.as_tensor(batch).float() # 默认使用float32 + elif issubclass(batch.dtype.type, np.integer): + new_batch = 
torch.as_tensor(batch).long() # 复用内存地址,避免复制 else: - new_batch = torch.as_tensor(batch) # 复用内存地址,避免复制 + new_batch = torch.as_tensor(batch) return new_batch, True else: return batch, False - except: - return batch, False + except Exception as e: + raise e diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 1c0ad235..65eb0194 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -395,6 +395,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0): :return: """ if isinstance(cell, (str, Number, np.bool_)): + if hasattr(cell, 'dtype'): + return cell.dtype.type, dim return type(cell), dim elif isinstance(cell, list): dim += 1 @@ -412,7 +414,7 @@ def _get_ele_type_and_dim(cell:Any, dim=0): return cell.dtype, cell.dim() + dim # 如果是torch.mean的结果是0 elif isinstance(cell, np.ndarray): if cell.dtype != np.dtype('O'): # 如果不是object的话说明是well-formatted的了 - return cell.dtype.type, cell.ndim + dim + return cell.dtype.type, cell.ndim + dim # dtype.type返回的会是np.int32, np.float等 # 否则需要继续往下iterate dim += 1 res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell] @@ -537,31 +539,30 @@ class AutoPadder(Padder): if field_ele_dtype: if dim>3: return np.array(contents) - if isinstance(field_ele_dtype, np.dtype) or field_ele_dtype in (float, int, bool, str): - if isinstance(field_ele_dtype, np.number) or field_ele_dtype in (float, int, bool): - if dim==0: + if isinstance(field_ele_dtype, type) and \ + (issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)): + if dim==0: + array = np.array(contents, dtype=field_ele_dtype) + elif dim==1: + max_len = max(map(len, contents)) + array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + array[i, :len(content_i)] = content_i + elif dim==2: + max_len = max(map(len, contents)) + max_word_len = max([max([len(content_ii) for content_ii in content_i]) for + content_i in contents]) + array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype) + for i, content_i in enumerate(contents): + for j, content_ii in enumerate(content_i): + array[i, j, :len(content_ii)] = content_ii + else: + shape = np.shape(contents) + if len(shape)==4: # 说明各dimension是相同的大小 array = np.array(contents, dtype=field_ele_dtype) - elif dim==1: - max_len = max(map(len, contents)) - array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype) - for i, content_i in enumerate(contents): - array[i, :len(content_i)] = content_i - elif dim==2: - max_len = max(map(len, contents)) - max_word_len = max([max([len(content_ii) for content_ii in content_i]) for - content_i in contents]) - array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype) - for i, content_i in enumerate(contents): - for j, content_ii in enumerate(content_i): - array[i, j, :len(content_ii)] = content_ii else: - shape = np.shape(contents) - if len(shape)==4: # 说明各dimension是相同的大小 - array = np.array(contents, dtype=field_ele_dtype) - else: - raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") - return array - return np.array(contents) + raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.") + return array elif str(field_ele_dtype).startswith('torch'): if dim==0: tensor = torch.tensor(contents).to(field_ele_dtype) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 398afe6b..536279de 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -99,7 
+99,7 @@ class Tester(object): if isinstance(data, DataSet): self.data_iterator = DataSetIter( - dataset=data, batch_size=batch_size, num_workers=num_workers) + dataset=data, batch_size=batch_size, num_workers=num_workers, sampler=SequentialSampler()) elif isinstance(data, BatchIter): self.data_iterator = data else: diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py index a0353279..254917e5 100644 --- a/fastNLP/modules/encoder/_bert.py +++ b/fastNLP/modules/encoder/_bert.py @@ -831,7 +831,8 @@ class _WordBertModel(nn.Module): # +2是由于需要加入[CLS]与[SEP] word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) word_pieces[:, 0].fill_(self._cls_index) - word_pieces[torch.arange(batch_size).to(words), word_pieces_lengths+1] = self._sep_index + batch_indexes = torch.arange(batch_size).to(words) + word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index attn_masks = torch.zeros_like(word_pieces) # 1. 获取words的word_pieces的id,以及对应的span范围 word_indexes = words.tolist() @@ -879,8 +880,8 @@ class _WordBertModel(nn.Module): start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - outputs[:, :, 0] = output_layer[:, 0] - outputs[:, :, seq_len+s_shift] = output_layer[:, seq_len+s_shift] + outputs[l_index, :, 0] = output_layer[:, 0] + outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] # 3. 最终的embedding结果 return outputs diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 2ddb37ff..757973fe 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -73,12 +73,11 @@ class BertWordPieceEncoder(nn.Module): [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 :param datasets: DataSet对象 - :param field_name: str基于哪一列index + :param field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。 :return: """ self.model.index_dataset(*datasets, field_name=field_name) - def forward(self, word_pieces, token_type_ids=None): """ 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 46e393b1..810b909f 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -51,7 +51,7 @@ class Embedding(nn.Module): self.dropout = nn.Dropout(dropout) if not isinstance(self.embed, TokenEmbedding): self._embed_size = self.embed.weight.size(1) - if dropout_word>0 and isinstance(unk_index, int): + if dropout_word>0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size @@ -512,7 +512,8 @@ class BertEmbedding(ContextualEmbedding): """ 别名::class:`fastNLP.modules.BertEmbedding` :class:`fastNLP.modules.encoder.embedding.BertEmbedding` - 使用BERT对words进行encode的Embedding。 + 使用BERT对words进行encode的Embedding。建议将输入的words长度限制在450以内,而不要使用512。这是由于预训练的bert模型长 + 度限制为512个token,而因为输入的word是未进行word piece分割的,在分割之后长度可能会超过最大长度限制。 Example:: @@ -523,7 +524,7 @@ class BertEmbedding(ContextualEmbedding): :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces - 中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``. 
+ 中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。 :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。 :param bool requires_grad: 是否需要gradient。 @@ -673,8 +674,8 @@ class CNNCharEmbedding(TokenEmbedding): self.char_pad_index = self.char_vocab.padding_idx print(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index - self.max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), + max_word_len = max(map(lambda x: len(x[0]), vocab)) + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), fill_value=self.char_pad_index, dtype=torch.long), requires_grad=False) self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) @@ -707,7 +708,7 @@ class CNNCharEmbedding(TokenEmbedding): # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size - chars = self.dropout(chars) + self.dropout(chars) reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) From d1f531c0496309f9e75719ae52bbe242a0e4ba64 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 23 Jun 2019 18:25:04 +0800 Subject: [PATCH 24/34] update matching dataloader in reproduction/matching --- fastNLP/io/dataset_loader.py | 167 ------------- .../matching/data/MatchingDataLoader.py | 219 ++++++++++++++++++ reproduction/matching/data/SNLIDataLoader.py | 6 - .../matching/test/test_snlidataloader.py | 4 +- 4 files changed, 221 insertions(+), 175 deletions(-) create mode 100644 reproduction/matching/data/MatchingDataLoader.py delete mode 100644 reproduction/matching/data/SNLIDataLoader.py diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index b0bf2e60..01e6c8ed 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -250,173 +250,6 @@ class JsonLoader(DataSetLoader): return ds -class MatchingLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` - - 读取Matching数据集,根据数据集做预处理并返回DataInfo。 - - 数据来源: - SNLI: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self, data_format: str='snli', for_model: str='esim', bert_dir=None): - super(MatchingLoader, self).__init__() - self.data_format = data_format.lower() - self.for_model = for_model.lower() - self.bert_dir = bert_dir - - def _load(self, path: str) -> DataSet: - raise NotImplementedError - - def process(self, paths: Union[str, Dict[str, str]], input_field=None) -> DataInfo: - if isinstance(paths, str): - paths = {'train': paths} - - data_set = {} - for n, p in paths.items(): - if self.data_format == 'snli': - data = self._load_snli(p) - else: - raise RuntimeError(f'Your data format is {self.data_format}, ' - f'Please choose data format from [snli]') - - if self.for_model == 'esim': - data = self._for_esim(data) - elif self.for_model == 'bert': - data = self._for_bert(data, self.bert_dir) - else: - raise RuntimeError(f'Your model is {self.data_format}, ' - f'Please choose from [esim, bert]') - - if input_field is not None: - if isinstance(input_field, str): - data.set_input(input_field) - 
elif isinstance(input_field, list): - for field in input_field: - data.set_input(field) - - data_set[n] = data - print(f'successfully load {n} set!') - - if not hasattr(self, 'vocab'): - raise RuntimeError(f'There is NOT vocab attribute built!') - if not hasattr(self, 'label_vocab'): - raise RuntimeError(f'There is NOT label vocab attribute built!') - - if self.for_model != 'bert': - from fastNLP.modules.encoder.embedding import ElmoEmbedding - embedding = ElmoEmbedding(self.vocab, model_dir_or_name='en', requires_grad=True, layers='2') - - data_info = DataInfo(vocabs={'vocab': self.vocab, 'target_vocab': self.label_vocab}, - embeddings={'elmo': embedding} if self.for_model != 'bert' else None, - datasets=data_set) - - return data_info - - @staticmethod - def _load_snli(path: str) -> DataSet: - """ - 读取SNLI数据集 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - :param str path: 数据集路径 - :return: - """ - raw_ds = JsonLoader( - fields={ - 'sentence1_parse': Const.INPUTS(0), - 'sentence2_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - )._load(path) - return raw_ds - - def _for_esim(self, raw_ds: DataSet): - if self.data_format == 'snli' or self.data_format == 'mnli': - def parse_tree(x): - t = Tree.fromstring(x) - return t.leaves() - - raw_ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - raw_ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) - raw_ds.drop(lambda x: x[Const.TARGET] == '-') - - if not hasattr(self, 'vocab'): - self.vocab = Vocabulary().from_dataset(raw_ds, field_name=[Const.INPUTS(0), Const.INPUTS(1)]) - if not hasattr(self, 'label_vocab'): - self.label_vocab = Vocabulary(padding=None, unknown=None).from_dataset(raw_ds, field_name=Const.TARGET) - - raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0)) - raw_ds.apply(lambda ins: [self.vocab.to_index(w) for w in ins[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1)) - raw_ds.apply(lambda ins: self.label_vocab.to_index(ins[Const.TARGET]), new_field_name=Const.TARGET) - raw_ds.apply(lambda ins: len(ins[Const.INPUTS(0)]), new_field_name=Const.INPUT_LENS(0)) - raw_ds.apply(lambda ins: len(ins[Const.INPUTS(1)]), new_field_name=Const.INPUT_LENS(1)) - - raw_ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)) - raw_ds.set_target(Const.TARGET) - - return raw_ds - - def _for_bert(self, raw_ds: DataSet, bert_dir: str): - if self.data_format == 'snli' or self.data_format == 'mnli': - def parse_tree(x): - t = Tree.fromstring(x) - return t.leaves() - - raw_ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - raw_ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) - raw_ds.drop(lambda x: x[Const.TARGET] == '-') - - tokenizer = BertTokenizer.from_pretrained(bert_dir) - - vocab = Vocabulary(padding=None, unknown=None) - with open(os.path.join(bert_dir, 'vocab.txt')) as f: - lines = f.readlines() - vocab_list = [] - for line in lines: - vocab_list.append(line.strip()) - vocab.add_word_lst(vocab_list) - vocab.build_vocab() - vocab.padding = '[PAD]' - vocab.unknown = '[UNK]' - - if not hasattr(self, 'vocab'): - self.vocab = vocab - else: - for w, idx in self.vocab: - if vocab[w] != idx: - raise AttributeError(f"{self.__class__.__name__} has ") - - for i in range(2): - raw_ds.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])), new_field_name=Const.INPUTS(i)) - 
raw_ds.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'], - new_field_name=Const.INPUT) - raw_ds.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), - new_field_name=Const.INPUT_LENS(0)) - raw_ds.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1)) - - max_len = 512 - raw_ds.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT) - raw_ds.apply(lambda x: [self.vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT) - raw_ds.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0)) - raw_ds.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1)) - - if not hasattr(self, 'label_vocab'): - self.label_vocab = Vocabulary(padding=None, unknown=None) - self.label_vocab.from_dataset(raw_ds, field_name=Const.TARGET) - raw_ds.apply(lambda x: self.label_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET) - - raw_ds.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1)) - raw_ds.set_target(Const.TARGET) - - return raw_ds - - class SNLILoader(JsonLoader): """ 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py new file mode 100644 index 00000000..305143b9 --- /dev/null +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -0,0 +1,219 @@ + +import os + +from nltk import Tree +from typing import Union, Dict + +from fastNLP.core.const import Const +from fastNLP.core.vocabulary import Vocabulary +from fastNLP.core.dataset import DataSet +from fastNLP.io.base_loader import DataInfo +from fastNLP.io.dataset_loader import JsonLoader +from fastNLP.io.file_utils import _get_base_url, cached_path +from fastNLP.modules.encoder._bert import BertTokenizer + + +class MatchingLoader(JsonLoader): + """ + 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` + + 读取Matching任务的数据集 + """ + + def __init__(self, fields=None, paths: dict=None): + super(MatchingLoader, self).__init__(fields=fields) + self.paths = paths + + def _load(self, path): + return super(MatchingLoader, self)._load(path) + + def process(self, paths: Union[str, Dict[str, str]], dataset_name=None, + to_lower=False, char_information=False, seq_len_type: str=None, + bert_tokenizer: str=None, get_index=True, set_input: Union[list, bool]=True, + set_target: Union[list, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: + if isinstance(set_input, bool): + auto_set_input = set_input + else: + auto_set_input = False + if isinstance(set_target, bool): + auto_set_target = set_target + else: + auto_set_target = False + if isinstance(paths, str): + if os.path.isdir(paths): + path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} + else: + path = {dataset_name if dataset_name is not None else 'train': paths} + else: + path = paths + + data_info = DataInfo() + for data_name in path.keys(): + data_info.datasets[data_name] = self._load(path[data_name]) + + for data_name, data_set in data_info.datasets.items(): + if auto_set_input: + data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) + if auto_set_target: + data_set.set_target(Const.TARGET) + + if to_lower: + for data_name, data_set in data_info.datasets.items(): + data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), + is_input=auto_set_input) 
+ data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), + is_input=auto_set_input) + + if bert_tokenizer is not None: + PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip', + 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', + 'en-base-cased': 'bert-base-cased-f89bfe08.zip', + 'en-large-uncased': 'bert-large-uncased-20939f45.zip', + 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', + + 'cn': 'bert-base-chinese-29d0a84a.zip', + 'cn-base': 'bert-base-chinese-29d0a84a.zip', + + 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', + 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', + 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', + } + if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: + PRETRAIN_URL = _get_base_url('bert') + model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] + model_url = PRETRAIN_URL + model_name + model_dir = cached_path(model_url) + # 检查是否存在 + elif os.path.isdir(bert_tokenizer): + model_dir = bert_tokenizer + else: + raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") + + tokenizer = BertTokenizer.from_pretrained(model_dir) + + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, + is_input=auto_set_input) + + if isinstance(concat, bool): + concat = 'default' if concat else None + if concat is not None: + if isinstance(concat, str): + CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], + 'default': ['', '', '', '']} + if concat.lower() in CONCAT_MAP: + concat = CONCAT_MAP[concat] + else: + concat = 4 * [concat] + assert len(concat) == 4, \ + f'Please choose a list with 4 symbols which at the beginning of first sentence ' \ + f'the end of first sentence, the begin of second sentence, and the end of second' \ + f'sentence. 
Your input is {concat}' + + for data_name, data_set in data_info.datasets.items(): + data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + + x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) + data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, + is_input=auto_set_input) + + if seq_len_type is not None: + if seq_len_type == 'seq_len': # + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: len(x[fields]), + new_field_name=fields.replace(Const.INPUT, Const.TARGET), + is_input=auto_set_input) + elif seq_len_type == 'mask': + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: [1] * len(x[fields]), + new_field_name=fields.replace(Const.INPUT, Const.TARGET), + is_input=auto_set_input) + elif seq_len_type == 'bert': + for data_name, data_set in data_info.datasets.items(): + if Const.INPUT not in data_set.get_field_names(): + raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' + f'got {data_set.get_field_names()}') + data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), + new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) + data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), + new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) + + data_set_list = [d for n, d in data_info.datasets.items()] + assert len(data_set_list) > 0, f'There are NO data sets in data info!' + + if bert_tokenizer is not None: + words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') + else: + words_vocab = Vocabulary() + words_vocab = words_vocab.from_dataset(*data_set_list, + field_name=[n for n in data_set_list[0].get_field_names() + if (Const.INPUT in n)]) + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab = target_vocab.from_dataset(*data_set_list, field_name=Const.TARGET) + data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} + + if get_index: + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, + is_input=auto_set_input) + + data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, + is_input=auto_set_input, is_target=auto_set_target) + + for data_name, data_set in data_info.datasets.items(): + if isinstance(set_input, list): + data_set.set_input(set_input) + if isinstance(set_target, list): + data_set.set_target(set_target) + + return data_info + + +class SNLILoader(MatchingLoader): + """ + 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` + + 读取SNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip + """ + + def __init__(self, paths: dict=None): + fields = { + 'sentence1_parse': Const.INPUTS(0), + 'sentence2_parse': Const.INPUTS(1), + 'gold_label': Const.TARGET, + } + paths = paths if paths is not None else { + 'train': 'snli_1.0_train.jsonl', + 'dev': 'snli_1.0_dev.jsonl', + 'test': 'snli_1.0_test.jsonl'} + super(SNLILoader, self).__init__(fields=fields, paths=paths) + + def _load(self, path): + ds = super(SNLILoader, 
self)._load(path) + + def parse_tree(x): + t = Tree.fromstring(x) + return t.leaves() + + ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) + ds.apply(lambda ins: parse_tree( + ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) + ds.drop(lambda x: x[Const.TARGET] == '-') + return ds + + + diff --git a/reproduction/matching/data/SNLIDataLoader.py b/reproduction/matching/data/SNLIDataLoader.py deleted file mode 100644 index 6f6bbecd..00000000 --- a/reproduction/matching/data/SNLIDataLoader.py +++ /dev/null @@ -1,6 +0,0 @@ - -from fastNLP.io.dataset_loader import SNLILoader - -# TODO: still in progress - - diff --git a/reproduction/matching/test/test_snlidataloader.py b/reproduction/matching/test/test_snlidataloader.py index bd5c58b6..60b3ad59 100644 --- a/reproduction/matching/test/test_snlidataloader.py +++ b/reproduction/matching/test/test_snlidataloader.py @@ -1,10 +1,10 @@ import unittest -from ..data import SNLIDataLoader +from ..data import MatchingDataLoader from fastNLP.core.vocabulary import Vocabulary class TestCWSDataLoader(unittest.TestCase): def test_case1(self): - snli_loader = SNLIDataLoader() + snli_loader = MatchingDataLoader() # TODO: still in progress From 3593f0a545b05d2d8588ba98bbcaf85612249336 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 23 Jun 2019 18:42:57 +0800 Subject: [PATCH 25/34] fix bugs in matching dataloader --- fastNLP/io/dataset_loader.py | 1 - reproduction/matching/data/MatchingDataLoader.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 01e6c8ed..558fe20e 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -16,7 +16,6 @@ __all__ = [ 'CSVLoader', 'JsonLoader', 'ConllLoader', - 'MatchingLoader', 'SNLILoader', 'SSTLoader', 'PeopleDailyCorpusLoader', diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index 305143b9..139b1d4f 100644 --- a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -29,8 +29,12 @@ class MatchingLoader(JsonLoader): def process(self, paths: Union[str, Dict[str, str]], dataset_name=None, to_lower=False, char_information=False, seq_len_type: str=None, - bert_tokenizer: str=None, get_index=True, set_input: Union[list, bool]=True, - set_target: Union[list, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: + bert_tokenizer: str=None, get_index=True, set_input: Union[list, str, bool]=True, + set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: + if isinstance(set_input, str): + set_input = [set_input] + if isinstance(set_target, str): + set_target = [set_target] if isinstance(set_input, bool): auto_set_input = set_input else: From 4d9eb7c64a3de661d0e399426b55ccd383d8665b Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 23 Jun 2019 18:44:36 +0800 Subject: [PATCH 26/34] update framework of matching --- fastNLP/io/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 83425ff7..28f466a8 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -16,7 +16,6 @@ __all__ = [ 'CSVLoader', 'JsonLoader', 'ConllLoader', - 'MatchingLoader', 'SNLILoader', 'SSTLoader', 'PeopleDailyCorpusLoader', @@ -27,6 +26,6 @@ __all__ = [ ] from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, 
ConllLoader, MatchingLoader,\ +from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, \ SNLILoader, SSTLoader, PeopleDailyCorpusLoader, Conll2003Loader from .model_io import ModelLoader, ModelSaver From e9137349c3ea72e76931bdb2e96ebd127e07fade Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 23 Jun 2019 19:13:49 +0800 Subject: [PATCH 27/34] =?UTF-8?q?=E8=A7=A3=E5=86=B3=E5=9C=A8DataParallel?= =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E5=9C=BA=E6=99=AF=E4=B8=8B=E6=97=A0=E6=B3=95?= =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=8F=82=E6=95=B0=E5=8C=B9=E9=85=8D=E7=9A=84?= =?UTF-8?q?=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/tester.py | 5 ++++- fastNLP/core/trainer.py | 16 +++++++++++----- fastNLP/modules/encoder/_elmo.py | 2 +- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index 536279de..4cdd4ffb 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -120,7 +120,10 @@ class Tester(object): raise TypeError(f"`{_model_name}.predict` must be callable to be used " f"for evaluation, not `{type(self._predict_func)}`.") else: - self._predict_func = self._model.forward + if isinstance(model, nn.DataParallel): + self._predict_func = self._model.module.forward + else: + self._predict_func = self._model.forward def test(self): """开始进行验证,并返回验证结果。 diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 8dece12d..55bc4ee0 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -578,7 +578,10 @@ class Trainer(object): self.step = 0 self.epoch = 0 start = time.time() - + if isinstance(self.model, nn.DataParallel): + self._forward_func = self.model.module.forward + else: + self._forward_func = self.model.forward with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar: self.pbar = pbar avg_loss = 0 @@ -682,11 +685,11 @@ class Trainer(object): self.optimizer.step() def _data_forward(self, network, x): - x = _build_args(network.forward, **x) + x = _build_args(self._forward_func, **x) y = network(**x) if not isinstance(y, dict): raise TypeError( - f"The return value of {_get_func_signature(network.forward)} should be dict, got {type(y)}.") + f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.") return y def _grad_backward(self, loss): @@ -845,8 +848,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_ print(info_str) _check_forward_error(forward_func=model.forward, dataset=dataset, batch_x=batch_x, check_level=check_level) - - refined_batch_x = _build_args(model.forward, **batch_x) + if isinstance(model, nn.DataParallel): + forward_func = model.module.forward + else: + forward_func = model.forward + refined_batch_x = _build_args(forward_func, **batch_x) pred_dict = model(**refined_batch_x) func_signature = _get_func_signature(model.forward) if not isinstance(pred_dict, dict): diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index c234a706..4ebee819 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -709,7 +709,7 @@ class _ElmoModel(nn.Module): config, word_emb_layer, char_emb_layer) self.token_embedder.load_state_dict(token_embedder_states, strict=False) if config['token_embedder']['word_dim'] > 0 and vocab._no_create_word_length > 0: # 需要映射,使得来自于dev, test的idx指向unk - words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), 
requires_grad=False) + words_to_words = nn.Parameter(torch.arange(len(vocab)+2).long(), requires_grad=False) for word, idx in vocab: if vocab._is_word_no_create_entry(word): words_to_words[idx] = vocab.unknown_idx From 39dd08626209375c3f2abe787744dbe3178ff447 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 24 Jun 2019 09:56:28 +0800 Subject: [PATCH 28/34] =?UTF-8?q?1.=E4=BF=AE=E6=94=B9CrossEntropyLoss?= =?UTF-8?q?=E4=B8=AD=E5=AD=98=E5=9C=A8=E7=9A=84=E5=8F=8D=E7=9B=B4=E8=A7=89?= =?UTF-8?q?bug;=202.=E6=9B=B4=E6=96=B0sequence=20labeling?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 6 ++- fastNLP/core/trainer.py | 10 ++-- fastNLP/modules/encoder/embedding.py | 6 +-- .../cws/train_shift_relay.py | 10 ++-- .../ner/model/lstm_cnn_crf.py | 12 ++--- .../ner/train_cnn_lstm_crf_conll2003.py | 46 ++++++++++++++----- .../seqence_labelling/ner/train_ontonote.py | 46 +++++++++++++++---- 7 files changed, 93 insertions(+), 43 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 66234ce7..62e7a8c8 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -232,12 +232,16 @@ class CrossEntropyLoss(LossBase): """ def __init__(self, pred=None, target=None, padding_idx=-100): - # TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际需要(16,4) super(CrossEntropyLoss, self).__init__() self._init_param_map(pred=pred, target=target) self.padding_idx = padding_idx def get_loss(self, pred, target): + if pred.dim()>2: + if pred.size()[:2]==target.size(): + # F.cross_entropy在计算时,如果pred是(16, 10 ,4), 会在第二维上去log_softmax, 所以需要交换一下位置 + pred = pred.transpose(1, 2) + return F.cross_entropy(input=pred, target=target, ignore_index=self.padding_idx) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index 55bc4ee0..a303f742 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -451,9 +451,11 @@ class Trainer(object): self.data_iterator = train_data else: raise TypeError("train_data type {} not support".format(type(train_data))) - + + self.model = _move_model_to_device(model, device=device) + if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): - _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, + _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level, batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 @@ -474,9 +476,7 @@ class Trainer(object): self.best_dev_perf = None self.n_steps = (len(self.train_data) // self.batch_size + int( len(self.train_data) % self.batch_size != 0)) * self.n_epochs - - self.model = _move_model_to_device(self.model, device=device) - + if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer elif isinstance(optimizer, Optimizer): diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index aa7b399c..c6c95bb7 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -204,7 +204,7 @@ class StaticEmbedding(TokenEmbedding): model_url = PRETRAIN_URL + model_name model_path = cached_path(model_url) # 检查是否存在 - elif os.path.isfile(model_dir_or_name): + elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_path = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") @@ -455,7 
+455,7 @@ class ElmoEmbedding(ContextualEmbedding): model_url = PRETRAIN_URL + model_name model_dir = cached_path(model_url) # 检查是否存在 - elif os.path.isdir(model_dir_or_name): + elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") @@ -553,7 +553,7 @@ class BertEmbedding(ContextualEmbedding): model_url = PRETRAIN_URL + model_name model_dir = cached_path(model_url) # 检查是否存在 - elif os.path.isdir(model_dir_or_name): + elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py index 805521e7..55576575 100644 --- a/reproduction/seqence_labelling/cws/train_shift_relay.py +++ b/reproduction/seqence_labelling/cws/train_shift_relay.py @@ -57,12 +57,8 @@ callbacks = [clipper] # if pretrain: # fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until) # callbacks.append(fixer) -trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, - batch_size=32, sampler=sampler, update_every=5, - n_epochs=3, print_every=5, - dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f', - validate_every=-1, save_path=None, - prefetch=True, use_tqdm=True, device=device, - callbacks=callbacks, +trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler, + update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(), + metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks, check_code_level=0) trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py index 36d86651..e8e7f6d2 100644 --- a/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py +++ b/reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py @@ -12,11 +12,11 @@ class CNNBiLSTMCRF(nn.Module): def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'): super().__init__() - self.embedding = Embedding(embed, dropout=0.5) - self.char_embedding = Embedding(char_embed, dropout=0.5) + self.embedding = Embedding(embed, dropout=0.5, dropout_word=0) + self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01) self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim, - hidden_size=hidden_size//2, num_layers=num_layers, - bidirectional=True, batch_first=True, dropout=dropout) + hidden_size=hidden_size//2, num_layers=num_layers, + bidirectional=True, batch_first=True) self.fc = nn.Linear(hidden_size, len(tag_vocab)) transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True) @@ -25,9 +25,9 @@ class CNNBiLSTMCRF(nn.Module): self.dropout = nn.Dropout(dropout, inplace=True) for name, param in self.named_parameters(): - if 'ward_fc' in name: + if 'fc' in name: if param.data.dim()>1: - nn.init.xavier_normal_(param) + nn.init.xavier_uniform_(param) else: nn.init.constant_(param, 0) if 'crf' in name: diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py index 507be4f6..cf491f3b 
100644 --- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py +++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py @@ -1,6 +1,6 @@ -from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding +from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding from fastNLP.core.vocabulary import VocabularyOption from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF @@ -12,6 +12,8 @@ from torch.optim import SGD, Adam from fastNLP import GradientClipCallback from fastNLP.core.callback import FitlogCallback, LRScheduler from torch.optim.lr_scheduler import LambdaLR +from reproduction.seqence_labelling.ner.model.swats import SWATS + import fitlog fitlog.debug() @@ -19,28 +21,50 @@ from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003Dat encoding_type = 'bioes' -data = Conll2003DataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/conll2003', - word_vocab_opt=VocabularyOption(min_freq=2)) +data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003', + word_vocab_opt=VocabularyOption(min_freq=2), + lower=False) print(data) char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], kernel_sizes=[3]) +# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30) word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], - model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', + model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt', requires_grad=True) word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std() +# import joblib +# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib') +# def convert_to_ids(raw_words): +# ids = [] +# for word in raw_words: +# id = raw_data['word_to_id'][word] +# id = raw_data['id_to_emb_map'][id] +# ids.append(id) +# return ids +# word_embed = raw_data['emb_matrix'] +# for name, dataset in data.datasets.items(): +# dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT) + +# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'], +# model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en', +# requires_grad=True) + model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) -optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) -scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) - -callbacks = [GradientClipCallback(clip_type='value', clip_value=5), FitlogCallback({'test':data.datasets['test'], - 'train':data.datasets['train']}, verbose=1), - scheduler] +callbacks = [ + GradientClipCallback(clip_type='value', clip_value=5) + , FitlogCallback({'test':data.datasets['test']}, verbose=1) + ] +# optimizer = Adam(model.parameters(), lr=0.005) +optimizer = SWATS(model.parameters(), verbose=True) +# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9) +# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) +# callbacks.append(scheduler) trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), - device=0, dev_data=data.datasets['dev'], 
batch_size=10, + device=1, dev_data=data.datasets['dev'], batch_size=10, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), callbacks=callbacks, num_workers=1, n_epochs=100) trainer.train() \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py index e2a4158a..6548cb9f 100644 --- a/reproduction/seqence_labelling/ner/train_ontonote.py +++ b/reproduction/seqence_labelling/ner/train_ontonote.py @@ -1,4 +1,6 @@ +import sys +sys.path.append('../../..') from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding @@ -8,8 +10,11 @@ from fastNLP import SpanFPreRecMetric from fastNLP import BucketSampler from fastNLP import Const from torch.optim import SGD, Adam +from torch.optim.lr_scheduler import LambdaLR from fastNLP import GradientClipCallback -from fastNLP.core.callback import FitlogCallback +from fastNLP.core.callback import FitlogCallback, LRScheduler +from reproduction.seqence_labelling.ner.model.swats import SWATS + import fitlog fitlog.debug() @@ -17,23 +22,44 @@ from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDa encoding_type = 'bioes' -data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english') +data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english', + lower=True) + +import joblib +raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib') +def convert_to_ids(raw_words): + ids = [] + for word in raw_words: + id = raw_data['word_to_id'][word] + id = raw_data['id_to_emb_map'][id] + ids.append(id) + return ids +word_embed = raw_data['emb_matrix'] +for name, dataset in data.datasets.items(): + dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT) + print(data) char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30], kernel_sizes=[3]) -word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], - model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', - requires_grad=True) +# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT], +# model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt', +# requires_grad=True) -model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=400, num_layers=2, tag_vocab=data.vocabs[Const.TARGET], +model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type) -optimizer = SGD(model.parameters(), lr=0.015, momentum=0.9) +callbacks = [GradientClipCallback(clip_value=5, clip_type='value'), + FitlogCallback(data.datasets['test'], verbose=1)] + +optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9) +scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch))) +callbacks.append(scheduler) +# optimizer = SWATS(model.parameters(), verbose=True) +# optimizer = Adam(model.parameters(), lr=0.005) -callbacks = [GradientClipCallback(), FitlogCallback(data.datasets['test'], verbose=1)] -trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(), - device=1, dev_data=data.datasets['dev'], batch_size=32, +trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100), + device=0, 
dev_data=data.datasets['dev'], batch_size=10, metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type), callbacks=callbacks, num_workers=1, n_epochs=100) trainer.train() \ No newline at end of file From 43d3380b730398ac4594edfbfc28b9e8fc55ce77 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 24 Jun 2019 18:31:38 +0800 Subject: [PATCH 29/34] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8DTrainer=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=8C=96=E7=9A=84=E5=A4=9Adevice=20bug;=202.=E5=9C=A8?= =?UTF-8?q?CrossEntropyLoss=E4=B8=AD=E5=A2=9E=E5=8A=A0seq=5Flen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/losses.py | 20 ++++++++++++-------- fastNLP/core/trainer.py | 14 ++++++-------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 62e7a8c8..526bf37a 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -26,7 +26,7 @@ from .utils import _build_args from .utils import _check_arg_dict_list from .utils import _check_function_or_method from .utils import _get_func_signature - +from .utils import seq_len_to_mask class LossBase(object): """ @@ -223,7 +223,9 @@ class CrossEntropyLoss(LossBase): :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target` - :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容 + :param seq_len: 句子的长度, 长度之外的token不会计算loss。。 + :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 + 传入seq_len. Example:: @@ -231,16 +233,18 @@ class CrossEntropyLoss(LossBase): """ - def __init__(self, pred=None, target=None, padding_idx=-100): + def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100): super(CrossEntropyLoss, self).__init__() - self._init_param_map(pred=pred, target=target) + self._init_param_map(pred=pred, target=target, seq_len=seq_len) self.padding_idx = padding_idx - def get_loss(self, pred, target): + def get_loss(self, pred, target, seq_len=None): if pred.dim()>2: - if pred.size()[:2]==target.size(): - # F.cross_entropy在计算时,如果pred是(16, 10 ,4), 会在第二维上去log_softmax, 所以需要交换一下位置 - pred = pred.transpose(1, 2) + pred = pred.view(-1, pred.size(-1)) + target = target.view(-1) + if seq_len is not None: + mask = seq_len_to_mask(seq_len).view(-1).eq(0) + target = target.masked_fill(mask, self.padding_idx) return F.cross_entropy(input=pred, target=target, ignore_index=self.padding_idx) diff --git a/fastNLP/core/trainer.py b/fastNLP/core/trainer.py index a303f742..e8dfa814 100644 --- a/fastNLP/core/trainer.py +++ b/fastNLP/core/trainer.py @@ -452,17 +452,15 @@ class Trainer(object): else: raise TypeError("train_data type {} not support".format(type(train_data))) - self.model = _move_model_to_device(model, device=device) - if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter): - _check_code(dataset=train_data, model=self.model, losser=losser, metrics=metrics, dev_data=dev_data, + _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data, metric_key=metric_key, check_level=check_code_level, batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE)) # _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码 - + self.model = _move_model_to_device(model, device=device) + self.train_data = train_data self.dev_data = dev_data # If None, No validation. 
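# --- Editor's note (illustrative sketch, not part of the patch above) ---
# The CrossEntropyLoss change in this commit flattens pred/target and uses
# seq_len_to_mask to overwrite padded positions with padding_idx so that
# F.cross_entropy ignores them. A minimal, self-contained sketch of that
# masking logic, with hypothetical tensor shapes chosen for illustration:
import torch
import torch.nn.functional as F
from fastNLP.core.utils import seq_len_to_mask

pred = torch.randn(2, 4, 5)                # [batch_size, max_len, num_classes]
target = torch.randint(0, 5, (2, 4))       # [batch_size, max_len]
seq_len = torch.LongTensor([4, 2])         # second sample has 2 padded tokens

pred = pred.view(-1, pred.size(-1))        # flatten to [batch_size*max_len, num_classes]
target = target.view(-1)                   # flatten to [batch_size*max_len]
mask = seq_len_to_mask(seq_len).view(-1).eq(0)   # True at padding positions
target = target.masked_fill(mask, -100)         # -100 is the default padding_idx
loss = F.cross_entropy(pred, target, ignore_index=-100)
# --- end of editor's note ---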
- self.model = model self.losser = losser self.metrics = metrics self.n_epochs = int(n_epochs) @@ -480,16 +478,16 @@ class Trainer(object): if isinstance(optimizer, torch.optim.Optimizer): self.optimizer = optimizer elif isinstance(optimizer, Optimizer): - self.optimizer = optimizer.construct_from_pytorch(model.parameters()) + self.optimizer = optimizer.construct_from_pytorch(self.model.parameters()) elif optimizer is None: - self.optimizer = torch.optim.Adam(model.parameters(), lr=4e-3) + self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3) else: raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer))) self.use_tqdm = use_tqdm self.pbar = None self.print_every = abs(self.print_every) - + if self.dev_data is not None: self.tester = Tester(model=self.model, data=self.dev_data, From e0b23b16db59b249bc4ffbcbe45f4d8f99b7bbd8 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 24 Jun 2019 21:44:43 +0800 Subject: [PATCH 30/34] update data loader of matching --- fastNLP/io/file_utils.py | 31 +++++++ fastNLP/modules/encoder/embedding.py | 41 ++------- .../matching/data/MatchingDataLoader.py | 92 ++++++++++++------- reproduction/matching/matching_esim.py | 65 +++++++++++++ reproduction/matching/model/esim.py | 21 ++++- 5 files changed, 178 insertions(+), 72 deletions(-) create mode 100644 reproduction/matching/matching_esim.py diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index d178626b..04970cb3 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -10,6 +10,37 @@ import shutil import hashlib +PRETRAINED_BERT_MODEL_DIR = { + 'en': 'bert-base-cased-f89bfe08.zip', + 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', + 'en-base-cased': 'bert-base-cased-f89bfe08.zip', + 'en-large-uncased': 'bert-large-uncased-20939f45.zip', + 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', + + 'cn': 'bert-base-chinese-29d0a84a.zip', + 'cn-base': 'bert-base-chinese-29d0a84a.zip', + + 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', + 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', + 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', +} + +PRETRAINED_ELMO_MODEL_DIR = { + 'en': 'elmo_en-d39843fe.tar.gz', + 'cn': 'elmo_cn-5e9b34e2.tar.gz' +} + +PRETRAIN_STATIC_FILES = { + 'en': 'glove.840B.300d-cc1ad5e1.tar.gz', + 'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', + 'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", + 'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", + 'en-fasttext': "cc.en.300.vec-d53187b2.gz", + 'cn': "tencent_cn-dab24577.tar.gz", + 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", +} + + def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: """ 给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index c6c95bb7..a58668da 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -26,6 +26,7 @@ from ...core.dataset import DataSet from ...core.batch import DataSetIter from ...core.sampler import SequentialSampler from ...core.utils import _move_model_to_device, _get_model_device +from ...io.file_utils import PRETRAINED_BERT_MODEL_DIR, PRETRAINED_ELMO_MODEL_DIR, PRETRAIN_STATIC_FILES class Embedding(nn.Module): @@ -187,15 +188,6 @@ class StaticEmbedding(TokenEmbedding): super(StaticEmbedding, self).__init__(vocab) # 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server, - 
PRETRAIN_STATIC_FILES = { - 'en': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', - 'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", - 'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", - 'en-fasttext': "cc.en.300.vec-d53187b2.gz", - 'cn': "tencent_cn-dab24577.tar.gz", - 'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", - } # 得到cache_path if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: @@ -231,7 +223,7 @@ class StaticEmbedding(TokenEmbedding): :return: """ requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_words' not in name]) + if 'words_to_words' not in name]) if len(requires_grads) == 1: return requires_grads.pop() else: @@ -244,8 +236,8 @@ class StaticEmbedding(TokenEmbedding): continue param.requires_grad = value - def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', unknown='', normalize=True, - error='ignore', init_method=None): + def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', unknown='', + normalize=True, error='ignore', init_method=None): """ 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 word2vec(第一行只有两个元素)还是glove格式的数据。 @@ -329,11 +321,6 @@ class ContextualEmbedding(TokenEmbedding): """ 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 - Example:: - - >>> - - :param datasets: DataSet对象 :param batch_size: int, 生成cache的sentence表示时使用的batch的大小 :param device: 参考 :class::fastNLP.Trainer 的device @@ -363,7 +350,7 @@ class ContextualEmbedding(TokenEmbedding): seq_len = words.ne(pad_index).sum(dim=-1) max_len = words.size(1) # 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。 - seq_len_from_behind =(max_len - seq_len).tolist() + seq_len_from_behind = (max_len - seq_len).tolist() word_embeds = self(words).detach().cpu().numpy() for b in range(words.size(0)): length = seq_len_from_behind[b] @@ -446,9 +433,6 @@ class ElmoEmbedding(ContextualEmbedding): self.layers = layers # 根据model_dir_or_name检查是否存在并下载 - PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz', - 'cn': 'elmo_cn-5e9b34e2.tar.gz'} - if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: PRETRAIN_URL = _get_base_url('elmo') model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] @@ -532,21 +516,8 @@ class BertEmbedding(ContextualEmbedding): def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False): super(BertEmbedding, self).__init__(vocab) - # 根据model_dir_or_name检查是否存在并下载 - PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip', - 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', - 'en-base-cased': 'bert-base-cased-f89bfe08.zip', - 'en-large-uncased': 'bert-large-uncased-20939f45.zip', - 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', - - 'cn': 'bert-base-chinese-29d0a84a.zip', - 'cn-base': 'bert-base-chinese-29d0a84a.zip', - - 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', - 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', - 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', - } + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: PRETRAIN_URL = _get_base_url('bert') model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index 139b1d4f..4868598a 100644 --- 
a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -6,31 +6,58 @@ from typing import Union, Dict from fastNLP.core.const import Const from fastNLP.core.vocabulary import Vocabulary -from fastNLP.core.dataset import DataSet from fastNLP.io.base_loader import DataInfo -from fastNLP.io.dataset_loader import JsonLoader -from fastNLP.io.file_utils import _get_base_url, cached_path +from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader +from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from fastNLP.modules.encoder._bert import BertTokenizer -class MatchingLoader(JsonLoader): +class MatchingLoader(DataSetLoader): """ 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` 读取Matching任务的数据集 """ - def __init__(self, fields=None, paths: dict=None): - super(MatchingLoader, self).__init__(fields=fields) + def __init__(self, paths: dict=None): + """ + :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 + """ self.paths = paths def _load(self, path): - return super(MatchingLoader, self)._load(path) - - def process(self, paths: Union[str, Dict[str, str]], dataset_name=None, - to_lower=False, char_information=False, seq_len_type: str=None, - bert_tokenizer: str=None, get_index=True, set_input: Union[list, str, bool]=True, + """ + :param str path: 待读取数据集的路径名 + :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 + 的原始字符串文本,第三个为标签 + """ + raise NotImplementedError + + def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, + to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, + get_index=True, set_input: Union[list, str, bool]=True, set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: + """ + :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, + 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 + 对应的全路径文件名。 + :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 + 这个数据集的名字,如果不定义则默认为train。 + :param bool to_lower: 是否将文本自动转为小写。默认值为False。 + :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : + 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 + attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len + :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 + :param bool get_index: 是否需要根据词表将文本转为index + :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False + 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, + 于此同时其他field不会被设置为input。默认值为True。 + :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 + :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 + 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 + 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. 
+ :return: + """ if isinstance(set_input, str): set_input = [set_input] if isinstance(set_target, str): @@ -69,19 +96,6 @@ class MatchingLoader(JsonLoader): is_input=auto_set_input) if bert_tokenizer is not None: - PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip', - 'en-base-uncased': 'bert-base-uncased-3413b23c.zip', - 'en-base-cased': 'bert-base-cased-f89bfe08.zip', - 'en-large-uncased': 'bert-large-uncased-20939f45.zip', - 'en-large-cased': 'bert-large-cased-e0cf90fc.zip', - - 'cn': 'bert-base-chinese-29d0a84a.zip', - 'cn-base': 'bert-base-chinese-29d0a84a.zip', - - 'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', - 'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', - 'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', - } if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: PRETRAIN_URL = _get_base_url('bert') model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] @@ -128,14 +142,14 @@ class MatchingLoader(JsonLoader): for fields in data_set.get_field_names(): if Const.INPUT in fields: data_set.apply(lambda x: len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.TARGET), + new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), is_input=auto_set_input) elif seq_len_type == 'mask': for data_name, data_set in data_info.datasets.items(): for fields in data_set.get_field_names(): if Const.INPUT in fields: data_set.apply(lambda x: [1] * len(x[fields]), - new_field_name=fields.replace(Const.INPUT, Const.TARGET), + new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), is_input=auto_set_input) elif seq_len_type == 'bert': for data_name, data_set in data_info.datasets.items(): @@ -152,11 +166,18 @@ class MatchingLoader(JsonLoader): if bert_tokenizer is not None: words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') + with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: + lines = f.readlines() + lines = [line.strip() for line in lines] + words_vocab.add_word_lst(lines) + words_vocab.build_vocab() else: words_vocab = Vocabulary() - words_vocab = words_vocab.from_dataset(*data_set_list, - field_name=[n for n in data_set_list[0].get_field_names() - if (Const.INPUT in n)]) + words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], + field_name=[n for n in data_set_list[0].get_field_names() + if (Const.INPUT in n)], + no_create_entry_dataset=[d for n, d in data_info.datasets.items() + if 'train' not in n]) target_vocab = Vocabulary(padding=None, unknown=None) target_vocab = target_vocab.from_dataset(*data_set_list, field_name=Const.TARGET) data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} @@ -173,14 +194,14 @@ class MatchingLoader(JsonLoader): for data_name, data_set in data_info.datasets.items(): if isinstance(set_input, list): - data_set.set_input(set_input) + data_set.set_input(*set_input) if isinstance(set_target, list): - data_set.set_target(set_target) + data_set.set_target(*set_target) return data_info -class SNLILoader(MatchingLoader): +class SNLILoader(MatchingLoader, JsonLoader): """ 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` @@ -203,10 +224,13 @@ class SNLILoader(MatchingLoader): 'train': 'snli_1.0_train.jsonl', 'dev': 'snli_1.0_dev.jsonl', 'test': 'snli_1.0_test.jsonl'} - super(SNLILoader, self).__init__(fields=fields, paths=paths) + # super(SNLILoader, self).__init__(fields=fields, paths=paths) + MatchingLoader.__init__(self, paths=paths) + JsonLoader.__init__(self, 
fields=fields)
 
     def _load(self, path):
-        ds = super(SNLILoader, self)._load(path)
+        # ds = super(SNLILoader, self)._load(path)
+        ds = JsonLoader._load(self, path)
 
         def parse_tree(x):
             t = Tree.fromstring(x)
             return t.leaves()
diff --git a/reproduction/matching/matching_esim.py b/reproduction/matching/matching_esim.py
new file mode 100644
index 00000000..3da6141f
--- /dev/null
+++ b/reproduction/matching/matching_esim.py
@@ -0,0 +1,65 @@
+
+import argparse
+import torch
+
+from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
+from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding
+
+from reproduction.matching.data.MatchingDataLoader import SNLILoader
+from reproduction.matching.model.esim import ESIMModel
+
+argument = argparse.ArgumentParser()
+argument.add_argument('--embedding', choices=['glove', 'elmo'], default='glove')
+argument.add_argument('--batch-size-per-gpu', type=int, default=128)
+argument.add_argument('--n-epochs', type=int, default=100)
+argument.add_argument('--lr', type=float, default=1e-4)
+argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='seq_len')
+argument.add_argument('--save-dir', type=str, default=None)
+arg = argument.parse_args()
+
+bert_dirs = 'path/to/bert/dir'
+
+# load data set
+data_info = SNLILoader().process(
+    paths='path/to/snli/data/dir', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
+    get_index=True, concat=False,
+)
+
+# load embedding
+if arg.embedding == 'elmo':
+    embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
+elif arg.embedding == 'glove':
+    embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
+else:
+    raise ValueError(f'now we only support elmo or glove embedding for esim model!')
+
+# define model
+model = ESIMModel(embedding)
+
+# define trainer
+trainer = Trainer(train_data=data_info.datasets['train'], model=model,
+                  optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
+                  batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
+                  n_epochs=arg.n_epochs, print_every=-1,
+                  dev_data=data_info.datasets['dev'],
+                  metrics=AccuracyMetric(), metric_key='acc',
+                  device=[i for i in range(torch.cuda.device_count())],
+                  check_code_level=-1,
+                  save_path=arg.save_dir)
+
+# train model
+trainer.train(load_best_model=True)
+
+# define tester
+tester = Tester(
+    data=data_info.datasets['test'],
+    model=model,
+    metrics=AccuracyMetric(),
+    batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
+    device=[i for i in range(torch.cuda.device_count())],
+)
+
+# test model
+tester.test()
+
+
diff --git a/reproduction/matching/model/esim.py b/reproduction/matching/model/esim.py
index 0551bbdb..d55034e7 100644
--- a/reproduction/matching/model/esim.py
+++ b/reproduction/matching/model/esim.py
@@ -30,24 +30,37 @@ class ESIMModel(BaseModel):
         self.bi_attention = SoftmaxAttention()
         self.rnn_high = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
-        # self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True,)
         self.classifier = nn.Sequential(nn.Dropout(p=dropout_rate),
                                         nn.Linear(8 * hidden_size, hidden_size),
                                         nn.Tanh(),
                                         nn.Dropout(p=dropout_rate),
                                         nn.Linear(hidden_size, num_labels))
+
+        self.dropout_rnn = nn.Dropout(p=dropout_rate)
+
         nn.init.xavier_uniform_(self.classifier[1].weight.data)
         nn.init.xavier_uniform_(self.classifier[4].weight.data)
 
     def forward(self, words1, words2, seq_len1,
seq_len2, target=None): - mask1 = seq_len_to_mask(seq_len1) - mask2 = seq_len_to_mask(seq_len2) + """ + :param words1: [batch, seq_len] + :param words2: [batch, seq_len] + :param seq_len1: [batch] + :param seq_len2: [batch] + :param target: + :return: + """ + mask1 = seq_len_to_mask(seq_len1, words1.size(1)) + mask2 = seq_len_to_mask(seq_len2, words2.size(1)) a0 = self.embedding(words1) # B * len * emb_dim b0 = self.embedding(words2) a0, b0 = self.dropout_embed(a0), self.dropout_embed(b0) a = self.rnn(a0, mask1.byte()) # a: [B, PL, 2 * H] b = self.rnn(b0, mask2.byte()) + # a = self.dropout_rnn(self.rnn(a0, seq_len1)[0]) # a: [B, PL, 2 * H] + # b = self.dropout_rnn(self.rnn(b0, seq_len2)[0]) ai, bi = self.bi_attention(a, mask1, b, mask2) @@ -58,6 +71,8 @@ class ESIMModel(BaseModel): a_h = self.rnn_high(a_f, mask1.byte()) # ma: [B, PL, 2 * H] b_h = self.rnn_high(b_f, mask2.byte()) + # a_h = self.dropout_rnn(self.rnn_high(a_f, seq_len1)[0]) # ma: [B, PL, 2 * H] + # b_h = self.dropout_rnn(self.rnn_high(b_f, seq_len2)[0]) a_avg = self.mean_pooling(a_h, mask1, dim=1) a_max, _ = self.max_pooling(a_h, mask1, dim=1) From bc5e071253c2a13ef055d13ac6b88f57bc7038e0 Mon Sep 17 00:00:00 2001 From: xuyige Date: Mon, 24 Jun 2019 21:56:14 +0800 Subject: [PATCH 31/34] Delete matching.py --- reproduction/matching/matching.py | 44 ------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 reproduction/matching/matching.py diff --git a/reproduction/matching/matching.py b/reproduction/matching/matching.py deleted file mode 100644 index 8251b3bc..00000000 --- a/reproduction/matching/matching.py +++ /dev/null @@ -1,44 +0,0 @@ -import os - -import torch - -from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const - -from fastNLP.io.dataset_loader import MatchingLoader - -from reproduction.matching.model.bert import BertForNLI -from reproduction.matching.model.esim import ESIMModel - - -bert_dirs = 'path/to/bert/dir' - -# load data set -# data_info = MatchingLoader(data_format='snli', for_model='bert', bert_dir=bert_dirs).process(... 
-data_info = MatchingLoader(data_format='snli', for_model='esim').process( - {'train': './data/snli/snli_1.0_train.jsonl', - 'dev': './data/snli/snli_1.0_dev.jsonl', - 'test': './data/snli/snli_1.0_test.jsonl'}, - input_field=[Const.TARGET] -) - -# model = BertForNLI(bert_dir=bert_dirs) -model = ESIMModel(data_info.embeddings['elmo'],) - -trainer = Trainer(train_data=data_info.datasets['train'], model=model, - optimizer=Adam(lr=1e-4, model_params=model.parameters()), - batch_size=torch.cuda.device_count() * 24, n_epochs=20, print_every=-1, - dev_data=data_info.datasets['dev'], - metrics=AccuracyMetric(), metric_key='acc', device=[i for i in range(torch.cuda.device_count())], - check_code_level=-1) -trainer.train(load_best_model=True) - -tester = Tester( - data=data_info.datasets['test'], - model=model, - metrics=AccuracyMetric(), - batch_size=torch.cuda.device_count() * 12, - device=[i for i in range(torch.cuda.device_count())], -) -tester.test() - - From 50faa936b44193b2dc44c356e68a8d8b45119f1a Mon Sep 17 00:00:00 2001 From: xuyige Date: Tue, 25 Jun 2019 17:22:10 +0800 Subject: [PATCH 32/34] add RTE and QNLI loader --- .../matching/data/MatchingDataLoader.py | 107 +++++++++++++++--- 1 file changed, 93 insertions(+), 14 deletions(-) diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index 4868598a..0e4e1283 100644 --- a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -1,13 +1,12 @@ import os -from nltk import Tree from typing import Union, Dict from fastNLP.core.const import Const from fastNLP.core.vocabulary import Vocabulary from fastNLP.io.base_loader import DataInfo -from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader +from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader, CSVLoader from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from fastNLP.modules.encoder._bert import BertTokenizer @@ -35,7 +34,7 @@ class MatchingLoader(DataSetLoader): def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, - get_index=True, set_input: Union[list, str, bool]=True, + cut_text: int = None, get_index=True, set_input: Union[list, str, bool]=True, set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: """ :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, @@ -48,6 +47,7 @@ class MatchingLoader(DataSetLoader): 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 + :param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 :param bool get_index: 是否需要根据词表将文本转为index :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, @@ -161,6 +161,13 @@ class MatchingLoader(DataSetLoader): data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) + if cut_text is not None: + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): + data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, + is_input=auto_set_input) + data_set_list = [d for n, d in data_info.datasets.items()] assert 
len(data_set_list) > 0, f'There are NO data sets in data info!' @@ -216,32 +223,104 @@ class SNLILoader(MatchingLoader, JsonLoader): def __init__(self, paths: dict=None): fields = { - 'sentence1_parse': Const.INPUTS(0), - 'sentence2_parse': Const.INPUTS(1), + 'sentence1_binary_parse': Const.INPUTS(0), + 'sentence2_binary_parse': Const.INPUTS(1), 'gold_label': Const.TARGET, } paths = paths if paths is not None else { 'train': 'snli_1.0_train.jsonl', 'dev': 'snli_1.0_dev.jsonl', 'test': 'snli_1.0_test.jsonl'} - # super(SNLILoader, self).__init__(fields=fields, paths=paths) MatchingLoader.__init__(self, paths=paths) JsonLoader.__init__(self, fields=fields) def _load(self, path): - # ds = super(SNLILoader, self)._load(path) ds = JsonLoader._load(self, path) - def parse_tree(x): - t = Tree.fromstring(x) - return t.leaves() + parentheses_table = str.maketrans({'(': None, ')': None}) - ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) + ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(0)) + ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(1)) ds.drop(lambda x: x[Const.TARGET] == '-') return ds +class RTELoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader` + + 读取RTE数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict=None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev': 'dev.tsv', + # 'test': 'test.tsv' # test set has not label + } + MatchingLoader.__init__(self, paths=paths) + self.fields = { + 'sentence1': Const.INPUTS(0), + 'sentence2': Const.INPUTS(1), + 'label': Const.TARGET, + } + CSVLoader.__init__(self, sep='\t') + + def _load(self, path): + ds = CSVLoader._load(self, path) + + for k, v in self.fields.items(): + ds.rename_field(k, v) + for fields in ds.get_all_fields(): + if Const.INPUT in fields: + ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) + + return ds + + +class QNLILoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader` + + 读取QNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict=None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev': 'dev.tsv', + # 'test': 'test.tsv' # test set has not label + } + MatchingLoader.__init__(self, paths=paths) + self.fields = { + 'question': Const.INPUTS(0), + 'sentence': Const.INPUTS(1), + 'label': Const.TARGET, + } + CSVLoader.__init__(self, sep='\t') + + def _load(self, path): + ds = CSVLoader._load(self, path) + + for k, v in self.fields.items(): + ds.rename_field(k, v) + for fields in ds.get_all_fields(): + if Const.INPUT in fields: + ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) + + return ds From 40c4d216d19ebf02515607e8d0e649d2cc781ca5 Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 26 Jun 2019 13:39:14 +0800 Subject: [PATCH 33/34] =?UTF-8?q?=E4=BF=AE=E6=94=B9staticEmbedding?= =?UTF-8?q?=E7=9A=84=E5=88=9D=E5=A7=8B=E5=8C=96=E6=96=B9=E5=BC=8F=EF=BC=8C?= 
=?UTF-8?q?=E6=98=BE=E7=A4=BA=E9=80=9A=E8=BF=87=E8=BF=99=E7=A7=8D=E5=88=9D?= =?UTF-8?q?=E5=A7=8B=E5=8C=96=E5=9C=A8esmi=E4=B8=8A=E7=9A=84snli=E6=9B=B4?= =?UTF-8?q?=E5=AE=B9=E6=98=93=E8=BE=BE=E5=88=B088=E7=9A=84test=20acc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/modules/encoder/embedding.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index a58668da..c48cb806 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -180,11 +180,11 @@ class StaticEmbedding(TokenEmbedding): 的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, `en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 :param requires_grad: 是否需要gradient. 默认为True - :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用torch.nn.init.xavier_uniform_ - 。调用该方法时传入一个tensor对象。 - + :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 + :param normailize: 是否对vector进行normalize,使得每个vector的norm为1。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None): + def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None, + normalize=False): super(StaticEmbedding, self).__init__(vocab) # 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server, @@ -202,7 +202,8 @@ class StaticEmbedding(TokenEmbedding): raise ValueError(f"Cannot recognize {model_dir_or_name}.") # 读取embedding - embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method) + embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method, + normalize=normalize) self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], padding_idx=vocab.padding_idx, max_norm=None, norm_type=2, scale_grad_by_freq=False, @@ -257,10 +258,7 @@ class StaticEmbedding(TokenEmbedding): assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." 
if not os.path.exists(embed_filepath): raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) - if init_method is None: - init_method = nn.init.xavier_uniform_ with open(embed_filepath, 'r', encoding='utf-8') as f: - found_count = 0 line = f.readline().strip() parts = line.split() start_idx = 0 @@ -271,7 +269,8 @@ class StaticEmbedding(TokenEmbedding): dim = len(parts) - 1 f.seek(0) matrix = torch.zeros(len(vocab), dim) - init_method(matrix) + if init_method is not None: + init_method(matrix) hit_flags = np.zeros(len(vocab), dtype=bool) for idx, line in enumerate(f, start_idx): try: @@ -286,7 +285,6 @@ class StaticEmbedding(TokenEmbedding): if word in vocab: index = vocab.to_index(word) matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) - found_count += 1 hit_flags[index] = True except Exception as e: if error == 'ignore': @@ -294,7 +292,16 @@ class StaticEmbedding(TokenEmbedding): else: print("Error occurred at the {} line.".format(idx)) raise e + found_count = sum(hit_flags) print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + if init_method is None: + if len(vocab)-found_count>0 and found_count>0: # 有的没找到 + found_vecs = matrix[torch.LongTensor(hit_flags.astype(int)).byte()] + mean = found_vecs.mean(dim=0, keepdim=True) + std = found_vecs.std(dim=0, keepdim=True) + unfound_vec_num = np.sum(hit_flags==False) + unfound_vecs = torch.randn(unfound_vec_num, dim)*std + mean + matrix[torch.LongTensor(hit_flags.astype(int)).eq(0)] = unfound_vecs if normalize: matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12) From 9c1b4914d8f4fda018f449cf5374941b1fa03c9d Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 30 Jun 2019 09:52:01 +0800 Subject: [PATCH 34/34] =?UTF-8?q?1.=E4=BF=AE=E5=A4=8Dtrainer=E4=B8=AD?= =?UTF-8?q?=E6=BD=9C=E5=9C=A8=E5=A4=9A=E6=AD=A5=E6=9B=B4=E6=96=B0bug;=202.?= =?UTF-8?q?=20LSTM=E7=9A=84=E6=95=B0=E6=8D=AE=E5=B9=B6=E8=A1=8C=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=EF=BC=9B3.=20embed=5Floader=E4=B8=ADbug=E4=BF=AE?= =?UTF-8?q?=E5=A4=8D,=20=E4=B8=94=E5=85=81=E8=AE=B8=E6=89=8B=E5=8A=A8?= =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=EF=BC=9B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/callback.py | 2 +- fastNLP/core/dataset.py | 6 ++++-- fastNLP/core/optimizer.py | 17 +++++++++++++++++ fastNLP/core/trainer.py | 6 +++--- fastNLP/io/embed_loader.py | 33 +++++++++++++++++++-------------- fastNLP/modules/encoder/lstm.py | 11 +---------- fastNLP/modules/utils.py | 2 ++ setup.py | 2 +- 8 files changed, 48 insertions(+), 31 deletions(-) diff --git a/fastNLP/core/callback.py b/fastNLP/core/callback.py index 483f6dc1..5dfd889b 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -548,7 +548,7 @@ class LRScheduler(Callback): else: raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. 
Got {type(lr_scheduler)}.") - def on_epoch_begin(self): + def on_epoch_end(self): self.scheduler.step(self.epoch) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 4cd1ad9c..b7df9dec 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -801,17 +801,19 @@ class DataSet(object): else: return DataSet() - def split(self, ratio): + def split(self, ratio, shuffle=True): """ 将DataSet按照ratio的比例拆分,返回两个DataSet :param float ratio: 0 [N,L,C] - output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first) + output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len) _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) if self.batch_first: output = output[unsort_idx] else: output = output[:, unsort_idx] - # 解决LSTM无法在DataParallel下使用的问题问题https://github.com/pytorch/pytorch/issues/1591 - if self.batch_first: - if output.size(1) < max_len: - dummy_tensor = output.new_zeros(batch_size, max_len - output.size(1), output.size(-1)) - output = torch.cat([output, dummy_tensor], 1) - else: - if output.size(0) < max_len: - dummy_tensor = output.new_zeros(max_len - output.size(1), batch_size, output.size(-1)) - output = torch.cat([output, dummy_tensor], 0) else: output, hx = self.lstm(x, hx) return output, hx diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index c87f3a68..3c6a3d27 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -82,6 +82,8 @@ def get_embeddings(init_embed): if isinstance(init_embed, tuple): res = nn.Embedding( num_embeddings=init_embed[0], embedding_dim=init_embed[1]) + nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), + b=np.sqrt(3/res.weight.data.size(1))) elif isinstance(init_embed, nn.Module): res = init_embed elif isinstance(init_embed, torch.Tensor): diff --git a/setup.py b/setup.py index 49646761..0dbef455 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ with open('requirements.txt', encoding='utf-8') as f: setup( name='FastNLP', - version='0.4.0', + version='dev0.5.0', description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team', long_description=readme, long_description_content_type='text/markdown',