Merge pull request #5 from fastnlp/dev0.5.0

Dev0.5.0
Tag: v0.4.10
Committed by lyhuang18 via GitHub, 6 years ago
Parent commit: 2e4d84d950
59 changed files with 2621 additions and 1254 deletions
  1. fastNLP/__init__.py (+5 / -1)
  2. fastNLP/core/__init__.py (+1 / -1)
  3. fastNLP/core/batch.py (+154 / -149)
  4. fastNLP/core/callback.py (+1 / -1)
  5. fastNLP/core/dataset.py (+5 / -3)
  6. fastNLP/core/field.py (+42 / -31)
  7. fastNLP/core/losses.py (+50 / -42)
  8. fastNLP/core/metrics.py (+65 / -30)
  9. fastNLP/core/optimizer.py (+17 / -0)
  10. fastNLP/core/predictor.py (+2 / -3)
  11. fastNLP/core/tester.py (+15 / -4)
  12. fastNLP/core/trainer.py (+80 / -56)
  13. fastNLP/core/utils.py (+9 / -3)
  14. fastNLP/core/vocabulary.py (+56 / -14)
  15. fastNLP/io/__init__.py (+2 / -2)
  16. fastNLP/io/base_loader.py (+8 / -0)
  17. fastNLP/io/dataset_loader.py (+7 / -2)
  18. fastNLP/io/embed_loader.py (+25 / -18)
  19. fastNLP/io/file_reader.py (+8 / -5)
  20. fastNLP/io/file_utils.py (+33 / -2)
  21. fastNLP/modules/decoder/crf.py (+15 / -5)
  22. fastNLP/modules/encoder/__init__.py (+10 / -2)
  23. fastNLP/modules/encoder/_bert.py (+455 / -119)
  24. fastNLP/modules/encoder/_elmo.py (+16 / -2)
  25. fastNLP/modules/encoder/bert.py (+86 / -372)
  26. fastNLP/modules/encoder/embedding.py (+205 / -86)
  27. fastNLP/modules/encoder/lstm.py (+21 / -7)
  28. fastNLP/modules/utils.py (+2 / -0)
  29. reproduction/Biaffine_parser/run.py (+2 / -5)
  30. reproduction/POS_tagging/train_pos_tag.py (+5 / -5)
  31. reproduction/Star_transformer/train.py (+4 / -8)
  32. reproduction/matching/data/MatchingDataLoader.py (+326 / -0)
  33. reproduction/matching/data/SNLIDataLoader.py (+0 / -6)
  34. reproduction/matching/matching_esim.py (+65 / -0)
  35. reproduction/matching/model/esim.py (+197 / -0)
  36. reproduction/matching/snli.py (+0 / -97)
  37. reproduction/matching/test/test_snlidataloader.py (+2 / -2)
  38. reproduction/seqence_labelling/cws/train_shift_relay.py (+3 / -7)
  39. reproduction/seqence_labelling/ner/__init__.py (+0 / -0)
  40. reproduction/seqence_labelling/ner/data/Conll2003Loader.py (+93 / -0)
  41. reproduction/seqence_labelling/ner/data/OntoNoteLoader.py (+152 / -0)
  42. reproduction/seqence_labelling/ner/data/utils.py (+49 / -0)
  43. reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py (+56 / -0)
  44. reproduction/seqence_labelling/ner/test/__init__.py (+0 / -0)
  45. reproduction/seqence_labelling/ner/test/test.py (+33 / -0)
  46. reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py (+70 / -0)
  47. reproduction/seqence_labelling/ner/train_ontonote.py (+65 / -0)
  48. reproduction/utils.py (+3 / -2)
  49. setup.py (+1 / -1)
  50. test/core/test_batch.py (+10 / -10)
  51. test/core/test_callbacks.py (+27 / -73)
  52. test/core/test_metrics.py (+9 / -1)
  53. test/core/test_trainer.py (+11 / -48)
  54. test/core/test_utils.py (+9 / -0)
  55. test/core/test_vocabulary.py (+18 / -0)
  56. test/models/model_runner.py (+2 / -5)
  57. test/models/test_biaffine_parser.py (+0 / -1)
  58. test/modules/decoder/test_CRF.py (+5 / -5)
  59. test/test_tutorials.py (+9 / -18)

fastNLP/__init__.py (+5 / -1)

@@ -12,7 +12,11 @@ fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的
__all__ = [
"Instance",
"FieldArray",
"Batch",

"DataSetIter",
"BatchIter",
"TorchLoaderIter",

"Vocabulary",
"DataSet",
"Const",


fastNLP/core/__init__.py (+1 / -1)

@@ -14,7 +14,7 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa
介绍core 的子模块的分工,好像必要性不大
"""
from .batch import Batch
from .batch import DataSetIter, BatchIter, TorchLoaderIter
from .callback import Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC
from .const import Const
from .dataset import DataSet


fastNLP/core/batch.py (+154 / -149)

@@ -3,7 +3,9 @@ batch 模块实现了 fastNLP 所需的 Batch 类。

"""
__all__ = [
"Batch"
"BatchIter",
"DataSetIter",
"TorchLoaderIter",
]

import atexit
@@ -12,9 +14,11 @@ from queue import Empty, Full
import numpy as np
import torch
import torch.multiprocessing as mp
import torch.utils.data
from numbers import Number

from .sampler import RandomSampler
from .sampler import SequentialSampler
from .dataset import DataSet

_python_is_exit = False

@@ -27,162 +31,163 @@ def _set_python_is_exit():
atexit.register(_set_python_is_exit)


class Batch(object):
"""
别名::class:`fastNLP.Batch` :class:`fastNLP.core.batch.Batch`

Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出,
组成 `x` 和 `y`::

batch = Batch(data_set, batch_size=16, sampler=SequentialSampler())
num_batch = len(batch)
for batch_x, batch_y in batch:
# do stuff ...

:param dataset: :class:`~fastNLP.DataSet` 对象, 数据集
:param int batch_size: 取出的batch大小
:param sampler: 规定使用的 :class:`~fastNLP.Sampler` 方式. 若为 ``None`` , 使用 :class:`~fastNLP.RandomSampler`.
Default: ``None``
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 :class:`torch.Tensor`.
Default: ``False``
:param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch.
Default: ``False``
"""
def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False):
class DataSetGetter:
def __init__(self, dataset: DataSet, as_numpy=False):
self.dataset = dataset
self.batch_size = batch_size
if sampler is None:
sampler = RandomSampler()
self.sampler = sampler
self.inputs = {n: f for n, f in dataset.get_all_fields().items() if f.is_input}
self.targets = {n: f for n, f in dataset.get_all_fields().items() if f.is_target}
self.as_numpy = as_numpy
self.idx_list = None
self.curidx = 0
self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
self.cur_batch_indices = None
self.prefetch = prefetch
self.lengths = 0
def fetch_one(self):
if self.curidx >= len(self.idx_list):
return None
self.idx_list = list(range(len(dataset)))

def __getitem__(self, idx: int):
# mapping idx to sampled idx
idx = self.idx_list[idx]
inputs = {n:f.get(idx) for n, f in self.inputs.items()}
targets = {n:f.get(idx) for n, f in self.targets.items()}
return idx, inputs, targets

def __len__(self):
return len(self.dataset)

def collate_fn(self, batch: list):
batch_x = {n:[] for n in self.inputs.keys()}
batch_y = {n:[] for n in self.targets.keys()}
indices = []
for idx, x, y in batch:
indices.append(idx)
for n, v in x.items():
batch_x[n].append(v)
for n, v in y.items():
batch_y[n].append(v)

def pad_batch(batch_dict, field_array):
for n, vlist in batch_dict.items():
f = field_array[n]
if f.padder is None:
batch_dict[n] = np.array(vlist)
else:
data = f.pad(vlist)
if not self.as_numpy:
try:
data, flag = _to_tensor(data, f.dtype)
except TypeError as e:
print(f"Field {n} cannot be converted to torch.tensor.")
raise e
batch_dict[n] = data
return batch_dict

return (indices,
pad_batch(batch_x, self.inputs),
pad_batch(batch_y, self.targets))

def set_idx_list(self, idx_list):
if len(idx_list) != len(self.idx_list):
raise ValueError
self.idx_list = idx_list

def __getattr__(self, item):
if hasattr(self.dataset, item):
return getattr(self.dataset, item)
else:
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
batch_x, batch_y = {}, {}
indices = self.idx_list[self.curidx:endidx]
self.cur_batch_indices = indices
for field_name, field in self.dataset.get_all_fields().items():
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy and \
field.dtype is not None and \
issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor):
batch = _to_tensor(batch)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
batch_x[field_name] = batch
self.curidx = endidx
return batch_x, batch_y
raise AttributeError("'DataSetGetter' object has no attribute '{}'".format(item))


class SamplerAdapter(torch.utils.data.Sampler):
def __init__(self, sampler, dataset):
self.sampler = sampler
self.dataset = dataset

def __iter__(self):
"""
Iterate on dataset, fetch batch data. Fetch process don't block the iterate process
:return:
"""
if self.prefetch:
return self._run_batch_iter(self)
def batch_iter():
self.init_iter()
while 1:
res = self.fetch_one()
if res is None:
break
yield res
return batch_iter()
return iter(self.sampler(self.dataset))


class BatchIter:
def __init__(self):
self.dataiter = None
self.num_batches = None
self.cur_batch_indices = None
self.batch_size = None

def init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
pass

@staticmethod
def get_num_batches(num_samples, batch_size, drop_last):
num_batches = num_samples // batch_size
if not drop_last and (num_samples % batch_size > 0):
num_batches += 1
return num_batches

def __iter__(self):
self.init_iter()
for indices, batch_x, batch_y in self.dataiter:
self.cur_batch_indices = indices
yield batch_x, batch_y

def get_batch_indices(self):
return self.cur_batch_indices

def __len__(self):
return self.num_batches
def get_batch_indices(self):
"""
取得当前batch在DataSet中所在的index下标序列

:return list(int) indexes: 下标序列
"""
return self.cur_batch_indices
@staticmethod
def _run_fetch(batch, q):
try:
global _python_is_exit
batch.init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
# print('fetch one')
while 1:
try:
q.put(res, timeout=3)
break
except Full:
if _python_is_exit:
return
if res is None:
# print('fetch done, waiting processing')
break
# print('fetch exit')
except Exception as e:
q.put(e)
finally:
q.join()
@staticmethod
def _run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=Batch._run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
while 1:
try:
res = q.get(timeout=1)
q.task_done()
# print('get fetched')
if res is None:
break
elif isinstance(res, Exception):
raise res
yield res
except Empty as e:
if fetch_p.is_alive():
continue
else:
break
fetch_p.terminate()
fetch_p.join()
# print('iter done')
@property
def dataset(self):
return self.dataiter.dataset


class DataSetIter(BatchIter):
def __init__(self, dataset, batch_size=1, sampler=None, as_numpy=False,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None):
super().__init__()
assert isinstance(dataset, DataSet)
sampler = SamplerAdapter(sampler=sampler or SequentialSampler(), dataset=dataset)
dataset = DataSetGetter(dataset, as_numpy)
collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None
self.dataiter = torch.utils.data.DataLoader(
dataset=dataset, batch_size=batch_size, sampler=sampler,
collate_fn=collate_fn, num_workers=num_workers,
pin_memory=pin_memory, drop_last=drop_last,
timeout=timeout, worker_init_fn=worker_init_fn)
self.num_batches = self.get_num_batches(len(dataset), batch_size, drop_last)
self.batch_size = batch_size


class TorchLoaderIter(BatchIter):
def __init__(self, dataset):
super().__init__()
assert isinstance(dataset, torch.utils.data.DataLoader)
self.dataiter = dataset
self.num_batches = self.get_num_batches(len(dataset), dataset.batch_size, dataset.drop_last)
self.batch_size = dataset.batch_size


def _to_tensor(batch):
class OnlineDataGettter:
# TODO
pass


class OnlineDataIter(BatchIter):
# TODO
def __init__(self, dataset, batch_size=1, buffer_size=10000, sampler=None, as_numpy=False,
num_workers=0, pin_memory=False, drop_last=False,
timeout=0, worker_init_fn=None, **kwargs):
super().__init__()


def _to_tensor(batch, field_dtype):
try:
if issubclass(batch.dtype.type, np.floating):
batch = torch.as_tensor(batch).float() # 默认使用float32
if field_dtype is not None and isinstance(field_dtype, type)\
and issubclass(field_dtype, Number) \
and not isinstance(batch, torch.Tensor):
if issubclass(batch.dtype.type, np.floating):
new_batch = torch.as_tensor(batch).float() # 默认使用float32
elif issubclass(batch.dtype.type, np.integer):
new_batch = torch.as_tensor(batch).long() # 复用内存地址,避免复制
else:
new_batch = torch.as_tensor(batch)
return new_batch, True
else:
batch = torch.as_tensor(batch) # 复用内存地址,避免复制
except:
pass
return batch
return batch, False
except Exception as e:
raise e
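
For reference, a minimal usage sketch of the new iterator API. The toy dataset and field names are hypothetical; DataSetIter is assumed to be importable from the top-level fastNLP package as shown in the __init__.py diff above.

from fastNLP import DataSet, DataSetIter
from fastNLP.core.sampler import SequentialSampler

# hypothetical toy dataset with one input field and one target field
ds = DataSet({'x': [[1, 2], [3, 4, 5]], 'y': [0, 1]})
ds.set_input('x')
ds.set_target('y')

batch_iter = DataSetIter(ds, batch_size=2, sampler=SequentialSampler(), as_numpy=False)
for batch_x, batch_y in batch_iter:
    # padded torch tensors, assembled by DataSetGetter.collate_fn above
    print(batch_x['x'].shape, batch_y['y'])

An existing torch.utils.data.DataLoader can likewise be wrapped with TorchLoaderIter so that the Trainer consumes it through the same BatchIter interface.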

fastNLP/core/callback.py (+1 / -1)

@@ -548,7 +548,7 @@ class LRScheduler(Callback):
else:
raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")
def on_epoch_begin(self):
def on_epoch_end(self):
self.scheduler.step(self.epoch)
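
The LRScheduler callback now steps its scheduler at on_epoch_end instead of on_epoch_begin. A hedged sketch of how it is typically wired up; model and train_data are placeholder objects, not part of this diff.

from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from fastNLP import Trainer
from fastNLP.core.callback import LRScheduler

optimizer = SGD(model.parameters(), lr=0.1)                       # `model` is a placeholder nn.Module
scheduler_cb = LRScheduler(StepLR(optimizer, step_size=3, gamma=0.5))
trainer = Trainer(train_data, model, optimizer=optimizer,         # `train_data` is a placeholder DataSet
                  callbacks=[scheduler_cb])
trainer.train()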




fastNLP/core/dataset.py (+5 / -3)

@@ -801,17 +801,19 @@ class DataSet(object):
else:
return DataSet()
def split(self, ratio):
def split(self, ratio, shuffle=True):
"""
将DataSet按照ratio的比例拆分,返回两个DataSet

:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `ratio` 这么多数据,第二个DataSet拥有 `(1-ratio)` 这么多数据
:param float ratio: 0<ratio<1, 返回的第一个DataSet拥有 `(1-ratio)` 这么多数据,第二个DataSet拥有`ratio`这么多数据
:param bool shuffle: 在split前是否shuffle一下
:return: [DataSet, DataSet]
"""
assert isinstance(ratio, float)
assert 0 < ratio < 1
all_indices = [_ for _ in range(len(self))]
np.random.shuffle(all_indices)
if shuffle:
np.random.shuffle(all_indices)
split = int(ratio * len(self))
dev_indices = all_indices[:split]
train_indices = all_indices[split:]
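
A short usage sketch matching the corrected docstring: the first returned DataSet keeps roughly (1-ratio) of the data and the second gets ratio. `ds` is a hypothetical DataSet.

train_ds, dev_ds = ds.split(0.1)                    # ~90% / ~10%, shuffled by default
train_ds2, dev_ds2 = ds.split(0.1, shuffle=False)   # keep the original instance order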


fastNLP/core/field.py (+42 / -31)

@@ -176,7 +176,10 @@ class FieldArray:
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim)
return self.pad(contents)

def pad(self, contents):
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype, dim=self._cell_ndim)

def set_padder(self, padder):
"""
@@ -239,7 +242,7 @@ class FieldArray:
new_contents.append(cell.split(sep))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def int(self, inplace:bool=True):
@@ -279,7 +282,7 @@ class FieldArray:
new_contents.append(float(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def bool(self, inplace=True):
@@ -299,7 +302,7 @@ class FieldArray:
new_contents.append(bool(cell))
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e

return self._after_process(new_contents, inplace=inplace)

@@ -320,7 +323,7 @@ class FieldArray:
new_contents.append(cell.lower())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def upper(self, inplace=True):
@@ -340,7 +343,7 @@ class FieldArray:
new_contents.append(cell.upper())
except Exception as e:
print(f"Exception happens when process value in index {index}.")
print(e)
raise e
return self._after_process(new_contents, inplace=inplace)

def value_count(self):
@@ -350,8 +353,15 @@ class FieldArray:
:return: Counter, key是label,value是出现次数
"""
count = Counter()

def cum(cell):
if _is_iterable(cell) and not isinstance(cell, str):
for cell_ in cell:
cum(cell_)
else:
count[cell] += 1
for cell in self.content:
count[cell] += 1
cum(cell)
return count

def _after_process(self, new_contents, inplace):
@@ -385,6 +395,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
:return:
"""
if isinstance(cell, (str, Number, np.bool_)):
if hasattr(cell, 'dtype'):
return cell.dtype.type, dim
return type(cell), dim
elif isinstance(cell, list):
dim += 1
@@ -402,7 +414,7 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
return cell.dtype, cell.dim() + dim # 如果是torch.mean的结果是0
elif isinstance(cell, np.ndarray):
if cell.dtype != np.dtype('O'): # 如果不是object的话说明是well-formatted的了
return cell.dtype.type, cell.ndim + dim
return cell.dtype.type, cell.ndim + dim # dtype.type返回的会是np.int32, np.float等
# 否则需要继续往下iterate
dim += 1
res = [_get_ele_type_and_dim(cell_i, dim) for cell_i in cell]
@@ -527,31 +539,30 @@ class AutoPadder(Padder):
if field_ele_dtype:
if dim>3:
return np.array(contents)
if isinstance(field_ele_dtype, np.dtype) or field_ele_dtype in (float, int, bool, str):
if isinstance(field_ele_dtype, np.number) or field_ele_dtype in (float, int, bool):
if dim==0:
if isinstance(field_ele_dtype, type) and \
(issubclass(field_ele_dtype, np.number) or issubclass(field_ele_dtype, Number)):
if dim==0:
array = np.array(contents, dtype=field_ele_dtype)
elif dim==1:
max_len = max(map(len, contents))
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
elif dim==2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
for j, content_ii in enumerate(content_i):
array[i, j, :len(content_ii)] = content_ii
else:
shape = np.shape(contents)
if len(shape)==4: # 说明各dimension是相同的大小
array = np.array(contents, dtype=field_ele_dtype)
elif dim==1:
max_len = max(map(len, contents))
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
array[i, :len(content_i)] = content_i
elif dim==2:
max_len = max(map(len, contents))
max_word_len = max([max([len(content_ii) for content_ii in content_i]) for
content_i in contents])
array = np.full((len(contents), max_len, max_word_len), self.pad_val, dtype=field_ele_dtype)
for i, content_i in enumerate(contents):
for j, content_ii in enumerate(content_i):
array[i, j, :len(content_ii)] = content_ii
else:
shape = np.shape(contents)
if len(shape)==4: # 说明各dimension是相同的大小
array = np.array(contents, dtype=field_ele_dtype)
else:
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return array
return np.array(contents)
raise RuntimeError(f"Field:{field_name} has 3 dimensions, every sample should have the same shape.")
return array
elif str(field_ele_dtype).startswith('torch'):
if dim==0:
tensor = torch.tensor(contents).to(field_ele_dtype)
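
One effect of the new recursive cum() helper is that value_count() also works for fields whose cells are lists. A small sketch with toy data, assuming DataSet.get_field returns the underlying FieldArray.

from fastNLP import DataSet

ds = DataSet({'words': [['a', 'b'], ['b', 'c', 'c']]})
counter = ds.get_field('words').value_count()
print(counter)   # Counter({'b': 2, 'c': 2, 'a': 1}), nested cells are flattened by cum()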


fastNLP/core/losses.py (+50 / -42)

@@ -26,7 +26,7 @@ from .utils import _build_args
from .utils import _check_arg_dict_list
from .utils import _check_function_or_method
from .utils import _get_func_signature
from .utils import seq_len_to_mask

class LossBase(object):
"""
@@ -34,14 +34,23 @@ class LossBase(object):
"""
def __init__(self):
self.param_map = {}
self._param_map = {} # key是fun的参数,value是以该值从传入的dict取出value
self._checked = False

@property
def param_map(self):
if len(self._param_map) == 0: # 如果为空说明还没有初始化
func_spect = inspect.getfullargspec(self.get_loss)
func_args = [arg for arg in func_spect.args if arg != 'self']
for arg in func_args:
self._param_map[arg] = arg
return self._param_map

def get_loss(self, *args, **kwargs):
raise NotImplementedError
def _init_param_map(self, key_map=None, **kwargs):
"""检查key_map和其他参数map,并将这些映射关系添加到self.param_map
"""检查key_map和其他参数map,并将这些映射关系添加到self._param_map

:param dict key_map: 表示key的映射关系
:param kwargs: key word args里面的每一个的键-值对都会被构造成映射关系
@@ -53,30 +62,30 @@ class LossBase(object):
raise TypeError("key_map must be `dict`, got {}.".format(type(key_map)))
for key, value in key_map.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(key, str):
raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.")
if not isinstance(value, str):
raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for key, value in kwargs.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(value, str):
raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for value, key_set in value_counter.items():
if len(key_set) > 1:
raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.")
# check consistence between signature and param_map
# check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.get_loss)
func_args = [arg for arg in func_spect.args if arg != 'self']
for func_param, input_param in self.param_map.items():
for func_param, input_param in self._param_map.items():
if func_param not in func_args:
raise NameError(
f"Parameter `{func_param}` is not in {_get_func_signature(self.get_loss)}. Please check the "
@@ -96,7 +105,7 @@ class LossBase(object):
:return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping.
"""
fast_param = {}
if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
if len(self._param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
fast_param['pred'] = list(pred_dict.values())[0]
fast_param['target'] = list(target_dict.values())[0]
return fast_param
@@ -115,49 +124,41 @@ class LossBase(object):
return loss
if not self._checked:
# 1. check consistence between signature and param_map
# 1. check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.get_loss)
func_args = set([arg for arg in func_spect.args if arg != 'self'])
for func_arg, input_arg in self.param_map.items():
for func_arg, input_arg in self._param_map.items():
if func_arg not in func_args:
raise NameError(f"`{func_arg}` not in {_get_func_signature(self.get_loss)}.")
# 2. only part of the param_map are passed, left are not
# 2. only part of the _param_map are passed, left are not
for arg in func_args:
if arg not in self.param_map:
self.param_map[arg] = arg # This param does not need mapping.
if arg not in self._param_map:
self._param_map[arg] = arg # This param does not need mapping.
self._evaluate_args = func_args
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()}
# need to wrap inputs in dict.
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()}

mapped_pred_dict = {}
mapped_target_dict = {}
duplicated = []
for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())):
not_duplicate_flag = 0
if input_arg in self._reverse_param_map:
mapped_arg = self._reverse_param_map[input_arg]
not_duplicate_flag += 1
else:
mapped_arg = input_arg
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict:
mapped_pred_dict[mapped_arg] = pred_dict[input_arg]
not_duplicate_flag += 1
if input_arg in target_dict:
mapped_target_dict[mapped_arg] = target_dict[input_arg]
not_duplicate_flag += 1
if not_duplicate_flag == 3:
duplicated.append(input_arg)
# missing
if not self._checked:
duplicated = []
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict and input_arg in target_dict:
duplicated.append(input_arg)
check_res = _check_arg_dict_list(self.get_loss, [mapped_pred_dict, mapped_target_dict])
# replace missing.
missing = check_res.missing
replaced_missing = list(missing)
for idx, func_arg in enumerate(missing):
# Don't delete `` in this information, nor add ``
replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \
replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \
f"in `{self.__class__.__name__}`)"
check_res = _CheckRes(missing=replaced_missing,
@@ -170,6 +171,8 @@ class LossBase(object):
if check_res.missing or check_res.duplicated:
raise _CheckError(check_res=check_res,
func_signature=_get_func_signature(self.get_loss))
self._checked = True

refined_args = _build_args(self.get_loss, **mapped_pred_dict, **mapped_target_dict)
loss = self.get_loss(**refined_args)
@@ -204,15 +207,12 @@ class LossFunc(LossBase):
super(LossFunc, self).__init__()
_check_function_or_method(func)
self.get_loss = func
if key_map is not None:
if not isinstance(key_map, dict):
raise RuntimeError(f"Loss error: key_map except a {type({})} but got a {type(key_map)}")
self.param_map = key_map
if len(kwargs) > 0:
for key, val in kwargs.items():
self.param_map.update({key: val})
self._init_param_map(key_map, **kwargs)
self.get_loss = func


class CrossEntropyLoss(LossBase):
@@ -223,7 +223,9 @@ class CrossEntropyLoss(LossBase):
:param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred`
:param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` -> `target`
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容
:param seq_len: 句子的长度, 长度之外的token不会计算loss。。
:param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替
传入seq_len.

Example::

@@ -231,13 +233,19 @@ class CrossEntropyLoss(LossBase):
"""
def __init__(self, pred=None, target=None, padding_idx=-100):
# TODO 需要做一些检查,F.cross_entropy在计算时,如果pred是(16, 10 ,4), target的形状按道理应该是(16, 10), 但实际需要(16,4)
def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100):
super(CrossEntropyLoss, self).__init__()
self._init_param_map(pred=pred, target=target)
self._init_param_map(pred=pred, target=target, seq_len=seq_len)
self.padding_idx = padding_idx
def get_loss(self, pred, target):
def get_loss(self, pred, target, seq_len=None):
if pred.dim()>2:
pred = pred.view(-1, pred.size(-1))
target = target.view(-1)
if seq_len is not None:
mask = seq_len_to_mask(seq_len).view(-1).eq(0)
target = target.masked_fill(mask, self.padding_idx)

return F.cross_entropy(input=pred, target=target,
ignore_index=self.padding_idx)
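
With the new seq_len argument, positions beyond each sample's length are rewritten to padding_idx and therefore ignored by F.cross_entropy. A hedged configuration sketch; the key names are the defaults shown in the diff.

from fastNLP.core.losses import CrossEntropyLoss

loss = CrossEntropyLoss(pred='pred', target='target', seq_len='seq_len', padding_idx=-100)
# the Trainer fetches `pred` from the model output and `target`/`seq_len` from the batch,
# masks positions past seq_len with padding_idx, then calls F.cross_entropy(..., ignore_index=-100)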



fastNLP/core/metrics.py (+65 / -30)

@@ -115,9 +115,18 @@ class MetricBase(object):
"""
def __init__(self):
self.param_map = {} # key is param in function, value is input param.
self._param_map = {} # key is param in function, value is input param.
self._checked = False

@property
def param_map(self):
if len(self._param_map) == 0: # 如果为空说明还没有初始化
func_spect = inspect.getfullargspec(self.evaluate)
func_args = [arg for arg in func_spect.args if arg != 'self']
for arg in func_args:
self._param_map[arg] = arg
return self._param_map

@abstractmethod
def evaluate(self, *args, **kwargs):
raise NotImplementedError
@@ -127,7 +136,7 @@ class MetricBase(object):
raise NotImplemented
def _init_param_map(self, key_map=None, **kwargs):
"""检查key_map和其他参数map,并将这些映射关系添加到self.param_map
"""检查key_map和其他参数map,并将这些映射关系添加到self._param_map

:param dict key_map: 表示key的映射关系
:param kwargs: key word args里面的每一个的键-值对都会被构造成映射关系
@@ -139,30 +148,30 @@ class MetricBase(object):
raise TypeError("key_map must be `dict`, got {}.".format(type(key_map)))
for key, value in key_map.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(key, str):
raise TypeError(f"key in key_map must be `str`, not `{type(key)}`.")
if not isinstance(value, str):
raise TypeError(f"value in key_map must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for key, value in kwargs.items():
if value is None:
self.param_map[key] = key
self._param_map[key] = key
continue
if not isinstance(value, str):
raise TypeError(f"in {key}={value}, value must be `str`, not `{type(value)}`.")
self.param_map[key] = value
self._param_map[key] = value
value_counter[value].add(key)
for value, key_set in value_counter.items():
if len(key_set) > 1:
raise ValueError(f"Several parameters:{key_set} are provided with one output {value}.")
# check consistence between signature and param_map
# check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.evaluate)
func_args = [arg for arg in func_spect.args if arg != 'self']
for func_param, input_param in self.param_map.items():
for func_param, input_param in self._param_map.items():
if func_param not in func_args:
raise NameError(
f"Parameter `{func_param}` is not in {_get_func_signature(self.evaluate)}. Please check the "
@@ -177,7 +186,7 @@ class MetricBase(object):
:return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping.
"""
fast_param = {}
if len(self.param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
if len(self._param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1:
fast_param['pred'] = list(pred_dict.values())[0]
fast_param['target'] = list(target_dict.values())[0]
return fast_param
@@ -206,42 +215,35 @@ class MetricBase(object):
if not self._checked:
if not callable(self.evaluate):
raise TypeError(f"{self.__class__.__name__}.evaluate has to be callable, not {type(self.evaluate)}.")
# 1. check consistence between signature and param_map
# 1. check consistence between signature and _param_map
func_spect = inspect.getfullargspec(self.evaluate)
func_args = set([arg for arg in func_spect.args if arg != 'self'])
for func_arg, input_arg in self.param_map.items():
for func_arg, input_arg in self._param_map.items():
if func_arg not in func_args:
raise NameError(f"`{func_arg}` not in {_get_func_signature(self.evaluate)}.")
# 2. only part of the param_map are passed, left are not
# 2. only part of the _param_map are passed, left are not
for arg in func_args:
if arg not in self.param_map:
self.param_map[arg] = arg # This param does not need mapping.
if arg not in self._param_map:
self._param_map[arg] = arg # This param does not need mapping.
self._evaluate_args = func_args
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self.param_map.items()}
self._reverse_param_map = {input_arg: func_arg for func_arg, input_arg in self._param_map.items()}
# need to wrap inputs in dict.
mapped_pred_dict = {}
mapped_target_dict = {}
duplicated = []
for input_arg in set(list(pred_dict.keys()) + list(target_dict.keys())):
not_duplicate_flag = 0
if input_arg in self._reverse_param_map:
mapped_arg = self._reverse_param_map[input_arg]
not_duplicate_flag += 1
else:
mapped_arg = input_arg
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict:
mapped_pred_dict[mapped_arg] = pred_dict[input_arg]
not_duplicate_flag += 1
if input_arg in target_dict:
mapped_target_dict[mapped_arg] = target_dict[input_arg]
not_duplicate_flag += 1
if not_duplicate_flag == 3:
duplicated.append(input_arg)
# missing
if not self._checked:
duplicated = []
for input_arg, mapped_arg in self._reverse_param_map.items():
if input_arg in pred_dict and input_arg in target_dict:
duplicated.append(input_arg)
check_res = _check_arg_dict_list(self.evaluate, [mapped_pred_dict, mapped_target_dict])
# only check missing.
# replace missing.
@@ -249,7 +251,7 @@ class MetricBase(object):
replaced_missing = list(missing)
for idx, func_arg in enumerate(missing):
# Don't delete `` in this information, nor add ``
replaced_missing[idx] = f"{self.param_map[func_arg]}" + f"(assign to `{func_arg}` " \
replaced_missing[idx] = f"{self._param_map[func_arg]}" + f"(assign to `{func_arg}` " \
f"in `{self.__class__.__name__}`)"
check_res = _CheckRes(missing=replaced_missing,
@@ -262,10 +264,10 @@ class MetricBase(object):
if check_res.missing or check_res.duplicated:
raise _CheckError(check_res=check_res,
func_signature=_get_func_signature(self.evaluate))
self._checked = True
refined_args = _build_args(self.evaluate, **mapped_pred_dict, **mapped_target_dict)
self.evaluate(**refined_args)
self._checked = True
return

@@ -411,6 +413,37 @@ def _bmeso_tag_to_spans(tags, ignore_labels=None):
]


def _bioes_tag_to_spans(tags, ignore_labels=None):
"""
给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'E-singer', 'O', 'O']。
返回[('singer', (1, 4))] (左闭右开区间)

:param tags: List[str],
:param ignore_labels: List[str], 在该list中的label将被忽略
:return: List[Tuple[str, List[int, int]]]. [(label,[start, end])]
"""
ignore_labels = set(ignore_labels) if ignore_labels else set()

spans = []
prev_bioes_tag = None
for idx, tag in enumerate(tags):
tag = tag.lower()
bioes_tag, label = tag[:1], tag[2:]
if bioes_tag in ('b', 's'):
spans.append((label, [idx, idx]))
elif bioes_tag in ('i', 'e') and prev_bioes_tag in ('b', 'i') and label == spans[-1][0]:
spans[-1][1][1] = idx
elif bioes_tag == 'o':
pass
else:
spans.append((label, [idx, idx]))
prev_bioes_tag = bioes_tag
return [(span[0], (span[1][0], span[1][1] + 1))
for span in spans
if span[0] not in ignore_labels
]


def _bio_tag_to_spans(tags, ignore_labels=None):
"""
给定一个tags的lis,比如['O', 'B-singer', 'I-singer', 'I-singer', 'O', 'O']。
@@ -471,7 +504,7 @@ class SpanFPreRecMetric(MetricBase):
:param str pred: 用该key在evaluate()时从传入dict中取出prediction数据。 为None,则使用'pred'取数据
:param str target: 用该key在evaluate()时从传入dict中取出target数据。 为None,则使用'target'取数据
:param str seq_len: 用该key在evaluate()时从传入dict中取出sequence length数据。为None,则使用'seq_len'取数据。
:param str encoding_type: 目前支持bio, bmes
:param str encoding_type: 目前支持bio, bmes, bmeso, bioes
:param list ignore_labels: str 组成的list. 这个list中的class不会被用于计算。例如在POS tagging时传入['NN'],则不会计算'NN'这
个label
:param bool only_gross: 是否只计算总的f1, precision, recall的值;如果为False,不仅返回总的f1, pre, rec, 还会返回每个
@@ -499,6 +532,8 @@ class SpanFPreRecMetric(MetricBase):
self.tag_to_span_func = _bio_tag_to_spans
elif self.encoding_type == 'bmeso':
self.tag_to_span_func = _bmeso_tag_to_spans
elif self.encoding_type == 'bioes':
self.tag_to_span_func = _bioes_tag_to_spans
else:
raise ValueError("Only support 'bio', 'bmes', 'bmeso' type.")
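
With the new _bioes_tag_to_spans helper, SpanFPreRecMetric also accepts encoding_type='bioes'. A hedged sketch; tag_vocab is a hypothetical Vocabulary over the BIOES tag set.

from fastNLP.core.metrics import SpanFPreRecMetric

metric = SpanFPreRecMetric(tag_vocab=tag_vocab, encoding_type='bioes')
# e.g. the tag sequence ['O', 'B-singer', 'I-singer', 'E-singer', 'O', 'O']
# decodes to the single span ('singer', (1, 4)), as in the docstring above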


fastNLP/core/optimizer.py (+17 / -0)

@@ -36,6 +36,23 @@ class Optimizer(object):
"""
return [param for param in params if param.requires_grad]

class NullOptimizer(Optimizer):
"""
当不希望Trainer更新optimizer时,传入本optimizer,但请确保通过callback的方式对参数进行了更新。

"""
def __init__(self):
super().__init__(None)

def construct_from_pytorch(self, model_params):
pass

def __getattr__(self, item):
def pass_func(*args, **kwargs):
pass

return pass_func


class SGD(Optimizer):
"""


fastNLP/core/predictor.py (+2 / -3)

@@ -6,7 +6,7 @@ from collections import defaultdict

import torch

from . import Batch
from . import DataSetIter
from . import DataSet
from . import SequentialSampler
from .utils import _build_args
@@ -44,8 +44,7 @@ class Predictor(object):

self.network.eval()
batch_output = defaultdict(list)
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False,
prefetch=False)
data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)

if hasattr(self.network, "predict"):
predict_func = self.network.predict


fastNLP/core/tester.py (+15 / -4)

@@ -37,7 +37,7 @@ import warnings
import torch
import torch.nn as nn

from .batch import Batch
from .batch import BatchIter, DataSetIter
from .dataset import DataSet
from .metrics import _prepare_metrics
from .sampler import SequentialSampler
@@ -82,7 +82,7 @@ class Tester(object):
:param int verbose: 如果为0不输出任何信息; 如果为1,打印出验证结果。
"""
def __init__(self, data, model, metrics, batch_size=16, device=None, verbose=1):
def __init__(self, data, model, metrics, batch_size=16, num_workers=0, device=None, verbose=1):
super(Tester, self).__init__()
if not isinstance(data, DataSet):
@@ -96,6 +96,14 @@ class Tester(object):
self._model = _move_model_to_device(model, device=device)
self.batch_size = batch_size
self.verbose = verbose

if isinstance(data, DataSet):
self.data_iterator = DataSetIter(
dataset=data, batch_size=batch_size, num_workers=num_workers, sampler=SequentialSampler())
elif isinstance(data, BatchIter):
self.data_iterator = data
else:
raise TypeError("data type {} not support".format(type(data)))
# 如果是DataParallel将没有办法使用predict方法
if isinstance(self._model, nn.DataParallel):
@@ -112,7 +120,10 @@ class Tester(object):
raise TypeError(f"`{_model_name}.predict` must be callable to be used "
f"for evaluation, not `{type(self._predict_func)}`.")
else:
self._predict_func = self._model.forward
if isinstance(model, nn.DataParallel):
self._predict_func = self._model.module.forward
else:
self._predict_func = self._model.forward
def test(self):
"""开始进行验证,并返回验证结果。
@@ -124,7 +135,7 @@ class Tester(object):
self._model_device = _get_model_device(self._model)
network = self._model
self._mode(network, is_test=True)
data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
data_iterator = self.data_iterator
eval_results = {}
try:
with torch.no_grad():
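
The Tester now builds its DataSetIter once in __init__ and accepts num_workers. A hedged usage sketch; dev_data and model are placeholder objects.

from fastNLP import Tester
from fastNLP.core.metrics import AccuracyMetric

tester = Tester(data=dev_data, model=model, metrics=AccuracyMetric(),
                batch_size=32, num_workers=2, device='cuda:0', verbose=1)
eval_results = tester.test()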


fastNLP/core/trainer.py (+80 / -56)

@@ -311,8 +311,9 @@ try:
from tqdm.auto import tqdm
except:
from .utils import _pseudo_tqdm as tqdm
import warnings

from .batch import Batch
from .batch import DataSetIter, BatchIter
from .callback import CallbackManager, CallbackException
from .dataset import DataSet
from .losses import _prepare_losser
@@ -320,7 +321,6 @@ from .metrics import _prepare_metrics
from .optimizer import Optimizer
from .sampler import Sampler
from .sampler import RandomSampler
from .sampler import SequentialSampler
from .tester import Tester
from .utils import _CheckError
from .utils import _build_args
@@ -351,6 +351,8 @@ class Trainer(object):
:param int batch_size: 训练和验证的时候的batch大小。
:param loss: 使用的 :class:`~fastNLP.core.losses.LossBase` 对象。当为None时,默认使用 :class:`~fastNLP.LossInForward`
:param sampler: Batch数据生成的顺序, :class:`~fastNLP.Sampler` 类型。如果为None,默认使用 :class:`~fastNLP.RandomSampler`
:param drop_last: 如果最后一个batch没有正好为batch_size这么多数据,就扔掉最后一个batch
:param num_workers: int, 有多少个线程来进行数据pad处理。
:param update_every: int, 多少步更新一次梯度。用于希望累计梯度的场景,比如需要128的batch_size, 但是直接设为128
会导致内存不足,通过设置batch_size=32, update_every=4达到目的。当optimizer为None时,该参数无效。
:param int n_epochs: 需要优化迭代多少次。
@@ -367,7 +369,6 @@ class Trainer(object):
:param int validate_every: 多少个step在验证集上验证一次; 如果为-1,则每个epoch结束验证一次。仅在传入dev_data时有效。
:param str,None save_path: 将模型保存路径。如果为None,则不保存模型。如果dev_data为None,则保存最后一次迭代的模型。
保存的时候不仅保存了参数,还保存了模型结构。即便使用DataParallel,这里也只保存模型。
:param prefetch: bool, 是否使用额外的进程对产生batch数据。理论上会使得Batch迭代更快。
:param bool use_tqdm: 是否使用tqdm来显示训练进度; 如果为False,则将loss打印在终端中。
:param str,int,torch.device,list(int) device: 将模型load到哪个设备。默认为None,即Trainer不对模型
的计算位置进行管理。支持以下的输入:
@@ -394,16 +395,17 @@ class Trainer(object):
"""
def __init__(self, train_data, model, optimizer=None, loss=None,
batch_size=32, sampler=None, update_every=1,
n_epochs=10, print_every=5,
batch_size=32, sampler=None, drop_last=False, update_every=1,
num_workers=0, n_epochs=10, print_every=5,
dev_data=None, metrics=None, metric_key=None,
validate_every=-1, save_path=None,
prefetch=False, use_tqdm=True, device=None,
callbacks=None,
check_code_level=0):
validate_every=-1, save_path=None, use_tqdm=True, device=None, prefetch=False,
callbacks=None, check_code_level=0):
if prefetch and num_workers==0:
num_workers = 1
if prefetch:
warnings.warn("prefetch is deprecated, will be removed in version 0.5.0, please use num_workers instead.")

super(Trainer, self).__init__()
if not isinstance(train_data, DataSet):
raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
if not isinstance(model, nn.Module):
raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")
@@ -430,25 +432,35 @@ class Trainer(object):
if metric_key is not None:
self.increase_better = False if metric_key[0] == "-" else True
self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
elif len(metrics) > 0:
self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')
else:
self.metric_key = None
# prepare loss
losser = _prepare_losser(loss)
# sampler check
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))
if check_code_level > -1:

if sampler is None:
sampler = RandomSampler()

if isinstance(train_data, DataSet):
self.data_iterator = DataSetIter(
dataset=train_data, batch_size=batch_size, num_workers=num_workers, sampler=sampler, drop_last=drop_last)
elif isinstance(train_data, BatchIter):
self.data_iterator = train_data
else:
raise TypeError("train_data type {} not support".format(type(train_data)))

if check_code_level > -1 and isinstance(self.data_iterator, DataSetIter):
_check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
metric_key=metric_key, check_level=check_code_level,
batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))
# _check_code 是 fastNLP 帮助你检查代码是否正确的方法 。如果你在错误栈中看到这行注释,请认真检查你的代码
self.model = _move_model_to_device(model, device=device)

self.train_data = train_data
self.dev_data = dev_data # If None, No validation.
self.model = model
self.losser = losser
self.metrics = metrics
self.n_epochs = int(n_epochs)
@@ -460,26 +472,22 @@ class Trainer(object):
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.sampler = sampler if sampler is not None else RandomSampler()
self.prefetch = prefetch
self.n_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
self.model = _move_model_to_device(self.model, device=device)

if isinstance(optimizer, torch.optim.Optimizer):
self.optimizer = optimizer
elif isinstance(optimizer, Optimizer):
self.optimizer = optimizer.construct_from_pytorch(model.parameters())
self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())
elif optimizer is None:
self.optimizer = torch.optim.Adam(model.parameters(), lr=4e-3)
self.optimizer = torch.optim.Adam(self.model.parameters(), lr=4e-3)
else:
raise TypeError("optimizer can only be torch.optim.Optimizer type, not {}.".format(type(optimizer)))
self.use_tqdm = use_tqdm
self.pbar = None
self.print_every = abs(self.print_every)
if self.dev_data is not None:
self.tester = Tester(model=self.model,
data=self.dev_data,
@@ -493,7 +501,7 @@ class Trainer(object):
self.callback_manager = CallbackManager(env={"trainer": self},
callbacks=callbacks)
def train(self, load_best_model=True, on_exception='auto'):
"""
使用该函数使Trainer开始训练。
@@ -568,12 +576,14 @@ class Trainer(object):
self.step = 0
self.epoch = 0
start = time.time()
if isinstance(self.model, nn.DataParallel):
self._forward_func = self.model.module.forward
else:
self._forward_func = self.model.forward
with inner_tqdm(total=self.n_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
self.pbar = pbar
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
data_iterator = self.data_iterator
self.batch_per_epoch = data_iterator.num_batches
for epoch in range(1, self.n_epochs + 1):
self.epoch = epoch
@@ -605,7 +615,7 @@ class Trainer(object):
if self.step % self.print_every == 0:
avg_loss = float(avg_loss) / self.print_every
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss)
print_output = "loss:{:<6.5f}".format(avg_loss)
pbar.update(self.print_every)
else:
end = time.time()
@@ -669,15 +679,15 @@ class Trainer(object):
"""Perform weight update on a model.

"""
if self.optimizer is not None and (self.step + 1) % self.update_every == 0:
if self.step % self.update_every == 0:
self.optimizer.step()
def _data_forward(self, network, x):
x = _build_args(network.forward, **x)
x = _build_args(self._forward_func, **x)
y = network(**x)
if not isinstance(y, dict):
raise TypeError(
f"The return value of {_get_func_signature(network.forward)} should be dict, got {type(y)}.")
f"The return value of {_get_func_signature(self._forward_func)} should be dict, got {type(y)}.")
return y
def _grad_backward(self, loss):
@@ -687,7 +697,7 @@ class Trainer(object):

For PyTorch, just do "loss.backward()"
"""
if self.step % self.update_every == 0:
if (self.step-1) % self.update_every == 0:
self.model.zero_grad()
loss.backward()
@@ -746,7 +756,9 @@ class Trainer(object):

:return bool value: True means current results on dev set is the best.
"""
indicator_val = _check_eval_results(metrics, self.metric_key, self.metrics)
indicator, indicator_val = _check_eval_results(metrics, self.metric_key, self.metrics)
if self.metric_key is None:
self.metric_key = indicator
is_better = True
if self.best_metric_indicator is None:
# first-time validation
@@ -785,15 +797,34 @@ def _get_value_info(_dict):
strs.append(_str)
return strs


from numbers import Number
from .batch import _to_tensor
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE,
dev_data=None, metric_key=None,
check_level=0):
# check get_loss 方法
model_devcie = model.parameters().__next__().device
model_devcie = _get_model_device(model=model)
batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_count, (batch_x, batch_y) in enumerate(batch):
def _iter():
start_idx = 0
while start_idx<len(dataset):
batch_x = {}
batch_y = {}
for field_name, field in dataset.get_all_fields().items():
indices = list(range(start_idx, min(start_idx+batch_size, len(dataset))))
if field.is_target or field.is_input:
batch = field.get(indices)
if field.dtype is not None and \
issubclass(field.dtype, Number) and not isinstance(batch, torch.Tensor):
batch, _ = _to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
batch_x[field_name] = batch
yield (batch_x, batch_y)
start_idx += batch_size

for batch_count, (batch_x, batch_y) in enumerate(_iter()):
_move_dict_value_to_device(batch_x, batch_y, device=model_devcie)
# forward check
if batch_count == 0:
@@ -815,8 +846,11 @@ def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_
print(info_str)
_check_forward_error(forward_func=model.forward, dataset=dataset,
batch_x=batch_x, check_level=check_level)
refined_batch_x = _build_args(model.forward, **batch_x)
if isinstance(model, nn.DataParallel):
forward_func = model.module.forward
else:
forward_func = model.forward
refined_batch_x = _build_args(forward_func, **batch_x)
pred_dict = model(**refined_batch_x)
func_signature = _get_func_signature(model.forward)
if not isinstance(pred_dict, dict):
@@ -861,26 +895,16 @@ def _check_eval_results(metrics, metric_key, metric_list):
loss, metrics = metrics
if isinstance(metrics, dict):
if len(metrics) == 1:
# only single metric, just use it
metric_dict = list(metrics.values())[0]
metrics_name = list(metrics.keys())[0]
else:
metrics_name = metric_list[0].__class__.__name__
if metrics_name not in metrics:
raise RuntimeError(f"{metrics_name} is chosen to do validation, but got {metrics}")
metric_dict = metrics[metrics_name]
metric_dict = list(metrics.values())[0] # 取第一个metric
if len(metric_dict) == 1:
if metric_key is None:
indicator_val, indicator = list(metric_dict.values())[0], list(metric_dict.keys())[0]
elif len(metric_dict) > 1 and metric_key is None:
raise RuntimeError(
f"Got multiple metric keys: {metric_dict}, but metric_key is not set. Which one to use?")
else:
# metric_key is set
if metric_key not in metric_dict:
raise RuntimeError(f"metric key {metric_key} not found in {metric_dict}")
indicator_val = metric_dict[metric_key]
indicator = metric_key
else:
raise RuntimeError("Invalid metrics type. Expect {}, got {}".format((tuple, dict), type(metrics)))
return indicator_val
return indicator, indicator_val
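
prefetch is deprecated in favour of num_workers, and drop_last/num_workers are forwarded to the DataSetIter. A hedged sketch of the updated constructor; all data, model and metric objects are placeholders.

from fastNLP import Trainer
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric

trainer = Trainer(train_data, model,
                  loss=CrossEntropyLoss(),
                  batch_size=32, num_workers=2, drop_last=False,
                  update_every=4,                 # accumulate gradients over 4 steps
                  n_epochs=10, dev_data=dev_data, metrics=AccuracyMetric(),
                  device='cuda:0')
trainer.train()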

fastNLP/core/utils.py (+9 / -3)

@@ -643,7 +643,7 @@ def _check_forward_error(forward_func, batch_x, dataset, check_level):
warnings.warn(message=_unused_warn)


def seq_len_to_mask(seq_len):
def seq_len_to_mask(seq_len, max_len=None):
"""

将一个表示sequence length的一维数组转换为二维的mask,不包含的位置为0。
@@ -659,20 +659,26 @@ def seq_len_to_mask(seq_len):
>>> mask = seq_len_to_mask(seq_len)
>>> print(mask.shape)
(14, 15)
>>> seq_len = torch.arange(2, 16)
>>> mask = seq_len_to_mask(seq_len, max_len=100)
>>> print(mask.size())
torch.Size([14, 100])

:param np.ndarray,torch.LongTensor seq_len: shape将是(B,)
:param int max_len: 将长度pad到这个长度。默认(None)使用的是seq_len中最长的长度。但在nn.DataParallel的场景下可能不同卡的seq_len会有
区别,所以需要传入一个max_len使得mask的长度是pad到该长度。
:return: np.ndarray or torch.Tensor, shape将是(B, max_length)。 元素类似为bool或torch.uint8
"""
if isinstance(seq_len, np.ndarray):
assert len(np.shape(seq_len)) == 1, f"seq_len can only have one dimension, got {len(np.shape(seq_len))}."
max_len = int(seq_len.max())
max_len = int(max_len) if max_len else int(seq_len.max())
broad_cast_seq_len = np.tile(np.arange(max_len), (len(seq_len), 1))
mask = broad_cast_seq_len < seq_len.reshape(-1, 1)
elif isinstance(seq_len, torch.Tensor):
assert seq_len.dim() == 1, f"seq_len can only have one dimension, got {seq_len.dim() == 1}."
batch_size = seq_len.size(0)
max_len = seq_len.max().long()
max_len = int(max_len) if max_len else seq_len.max().long()
broad_cast_seq_len = torch.arange(max_len).expand(batch_size, -1).to(seq_len)
mask = broad_cast_seq_len.lt(seq_len.unsqueeze(1))
else:
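
A small runnable sketch of the new max_len argument, which keeps mask widths consistent across nn.DataParallel replicas; the import path is assumed from the file shown above.

import torch
from fastNLP.core.utils import seq_len_to_mask

seq_len = torch.tensor([3, 5, 2])
words = torch.zeros(3, 7, dtype=torch.long)                 # a padded batch of width 7
mask = seq_len_to_mask(seq_len, max_len=words.size(1))      # pad the mask to the same width
print(mask.shape)                                           # torch.Size([3, 7])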


fastNLP/core/vocabulary.py (+56 / -14)

@@ -4,10 +4,11 @@ __all__ = [
]

from functools import wraps
from collections import Counter
from collections import Counter, defaultdict
from .dataset import DataSet
from .utils import Option

from functools import partial
import numpy as np

class VocabularyOption(Option):
def __init__(self,
@@ -89,7 +90,9 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
# 用于承载不需要单独创建entry的词语,具体见from_dataset()方法
self._no_create_word = defaultdict(int)

@_check_build_status
def update(self, word_lst):
"""依次增加序列中词在词典中的出现频率
@@ -240,8 +243,12 @@ class Vocabulary(object):
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
def from_dataset(self, *datasets, field_name):

@property
def _no_create_word_length(self):
return len(self._no_create_word)

def from_dataset(self, *datasets, field_name, no_create_entry_dataset=None):
"""
使用dataset的对应field中词构建词典::

@@ -253,6 +260,13 @@ class Vocabulary(object):
构建词典所使用的 field(s), 支持一个或多个field
若有多个 DataSet, 每个DataSet都必须有这些field.
目前仅支持的field结构: ``str`` , ``list(str)`` , ``list(list(str))``
:param no_create_entry_dataset: 可以传入DataSet, List[DataSet]或者None(默认),该选项用在接下来的模型会使用pretrain
的embedding(包括glove, word2vec, elmo与bert)且会finetune的情况。如果仅使用来自于train的数据建立vocabulary,会导致test与dev
中的数据无法充分利用到来自于预训练embedding的信息,所以在建立词表的时候将test与dev考虑进来会使得最终的结果更好。
如果一个词出现在了train中,但是没在预训练模型中,embedding会为它用unk初始化,但它是单独的一个vector,如果
finetune embedding的话,这个词在更新之后可能会有更好的表示; 而如果这个词仅出现在了dev或test中,那么就不能为它们单独建立vector,
而应该让它指向unk这个vector的值。所以只位于no_create_entry_dataset中的token,将首先从预训练的词表中寻找它的表示,
如果找到了,就使用该表示; 如果没有找到,则认为该词的表示应该为unk的表示。
:return self:
"""
if isinstance(field_name, str):
@@ -260,19 +274,28 @@ class Vocabulary(object):
elif not isinstance(field_name, list):
raise TypeError('invalid argument field_name: {}'.format(field_name))
def construct_vocab(ins):
def construct_vocab(ins, no_create_entry=False):
for fn in field_name:
field = ins[fn]
if isinstance(field, str):
if no_create_entry and field not in self.word_count:
self._no_create_word[field] += 1
self.add_word(field)
elif isinstance(field, list):
if not isinstance(field[0], list):
self.add_word_lst(field)
elif isinstance(field, (list, np.ndarray)):
if not isinstance(field[0], (list, np.ndarray)):
for word in field:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)
else:
if isinstance(field[0][0], list):
if isinstance(field[0][0], (list, np.ndarray)):
raise RuntimeError("Only support field with 2 dimensions.")
[self.add_word_lst(w) for w in field]
for words in field:
for word in words:
if no_create_entry and word not in self.word_count:
self._no_create_word[word] += 1
self.add_word(word)

for idx, dataset in enumerate(datasets):
if isinstance(dataset, DataSet):
try:
@@ -281,9 +304,27 @@ class Vocabulary(object):
print("When processing the `{}` dataset, the following error occurred.".format(idx))
raise e
else:
raise RuntimeError("Only DataSet type is allowed.")
raise TypeError("Only DataSet type is allowed.")

if no_create_entry_dataset is not None:
partial_construct_vocab = partial(construct_vocab, no_create_entry=True)
if isinstance(no_create_entry_dataset, DataSet):
no_create_entry_dataset.apply(partial_construct_vocab)
elif isinstance(no_create_entry_dataset, list):
for dataset in no_create_entry_dataset:
if not isinstance(dataset, DataSet):
raise TypeError("Only DataSet type is allowed.")
dataset.apply(partial_construct_vocab)
return self

def _is_word_no_create_entry(self, word):
"""
判断当前的word是否是不需要创建entry的,具体参见from_dataset的说明
:param word: str
:return: bool
"""
return word in self._no_create_word

def to_index(self, w):
"""
将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出
@@ -338,6 +379,7 @@ class Vocabulary(object):
self.word2idx = None
self.idx2word = None
self.rebuild = True
self._no_create_word.clear()
def __getstate__(self):
"""Use to prepare data for pickle.


fastNLP/io/__init__.py (+2 / -2)

@@ -26,6 +26,6 @@ __all__ = [
]

from .embed_loader import EmbedLoader
from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \
PeopleDailyCorpusLoader, Conll2003Loader
from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, \
SNLILoader, SSTLoader, PeopleDailyCorpusLoader, Conll2003Loader
from .model_io import ModelLoader, ModelSaver

fastNLP/io/base_loader.py (+8 / -0)

@@ -124,6 +124,14 @@ class DataInfo:
self.embeddings = embeddings or {}
self.datasets = datasets or {}

def __repr__(self):
_str = 'In total {} datasets:\n'.format(len(self.datasets))
for name, dataset in self.datasets.items():
_str += '\t{} has {} instances.\n'.format(name, len(dataset))
_str += 'In total {} vocabs:\n'.format(len(self.vocabs))
for name, vocab in self.vocabs.items():
_str += '\t{} has {} entries.\n'.format(name, len(vocab))
return _str

class DataSetLoader:
"""


fastNLP/io/dataset_loader.py (+7 / -2)

@@ -22,13 +22,17 @@ __all__ = [
'Conll2003Loader',
]

import os
from nltk import Tree
from typing import Union, Dict
from ..core.vocabulary import Vocabulary
from ..core.dataset import DataSet
from ..core.instance import Instance
from .file_reader import _read_csv, _read_json, _read_conll
from .base_loader import DataSetLoader
from .base_loader import DataSetLoader, DataInfo
from .data_loader.sst import SSTLoader
from ..core.const import Const
from ..modules.encoder._bert import BertTokenizer


class PeopleDailyCorpusLoader(DataSetLoader):
@@ -115,7 +119,8 @@ class ConllLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.dataset_loader.ConllLoader`

读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html
读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为
该符号在conll 2003中被用为文档分割符。

列号从0开始, 每列对应内容为::



fastNLP/io/embed_loader.py (+25 / -18)

@@ -38,7 +38,8 @@ class EmbedLoader(BaseLoader):
super(EmbedLoader, self).__init__()

@staticmethod
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, error='ignore'):
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
error='ignore', init_method=None):
"""
从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是
word2vec(第一行只有两个元素)还是glove格式的数据。
@@ -52,6 +53,7 @@ class EmbedLoader(BaseLoader):
:param bool normalize: 是否将每个vector归一化到norm为1
:param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。
这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。
:param callable init_method: 传入numpy.ndarray, 返回numpy.ndarray, 用以初始化embedding
:return numpy.ndarray: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。
"""
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
@@ -69,10 +71,13 @@ class EmbedLoader(BaseLoader):
dim = len(parts) - 1
f.seek(0)
matrix = np.random.randn(len(vocab), dim).astype(dtype)
if init_method:
matrix = init_method(matrix)
for idx, line in enumerate(f, start_idx):
try:
parts = line.strip().split()
word = parts[0]
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# 对齐unk与pad
if word==padding and vocab.padding is not None:
word = vocab.padding
@@ -80,7 +85,7 @@ class EmbedLoader(BaseLoader):
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
matrix[index] = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)
hit_flags[index] = True
except Exception as e:
if error == 'ignore':
@@ -90,14 +95,15 @@ class EmbedLoader(BaseLoader):
raise e
total_hits = sum(hit_flags)
print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab)))
found_vectors = matrix[hit_flags]
if len(found_vectors) != 0:
mean = np.mean(found_vectors, axis=0, keepdims=True)
std = np.std(found_vectors, axis=0, keepdims=True)
unfound_vec_num = len(vocab) - total_hits
r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
matrix[hit_flags == False] = r_vecs
if init_method is None:
found_vectors = matrix[hit_flags]
if len(found_vectors) != 0:
mean = np.mean(found_vectors, axis=0, keepdims=True)
std = np.std(found_vectors, axis=0, keepdims=True)
unfound_vec_num = len(vocab) - total_hits
r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
matrix[hit_flags == False] = r_vecs

if normalize:
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
@@ -135,10 +141,11 @@ class EmbedLoader(BaseLoader):
for idx, line in enumerate(f, start=start):
try:
parts = line.strip().split()
word = parts[0]
if dim == -1:
dim = len(parts) - 1
vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
word = ''.join(parts[:-dim])
nums = parts[-dim:]
vec = np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)
vec_dict[word] = vec
vocab.add_word(word)
if unknown is not None and unknown == word:
@@ -155,13 +162,17 @@ class EmbedLoader(BaseLoader):
if dim == -1:
raise RuntimeError("{} is an empty file.".format(embed_filepath))
matrix = np.random.randn(len(vocab), dim).astype(dtype)
for key, vec in vec_dict.items():
index = vocab.to_index(key)
matrix[index] = vec

if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
start_idx = 0
if padding is not None:
start_idx += 1
if unknown is not None:
start_idx += 1
mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
std = np.std(matrix[start_idx:], axis=0, keepdims=True)
if (unknown is not None and not found_unknown):
@@ -169,10 +180,6 @@ class EmbedLoader(BaseLoader):
if (padding is not None and not found_pad):
matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean
for key, vec in vec_dict.items():
index = vocab.to_index(key)
matrix[index] = vec
if normalize:
matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
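
A minimal sketch of the new ``init_method`` hook (the toy vocabulary and the GloVe path are placeholders): the callable receives the randomly initialized numpy matrix and returns the matrix used for all rows before the found pretrained vectors are written in; when it is supplied, the usual mean/std resampling of missing rows is skipped::

    import numpy as np
    from fastNLP import Vocabulary
    from fastNLP.io import EmbedLoader

    vocab = Vocabulary()
    vocab.add_word_lst(['the', 'cat', 'sat'])

    def uniform_init(matrix):
        # initialize every row uniformly; rows of found words are overwritten afterwards
        return np.random.uniform(-0.1, 0.1, matrix.shape).astype(matrix.dtype)

    embedding = EmbedLoader.load_with_vocab('path/to/glove.6B.50d.txt', vocab,
                                            init_method=uniform_init)
    print(embedding.shape)  # (len(vocab), 50)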


+ 8
- 5
fastNLP/io/file_reader.py View File

@@ -90,11 +90,12 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
return sample
with open(path, 'r', encoding=encoding) as f:
sample = []
start = next(f)
if '-DOCSTART-' not in start:
start = next(f).strip()
if '-DOCSTART-' not in start and start!='':
sample.append(start.split())
for line_idx, line in enumerate(f, 1):
if line.startswith('\n'):
line = line.strip()
if line=='':
if len(sample):
try:
res = parse_conll(sample)
@@ -107,7 +108,8 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
elif line.startswith('#'):
continue
else:
sample.append(line.split())
if not line.startswith('-DOCSTART-'):
sample.append(line.split())
if len(sample) > 0:
try:
res = parse_conll(sample)
@@ -115,4 +117,5 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
except Exception as e:
if dropna:
return
raise ValueError('invalid instance at line: {}'.format(line_idx))
print('invalid instance at line: {}'.format(line_idx))
raise e
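
For reference, a typical CoNLL 2003-style fragment looks like the following; with this change the "-DOCSTART-" row and the blank separator lines only delimit documents and sentences and are never emitted as token rows::

    -DOCSTART- -X- -X- O

    EU NNP B-NP B-ORG
    rejects VBZ B-VP O
    German JJ B-NP B-MISC
    call NN I-NP O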

+ 33
- 2
fastNLP/io/file_utils.py View File

@@ -10,10 +10,41 @@ import shutil
import hashlib


PRETRAINED_BERT_MODEL_DIR = {
'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

PRETRAINED_ELMO_MODEL_DIR = {
'en': 'elmo_en-d39843fe.tar.gz',
'cn': 'elmo_cn-5e9b34e2.tar.gz'
}

PRETRAIN_STATIC_FILES = {
'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
'en-fasttext': "cc.en.300.vec-d53187b2.gz",
'cn': "tencent_cn-dab24577.tar.gz",
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
}


def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path:
"""
Given a url or a file name (it may be a concrete file name or a file), first check whether the file exists under cache_dir; if not, download it and
place the file into
place the file into cache_dir
"""
if cache_dir is None:
dataset_cache = Path(get_defalt_path())
@@ -88,7 +119,7 @@ def split_filename_suffix(filepath):
def get_from_cache(url: str, cache_dir: Path = None) -> Path:
"""
Try to find the resource identified by url in cache_dir; if it is not found, download it from url and put the result under cache_dir. The cache name is inferred from the url.
If the resource downloaded from url contains multiple files after extraction, the path of the directory is returned; if there is only one resource, its concrete path is returned
If the resource downloaded from url contains multiple files after extraction, the path of the directory is returned; if there is only one resource, its concrete path is returned

"""
cache_dir.mkdir(parents=True, exist_ok=True)
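
These tables map short model names to downloadable archives. A hedged sketch of how a name is resolved to a local path (it assumes network access to the fastNLP file server behind ``_get_base_url``)::

    from fastNLP.io.file_utils import PRETRAINED_BERT_MODEL_DIR, _get_base_url, cached_path

    model_name = PRETRAINED_BERT_MODEL_DIR['en-base-uncased']  # 'bert-base-uncased-3413b23c.zip'
    model_url = _get_base_url('bert') + model_name
    model_dir = cached_path(model_url)  # downloads on first use, afterwards returns the cached path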


+ 15
- 5
fastNLP/modules/decoder/crf.py View File

@@ -9,7 +9,7 @@ from torch import nn
from ..utils import initial_parameter


def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
def allowed_transitions(id2target, encoding_type='bio', include_start_end=False):
"""
别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.crf.allowed_transitions`

@@ -17,7 +17,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):

:param dict id2target: keys are label indices, values are tags or tag-labels of type str. A value may be a plain tag such as "B" or "M", or
a tag-label such as "B-NN" or "M-NN"; tag and label must be separated by "-". id2label can usually be obtained from Vocabulary.idx2word.
:param str encoding_type: supports "bio", "bmes", "bmeso".
:param str encoding_type: supports "bio", "bmes", "bmeso", "bioes"
:param bool include_start_end: whether to include transitions to and from the start and end states. For example, in bio, b/o may start a sequence but i may not;
if True, the result includes (start_idx, b_idx) and (start_idx, o_idx) but not (start_idx, i_idx);
start_idx=len(id2label), end_idx=len(id2label)+1. If False, the result contains nothing related to start and end
@@ -58,7 +58,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=True):
def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label):
"""

:param str encoding_type: supports "BIO", "BMES", "BEMSO".
:param str encoding_type: supports "BIO", "BMES", "BEMSO", 'bioes'
:param str from_tag: a tag such as "B" or "M"; also covers the two special tags start and end
:param str from_label: a label such as "PER" or "LOC"
:param str to_tag: a tag such as "B" or "M"; also covers the two special tags start and end
@@ -134,9 +134,19 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label
return to_tag in ['b', 's', 'end', 'o']
else:
raise ValueError("Unexpect tag type {}. Expect only 'B', 'M', 'E', 'S', 'O'.".format(from_tag))
elif encoding_type == 'bioes':
if from_tag == 'start':
return to_tag in ['b', 's', 'o']
elif from_tag == 'b':
return to_tag in ['i', 'e'] and from_label == to_label
elif from_tag == 'i':
return to_tag in ['i', 'e'] and from_label == to_label
elif from_tag in ['e', 's', 'o']:
return to_tag in ['b', 's', 'end', 'o']
else:
raise ValueError("Unexpect tag type {}. Expect only 'B', 'I', 'E', 'S', 'O'.".format(from_tag))
else:
raise ValueError("Only support BIO, BMES, BMESO encoding type, got {}.".format(encoding_type))
raise ValueError("Only support BIO, BMES, BMESO, BIOES encoding type, got {}.".format(encoding_type))


class ConditionalRandomField(nn.Module):
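
A small illustration of the new 'bioes' support (the label set below is invented): ``allowed_transitions`` turns an id-to-tag mapping into a list of permitted transition index pairs that a CRF can consume::

    from fastNLP.modules.decoder.crf import allowed_transitions

    id2target = {0: 'B-PER', 1: 'I-PER', 2: 'E-PER', 3: 'S-LOC', 4: 'O'}
    trans = allowed_transitions(id2target, encoding_type='bioes', include_start_end=True)
    # a list of allowed (from_index, to_index) pairs, e.g. (0, 1) for B-PER -> I-PER;
    # O -> I-PER is absent because an inside tag may not directly follow O
    print(trans)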


+ 10
- 2
fastNLP/modules/encoder/__init__.py View File

@@ -7,6 +7,12 @@ __all__ = [
"ConvMaxpool",
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
"LSTM",
@@ -18,10 +24,12 @@ __all__ = [
"VarLSTM",
"VarGRU"
]
from .bert import BertModel
from ._bert import BertModel
from .bert import BertWordPieceEncoder
from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder
from .conv_maxpool import ConvMaxpool
from .embedding import Embedding
from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \
StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding
from .lstm import LSTM
from .star_transformer import StarTransformer
from .transformer import TransformerEncoder


+ 455
- 119
fastNLP/modules/encoder/_bert.py View File

@@ -6,18 +6,395 @@
"""


import torch
from torch import nn

from ... import Vocabulary
from ...core.vocabulary import Vocabulary
import collections

import os
import unicodedata
from ...io.file_utils import _get_base_url, cached_path
from .bert import BertModel
import numpy as np
from itertools import chain
import copy
import json
import math
import os

import torch
from torch import nn
import glob

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'


def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps

def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)

self.dropout = nn.Dropout(attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer


class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output


class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)

def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])

def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers


class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertModel(nn.Module):
"""BERT(Bidirectional Embedding Representations from Transformers).

If you want to use pretrained weights, download them from the following URLs.
sources::

'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",


Build a BERT model from pretrained weights::

model = BertModel.from_pretrained("path/to/weights/directory")

Build a BERT model with randomly initialized weights::

model = BertModel()

:param int vocab_size: vocabulary size, default 30522, which is the vocabulary size of the English uncased BERT
:param int hidden_size: hidden size, default 768 (the BERT base configuration)
:param int num_hidden_layers: number of hidden layers, default 12 (BERT base)
:param int num_attention_heads: number of attention heads, default 12 (BERT base)
:param int intermediate_size: hidden size of the FFN, default 3072 (BERT base)
:param str hidden_act: activation function of the FFN, default ``gelu``
:param float hidden_dropout_prob: dropout of the FFN hidden layer, default 0.1
:param float attention_probs_dropout_prob: dropout of the attention layer, default 0.1
:param int max_position_embeddings: maximum sequence length, default 512
:param int type_vocab_size: maximum number of segments, default 2
:param float initializer_range: standard deviation used to initialize the weights, default 0.02
"""

def __init__(self, vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
super(BertModel, self).__init__()
self.hidden_size = hidden_size
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range

self.apply(self.init_bert_weights)

def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
files = glob.glob(os.path.join(pretrained_model_dir, '*.bin'))
if len(files)==0:
raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}")
elif len(files)>1:
raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}")
weights_path = files[0]
state_dict = torch.load(weights_path)

old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
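
A minimal forward pass through this ``BertModel`` (the model directory and the tensor contents are placeholders; per ``from_pretrained`` above, the directory is expected to contain ``bert_config.json`` and exactly one ``*.bin`` weight file)::

    import torch
    from fastNLP.modules.encoder import BertModel

    model = BertModel.from_pretrained('/path/to/bert-base-uncased')  # placeholder directory
    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
    attention_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
    sequence_output, pooled_output = model(input_ids, attention_mask=attention_mask,
                                           output_all_encoded_layers=False)
    # sequence_output: 2 x 3 x 768, pooled_output: 2 x 768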


def whitespace_tokenize(text):
"""Runs basic whitespace cleaning and splitting on a piece of text."""
@@ -290,6 +667,16 @@ class BertTokenizer(object):
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
self.max_len = max_len if max_len is not None else int(1e12)

def _reinit_on_new_vocab(self, vocab):
"""
After the BERT weights are loaded, the vocab may be re-ordered. After re-ordering, call this function to re-initialize the attributes that depend on the vocab.

:param vocab:
:return:
"""
self.vocab = vocab
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

def tokenize(self, text):
split_tokens = []
if self.do_basic_tokenize:
@@ -325,6 +712,8 @@ class BertTokenizer(object):
index = 0
if os.path.isdir(vocab_path):
vocab_file = os.path.join(vocab_path, VOCAB_NAME)
else:
vocab_file = vocab_path
with open(vocab_file, "w", encoding="utf-8") as writer:
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
if index != token_index:
@@ -370,11 +759,44 @@ class _WordBertModel(nn.Module):

assert pool_method in ('avg', 'max', 'first', 'last')
self.pool_method = pool_method

self.include_cls_sep = include_cls_sep

# compute the word pieces of every word in the vocab; [CLS] and [SEP] need extra handling
print("Start to generate word pieces for words.")
# step 1: collect the required word pieces, then build the new embed and word_piece_vocab and fill in the values
word_piece_dict = {'[CLS]':1, '[SEP]':1} # word pieces that are used, plus newly added ones
found_count = 0
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
if len(word_pieces)==1:
if not vocab._is_word_no_create_entry(word): # the word appears in train but was not found
if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]': # this word is not in the original word-piece vocab
word_piece_dict[word] = 1 # add a new entry
continue
for word_piece in word_pieces:
word_piece_dict[word_piece] = 1
found_count += 1
original_embed = self.encoder.embeddings.word_embeddings.weight.data
# special tokens need special handling
embed = nn.Embedding(len(word_piece_dict), original_embed.size(1)) # the new embed
new_word_piece_vocab = collections.OrderedDict()
for index, token in enumerate(['[PAD]', '[UNK]']):
word_piece_dict.pop(token, None)
embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]]
new_word_piece_vocab[token] = index
for token in word_piece_dict.keys():
if token in self.tokenzier.vocab:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]]
else:
embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']]
new_word_piece_vocab[token] = len(new_word_piece_vocab)
self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
self.encoder.embeddings.word_embeddings = embed

word_to_wordpieces = []
word_pieces_lengths = []
for word, index in vocab:
@@ -386,12 +808,11 @@ class _WordBertModel(nn.Module):
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
word_to_wordpieces.append(word_pieces)
word_pieces_lengths.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab)))
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces
self.word_to_wordpieces = np.array(word_to_wordpieces)
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
print("Successfully generated word pieces.")
@@ -410,7 +831,8 @@ class _WordBertModel(nn.Module):
# +2 because [CLS] and [SEP] need to be added
word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index)
word_pieces[:, 0].fill_(self._cls_index)
word_pieces[:, word_pieces_lengths+1] = self._sep_index
batch_indexes = torch.arange(batch_size).to(words)
word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
attn_masks = torch.zeros_like(word_pieces)
# 1. get the word_piece ids for words and the corresponding span ranges
word_indexes = words.tolist()
@@ -458,8 +880,8 @@ class _WordBertModel(nn.Module):
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
if self.include_cls_sep:
outputs[:, :, 0] = output_layer[:, 0]
outputs[:, :, seq_len+s_shift] = output_layer[:, seq_len+s_shift]
outputs[l_index, :, 0] = output_layer[:, 0]
outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift]
# 3. the final embedding result
return outputs

@@ -469,7 +891,7 @@ class _WordPieceBertModel(nn.Module):
This module computes results directly at the word_piece level.

"""
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'):
def __init__(self, model_dir:str, layers:str='-1'):
super().__init__()

self.tokenzier = BertTokenizer.from_pretrained(model_dir)
@@ -485,44 +907,34 @@ class _WordPieceBertModel(nn.Module):
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
f"a bert model with {encoder_layer_number} layers."

# compute the word pieces of every word in the vocab; [CLS] and [SEP] need extra handling
print("Start to generate word pieces for words.")
self.word_to_wordpieces = []
self.word_pieces_length = []
for word, index in vocab:
if index == vocab.padding_idx: # pad is a special symbol
word = '[PAD]'
elif index == vocab.unknown_idx:
word = '[UNK]'
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
self.word_to_wordpieces.append(word_pieces)
self.word_pieces_length.append(len(word_pieces))
self._cls_index = len(vocab)
self._sep_index = len(vocab) + 1
self._pad_index = vocab.padding_idx
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # needed when generating word_pieces
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
self.word_to_wordpieces = np.array(self.word_to_wordpieces, dtype=int)
print("Successfully generated word pieces.")
self._cls_index = self.tokenzier.vocab['[CLS]']
self._sep_index = self.tokenzier.vocab['[SEP]']
self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # needed when generating word_pieces

def index_dataset(self, *datasets):
def index_dataset(self, *datasets, field_name):
"""
Use BERT's tokenizer to add the word_pieces and word_pieces_seq_len columns to the datasets and set them as input. The added word_pieces
already contain [CLS] and [SEP], and the pad value of the word_pieces column is set to BERT's pad value.
Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the first and last tokens are not
[CLS] and [SEP], they are added at the beginning and the end, and the pad value of the word_pieces column is set to BERT's pad value.

:param datasets: DataSet objects
:param field_name: which column to index on
:return:
"""
def convert_words_to_word_pieces(words):
word_pieces = list(chain(*self.word_to_wordpieces[words].tolist()))
word_pieces = [self._cls_index] + word_pieces + [self._sep_index]
word_pieces = []
for word in words:
tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word)
word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens)
word_pieces.extend(word_piece_ids)
if word_pieces[0]!=self._cls_index:
word_pieces.insert(0, self._cls_index)
if word_pieces[-1]!=self._sep_index:
word_pieces.insert(-1, self._sep_index)
return word_pieces

for index, dataset in enumerate(datasets):
try:
dataset.apply_field(convert_words_to_word_pieces, field_name='words', new_field_name='word_pieces',
dataset.apply_field(convert_words_to_word_pieces, field_name=field_name, new_field_name='word_pieces',
is_input=True)
dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
except Exception as e:
@@ -538,7 +950,7 @@ class _WordPieceBertModel(nn.Module):
"""
batch_size, max_len = word_pieces.size()

attn_masks = word_pieces.ne(self._pad_index)
attn_masks = word_pieces.ne(self._wordpiece_pad_index)
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
output_all_encoded_layers=True)
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size
@@ -547,79 +959,3 @@ class _WordPieceBertModel(nn.Module):
outputs[l_index] = bert_outputs[l]
return outputs

class BertWordPieceEncoder(nn.Module):
"""
A BERT encoder that works with a given vocabulary. Pass in the vocab, then call the index_datasets method to generate word-piece representations for the vocabulary.

:param vocab: Vocabulary.
:param model_dir_or_name:
:param layers:
:param requires_grad:
"""
def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1',
requires_grad:bool=False):
super().__init__()
PRETRAIN_URL = _get_base_url('bert')
# TODO: change this
PRETRAINED_BERT_MODEL_DIR = {'en-base': 'bert_en-80f95ea7.tar.gz',
'cn': 'elmo_cn.zip'}

if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# check whether it exists locally
elif os.path.isdir(model_dir_or_name):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers)
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of the Embedding may be optimized. True: all parameters are optimized; False: no parameters are optimized; None: some are and some are not
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
if len(requires_grads)==1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value

@property
def embed_size(self):
return self._embed_size

def index_datasets(self, *datasets):
"""
Index the datasets into word pieces.

Example::

:param datasets:
:return:
"""
self.model.index_dataset(*datasets)

def forward(self, words, token_type_ids=None):
"""
Compute the BERT embedding of words. Before computation, [CLS] is added at the beginning and [SEP] at the end of each sentence, and include_cls_sep decides whether
these two representations are removed afterwards.

:param words: batch_size x max_len
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self.model(words, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)

return outputs

+ 16
- 2
fastNLP/modules/encoder/_elmo.py View File

@@ -16,6 +16,7 @@ import json

from ..utils import get_dropout_mask
import codecs
from torch import autograd

class LstmCellWithProjection(torch.nn.Module):
"""
@@ -429,6 +430,8 @@ class LstmTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -486,6 +489,8 @@ class ConvTokenEmbedder(nn.Module):
def forward(self, words, chars):
embs = []
if self.word_emb_layer is not None:
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
word_emb = self.word_emb_layer(words)
embs.append(word_emb)

@@ -703,7 +708,12 @@ class _ElmoModel(nn.Module):
self.token_embedder = LstmTokenEmbedder(
config, word_emb_layer, char_emb_layer)
self.token_embedder.load_state_dict(token_embedder_states, strict=False)

if config['token_embedder']['word_dim'] > 0 and vocab._no_create_word_length > 0: # a mapping is needed so that indices that only appear in dev/test point to unk
words_to_words = nn.Parameter(torch.arange(len(vocab)+2).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word):
words_to_words[idx] = vocab.unknown_idx
setattr(self.token_embedder, 'words_to_words', words_to_words)
self.output_dim = config['encoder']['projection_dim']

if config['encoder']['name'].lower() == 'elmo':
@@ -760,7 +770,11 @@ class _ElmoModel(nn.Module):
token_embedding = self.token_embedder(expanded_words, chars)
if self.config['encoder']['name'] == 'elmo':
encoder_output = self.encoder(token_embedding, seq_len)
sz = encoder_output.size()
if encoder_output.size(2) < max_len+2:
dummy_tensor = encoder_output.new_zeros(encoder_output.size(0), batch_size,
max_len + 2 - encoder_output.size(2), encoder_output.size(-1))
encoder_output = torch.cat([encoder_output, dummy_tensor], 2)
sz = encoder_output.size() # 2, batch_size, max_len, hidden_size
token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
encoder_output = torch.cat([token_embedding, encoder_output], dim=0)
elif self.config['encoder']['name'] == 'lstm':


+ 86
- 372
fastNLP/modules/encoder/bert.py View File

@@ -1,378 +1,92 @@
"""
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.

"""
import copy
import json
import math
import os

import torch
from torch import nn
import torch
from ...io.file_utils import _get_base_url, cached_path
from ._bert import _WordPieceBertModel, BertModel

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'


def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps

def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)

self.dropout = nn.Dropout(attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask is (precomputed for all layers in BertModel forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer


class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output


class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)

def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])

def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers


class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertModel(nn.Module):
"""BERT(Bidirectional Embedding Representations from Transformers).

If you want to use pretrained weights, download them from the following URLs.
sources::

'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",


Build a BERT model from pretrained weights::

model = BertModel.from_pretrained("path/to/weights/directory")

Build a BERT model with randomly initialized weights::

model = BertModel()

:param int vocab_size: vocabulary size, default 30522, which is the vocabulary size of the English uncased BERT
:param int hidden_size: hidden size, default 768 (the BERT base configuration)
:param int num_hidden_layers: number of hidden layers, default 12 (BERT base)
:param int num_attention_heads: number of attention heads, default 12 (BERT base)
:param int intermediate_size: hidden size of the FFN, default 3072 (BERT base)
:param str hidden_act: activation function of the FFN, default ``gelu``
:param float hidden_dropout_prob: dropout of the FFN hidden layer, default 0.1
:param float attention_probs_dropout_prob: dropout of the attention layer, default 0.1
:param int max_position_embeddings: maximum sequence length, default 512
:param int type_vocab_size: maximum number of segments, default 2
:param float initializer_range: standard deviation used to initialize the weights, default 0.02
class BertWordPieceEncoder(nn.Module):
"""
Load a BERT model; after loading, call the index_datasets method to generate the word_pieces column in the dataset.

def __init__(self, vocab_size=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02):
super(BertModel, self).__init__()
self.hidden_size = hidden_size
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range

self.apply(self.init_bert_weights)

def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS)
state_dict = torch.load(weights_path)

old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
:param fastNLP.Vocabulary vocab: the vocabulary
:param str model_dir_or_name: directory containing the model, or the model name. Default is ``en-base-uncased``
:param str layers: which layers form the final representation. Layer indices separated by ',', negative indices count from the end
:param bool requires_grad: whether gradients are required.
"""
def __init__(self, model_dir_or_name:str='en-base-uncased', layers:str='-1',
requires_grad:bool=False):
super().__init__()
PRETRAIN_URL = _get_base_url('bert')
PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# check whether it exists locally
elif os.path.isdir(model_dir_or_name):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers)
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of the Embedding may be optimized. True: all parameters are optimized; False: no parameters are optimized; None: some are and some are not
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
if len(requires_grads)==1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value

@property
def embed_size(self):
return self._embed_size

def index_datasets(self, *datasets, field_name):
"""
Use BERT's tokenizer to generate a new word_pieces column, add it to the datasets and set it as input. If the first and last tokens are not
[CLS] and [SEP], they are added at the beginning and the end, and the pad value of the word_pieces column is set to BERT's pad value.

:param datasets: DataSet objects
:param field_name: which column is used to generate the word_pieces column. Each entry in that column should be a List[str].
:return:
"""
self.model.index_dataset(*datasets, field_name=field_name)

def forward(self, word_pieces, token_type_ids=None):
"""
Compute the BERT embedding of words. The words passed in should already contain the [CLS] and [SEP] tags.

:param words: batch_size x max_len
:param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self.model(word_pieces, token_type_ids)
outputs = torch.cat([*outputs], dim=-1)

return outputs
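
Putting the refactored encoder together, a hedged usage sketch (the toy DataSet and its 'words' column are invented, and constructing the encoder by name assumes the pretrained archive can be downloaded)::

    from fastNLP import DataSet
    from fastNLP.modules.encoder.bert import BertWordPieceEncoder

    ds = DataSet({'words': [['this', 'is', 'fine'], ['another', 'sentence', 'here']]})
    encoder = BertWordPieceEncoder(model_dir_or_name='en-base-uncased', layers='-1')
    encoder.index_datasets(ds, field_name='words')  # adds an input field 'word_pieces' with [CLS]/[SEP]
    # inside a model's forward pass:
    # reps = encoder(word_pieces)  # batch_size x max_len x (768 * number of selected layers)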

+ 205
- 86
fastNLP/modules/encoder/embedding.py View File

@@ -1,13 +1,19 @@
__all__ = [
"Embedding"
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
]
import torch.nn as nn
from ..utils import get_embeddings
from .lstm import LSTM
from ... import Vocabulary
from ...core.vocabulary import Vocabulary
from abc import abstractmethod
import torch
from ...io import EmbedLoader
import numpy as np
import torch.nn.functional as F
import os
from ._elmo import _ElmoModel
@@ -15,8 +21,12 @@ from ...io.file_utils import cached_path, _get_base_url
from ._bert import _WordBertModel
from typing import List

from ... import DataSet, Batch, SequentialSampler
import warnings
from ...core.dataset import DataSet
from ...core.batch import DataSetIter
from ...core.sampler import SequentialSampler
from ...core.utils import _move_model_to_device, _get_model_device
from ...io.file_utils import PRETRAINED_BERT_MODEL_DIR, PRETRAINED_ELMO_MODEL_DIR, PRETRAIN_STATIC_FILES


class Embedding(nn.Module):
@@ -25,13 +35,15 @@ class Embedding(nn.Module):

Embedding component. The vocabulary size is available as self.num_embeddings and the embedding dimension as self.embedding_dim"""
def __init__(self, init_embed, dropout=0.0):
def __init__(self, init_embed, dropout=0.0, dropout_word=0, unk_index=None):
"""

:param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: the size of the Embedding (pass a tuple(int, int),
where the first int is vocab_size and the second int is embed_dim); if a Tensor, Embedding or ndarray is given, it is used directly to initialize the Embedding;
a TokenEmbedding object may also be passed in
:param float dropout: dropout applied to the output of the Embedding.
:param float dropout_word: randomly replace words with the unk index at this rate, so that the unk token receives enough training
:param int unk_index: the index substituted when a word is dropped; not required if init_embed is a TokenEmbedding.
"""
super(Embedding, self).__init__()

@@ -40,17 +52,36 @@ class Embedding(nn.Module):
self.dropout = nn.Dropout(dropout)
if not isinstance(self.embed, TokenEmbedding):
self._embed_size = self.embed.weight.size(1)
if dropout_word>0 and not isinstance(unk_index, int):
raise ValueError("When drop word is set, you need to pass in the unk_index.")
else:
self._embed_size = self.embed.embed_size
unk_index = self.embed.get_word_vocab().unknown_idx
self.unk_index = unk_index
self.dropout_word = dropout_word

def forward(self, x):
"""
:param torch.LongTensor x: [batch, seq_len]
:return: torch.Tensor : [batch, seq_len, embed_dim]
"""
if self.dropout_word>0 and self.training:
mask = torch.ones_like(x).float() * self.dropout_word
mask = torch.bernoulli(mask).byte() # the larger dropout_word is, the more positions become 1
x = x.masked_fill(mask, self.unk_index)
x = self.embed(x)
return self.dropout(x)

@property
def num_embedding(self)->int:
if isinstance(self.embed, nn.Embedding):
return self.embed.weight.size(0)
else:
return self.embed.num_embedding

def __len__(self):
return len(self.embed)

@property
def embed_size(self) -> int:
return self._embed_size
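
The new word-level dropout can be exercised directly; a small sketch with arbitrary sizes (a 100-word vocabulary, 50-dimensional vectors, index 1 standing in for the unknown token)::

    import torch
    from fastNLP.modules.encoder.embedding import Embedding

    embed = Embedding((100, 50), dropout=0.1, dropout_word=0.05, unk_index=1)
    x = torch.randint(0, 100, (4, 7)).long()  # batch_size x seq_len
    out = embed(x)  # 4 x 7 x 50; in training mode roughly 5% of the ids are replaced by unk_index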
@@ -80,7 +111,7 @@ class Embedding(nn.Module):
@property
def size(self):
if isinstance(self.embed, TokenEmbedding):
return torch.Size(self.embed._word_vocab, self.embed.embed_size)
return self.embed.size
else:
return self.embed.weight.size()

@@ -109,14 +140,17 @@ class TokenEmbedding(nn.Module):
for param in self.parameters():
param.requires_grad = value

@abstractmethod
def get_original_vocab(self):
pass
def __len__(self):
return len(self._word_vocab)

@property
def embed_size(self) -> int:
return self._embed_size

@property
def num_embedding(self) -> int:
return len(self._word_vocab)

def get_word_vocab(self):
"""
Return the vocabulary of the embedding.
@@ -127,7 +161,7 @@ class TokenEmbedding(nn.Module):

@property
def size(self):
return torch.Size(self.embed._word_vocab, self._embed_size)
return torch.Size(self.num_embedding, self._embed_size)


class StaticEmbedding(TokenEmbedding):
@@ -138,51 +172,142 @@ class StaticEmbedding(TokenEmbedding):

Example::

>>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')


:param vocab: Vocabulary. If None, all embeddings are loaded.
:param model_dir_or_name: a pretrained static embedding can be specified in two ways: either pass the path of the embedding
file, or pass the embedding name. Currently supported names include {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
`en-word2vec-300` : GoogleNews-vectors-negative300}. In the second case the cache is checked first and the model is downloaded automatically if missing.
:param requires_grad: whether gradients are required

:param requires_grad: whether gradients are required. Default is True
:param init_method: how to initialize values that are not found. Any method from torch.nn.init.* can be used; it is called with a tensor object.
:param normalize: whether to normalize each vector so that its norm is 1.
"""
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None,
normalize=False):
super(StaticEmbedding, self).__init__(vocab)

# 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server,
PRETRAIN_URL = _get_base_url('static')
PRETRAIN_STATIC_FILES = {
'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
'en-fasttext': "cc.en.300.vec-d53187b2.gz",
'cn': "tencent_cn-dab24577.tar.gz",
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
}

# 得到cache_path
if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
PRETRAIN_URL = _get_base_url('static')
model_name = PRETRAIN_STATIC_FILES[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_path = cached_path(model_url)
# 检查是否存在
elif os.path.isfile(model_dir_or_name):
elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_path = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")

# 读取embedding
embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab)
embedding = torch.tensor(embedding)
embedding, hit_flags = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
normalize=normalize)
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
padding_idx=vocab.padding_idx,
max_norm=None, norm_type=2, scale_grad_by_freq=False,
sparse=False, _weight=embedding)
if vocab._no_create_word_length > 0: # 需要映射,使得来自于dev, test的idx指向unk
words_to_words = nn.Parameter(torch.arange(len(vocab)).long(), requires_grad=False)
for word, idx in vocab:
if vocab._is_word_no_create_entry(word) and not hit_flags[idx]:
words_to_words[idx] = vocab.unknown_idx
self.words_to_words = words_to_words
self._embed_size = self.embedding.weight.size(1)
self.requires_grad = requires_grad

@property
def requires_grad(self):
"""
Whether the parameters of this Embedding may be optimized. True: all parameters are optimized; False: no parameter is optimized; None: some are and some are not.
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
return None

@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_words' in name:
continue
param.requires_grad = value

def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
normalize=True, error='ignore', init_method=None):
"""
Extract from the pretrained vectors in embed_filepath the embeddings of the words in vocab. Whether embed_filepath is in
word2vec format (the first line has only two elements) or glove format is detected automatically.

:param str embed_filepath: path of the pretrained embedding file.
:param vocab: the vocabulary, of type :class:`~fastNLP.Vocabulary`; only embeddings of words that appear in vocab are read.
Words not found in the file are sampled from a normal distribution fitted to the found embeddings, so the whole Embedding shares one distribution.
:param dtype: dtype of the loaded embeddings
:param str padding: the padding token of the vocabulary
:param str unknown: the unknown token of the vocabulary
:param bool normalize: whether to normalize each vector to norm 1
:param str error: `ignore` or `strict`; with `ignore` errors are skipped, with `strict` they are raised.
Typical errors are empty lines in the file or vectors with inconsistent dimensions.
:param init_method: how to initialize words not found in the file. Any method from torch.nn.init.* can be used. Defaults to torch.nn.init.zeros_
:return torch.tensor: shape [len(vocab), dimension], where dimension is determined by the pretrained embedding.
"""
assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
if not os.path.exists(embed_filepath):
raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
with open(embed_filepath, 'r', encoding='utf-8') as f:
line = f.readline().strip()
parts = line.split()
start_idx = 0
if len(parts) == 2:
dim = int(parts[1])
start_idx += 1
else:
dim = len(parts) - 1
f.seek(0)
matrix = torch.zeros(len(vocab), dim)
if init_method is not None:
init_method(matrix)
hit_flags = np.zeros(len(vocab), dtype=bool)
for idx, line in enumerate(f, start_idx):
try:
parts = line.strip().split()
word = ''.join(parts[:-dim])
nums = parts[-dim:]
# 对齐unk与pad
if word == padding and vocab.padding is not None:
word = vocab.padding
elif word == unknown and vocab.unknown is not None:
word = vocab.unknown
if word in vocab:
index = vocab.to_index(word)
matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
hit_flags[index] = True
except Exception as e:
if error == 'ignore':
warnings.warn("Error occurred at line {}.".format(idx))
else:
print("Error occurred at line {}.".format(idx))
raise e
found_count = sum(hit_flags)
print("Found {} out of {} words in the pre-trained embedding.".format(found_count, len(vocab)))
if init_method is None:
if len(vocab)-found_count>0 and found_count>0: # 有的没找到
found_vecs = matrix[torch.LongTensor(hit_flags.astype(int)).byte()]
mean = found_vecs.mean(dim=0, keepdim=True)
std = found_vecs.std(dim=0, keepdim=True)
unfound_vec_num = np.sum(hit_flags==False)
unfound_vecs = torch.randn(unfound_vec_num, dim)*std + mean
matrix[torch.LongTensor(hit_flags.astype(int)).eq(0)] = unfound_vecs

if normalize:
matrix /= (torch.norm(matrix, dim=1, keepdim=True) + 1e-12)

return matrix, hit_flags
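A usage sketch for the loading logic above ('my_vectors.txt' is a hypothetical glove- or word2vec-format file): words missing from the file are either filled by init_method or sampled from the mean and std of the found vectors, and no-create-entry words that were not found are redirected to unk at lookup time through words_to_words.

import torch
from fastNLP import Vocabulary
from fastNLP.modules.encoder.embedding import StaticEmbedding

vocab = Vocabulary()
vocab.add_word_lst("the quick brown fox".split())
embed = StaticEmbedding(vocab, model_dir_or_name='my_vectors.txt',   # hypothetical local file
                        requires_grad=True, init_method=torch.nn.init.zeros_, normalize=True)
words = torch.LongTensor([[vocab.to_index(w) for w in "the brown fox".split()]])
vectors = embed(words)   # [1, 3, dim]; with normalize=True every row has (approximately) unit norm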

def forward(self, words):
"""
传入words的index
@@ -190,6 +315,8 @@ class StaticEmbedding(TokenEmbedding):
:param words: torch.LongTensor, [batch_size, max_len]
:return: torch.FloatTensor, [batch_size, max_len, embed_size]
"""
if hasattr(self, 'words_to_words'):
words = self.words_to_words[words]
return self.embedding(words)


@@ -201,11 +328,6 @@ class ContextualEmbedding(TokenEmbedding):
"""
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。

Example::

>>>


:param datasets: DataSet对象
:param batch_size: int, 生成cache的sentence表示时使用的batch的大小
:param device: 参考 :class::fastNLP.Trainer 的device
@@ -228,14 +350,14 @@ class ContextualEmbedding(TokenEmbedding):
with torch.no_grad():
for index, dataset in enumerate(datasets):
try:
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), prefetch=False)
batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
words = batch_x['words'].to(device)
words_list = words.tolist()
seq_len = words.ne(pad_index).sum(dim=-1)
max_len = words.size(1)
# 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。
seq_len_from_behind =(max_len - seq_len).tolist()
seq_len_from_behind = (max_len - seq_len).tolist()
word_embeds = self(words).detach().cpu().numpy()
for b in range(words.size(0)):
length = seq_len_from_behind[b]
@@ -297,8 +419,7 @@ class ElmoEmbedding(ContextualEmbedding):

Example::

>>>
>>>
>>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True)

:param vocab: 词表
:param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称,
@@ -319,17 +440,13 @@ class ElmoEmbedding(ContextualEmbedding):
self.layers = layers

# 根据model_dir_or_name检查是否存在并下载
PRETRAIN_URL = _get_base_url('elmo')
# TODO 把baidu云上的加上去
PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz',
'cn': 'elmo_cn-5e9b34e2.tar.gz'}

if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
PRETRAIN_URL = _get_base_url('elmo')
model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(model_dir_or_name):
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -368,7 +485,7 @@ class ElmoEmbedding(ContextualEmbedding):
:return:
"""
requires_grads = set([param.requires_grad for name, param in self.named_parameters()
if 'words_to_chars_embedding' not in name])
if 'words_to_chars_embedding' not in name and 'words_to_words' not in name])
if len(requires_grads) == 1:
return requires_grads.pop()
else:
@@ -377,8 +494,8 @@ class ElmoEmbedding(ContextualEmbedding):
@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name: # 这个不能加入到requires_grad中
pass
if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中
continue
param.requires_grad = value


@@ -386,18 +503,19 @@ class BertEmbedding(ContextualEmbedding):
"""
别名::class:`fastNLP.modules.BertEmbedding` :class:`fastNLP.modules.encoder.embedding.BertEmbedding`

An Embedding that encodes words with BERT.
An Embedding that encodes words with BERT. It is recommended to limit the input word sequence to 450 rather than 512, because
the pretrained BERT model is limited to 512 tokens, and since the input words have not yet been split into word pieces, the length may exceed that limit after word-piece tokenization.

Example::

>>>
>>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1')


:param fastNLP.Vocabulary vocab: 词表
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased``
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``.
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层
:param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces
中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``.
中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。
:param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样
会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。
:param bool requires_grad: 是否需要gradient。
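A small standalone illustration of the pool_method options described above (this is not the library's internal code, just a sketch of the idea): given the word-piece vectors of a single word, 'first' and 'last' pick one piece while 'avg' and 'max' reduce over all of them.

import torch

def pool_word_pieces(piece_vectors, pool_method='first'):
    # piece_vectors: [num_pieces, hidden] for one word
    if pool_method == 'first':
        return piece_vectors[0]
    elif pool_method == 'last':
        return piece_vectors[-1]
    elif pool_method == 'avg':
        return piece_vectors.mean(dim=0)
    elif pool_method == 'max':
        return piece_vectors.max(dim=0)[0]
    raise ValueError(pool_method)

pieces = torch.randn(3, 768)                 # e.g. one word split into 3 word pieces
word_vec = pool_word_pieces(pieces, 'avg')   # [768]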
@@ -405,28 +523,15 @@ class BertEmbedding(ContextualEmbedding):
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False):
super(BertEmbedding, self).__init__(vocab)
# 根据model_dir_or_name检查是否存在并下载
PRETRAIN_URL = _get_base_url('bert')
PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
'en-base-cased': 'bert-base-cased-f89bfe08.zip',
'en-large-uncased': 'bert-large-uncased-20939f45.zip',
'en-large-cased': 'bert-large-cased-e0cf90fc.zip',

'cn': 'bert-base-chinese-29d0a84a.zip',
'cn-base': 'bert-base-chinese-29d0a84a.zip',

'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
}

# 根据model_dir_or_name检查是否存在并下载
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
PRETRAIN_URL = _get_base_url('bert')
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(model_dir_or_name):
elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))):
model_dir = model_dir_or_name
else:
raise ValueError(f"Cannot recognize {model_dir_or_name}.")
@@ -445,7 +550,7 @@ class BertEmbedding(ContextualEmbedding):
计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要
删除这两个token的表示。

:param words: batch_size x max_len
:param torch.LongTensor words: [batch_size, max_len]
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
"""
outputs = self._get_sent_reprs(words)
@@ -473,7 +578,7 @@ class BertEmbedding(ContextualEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'word_pieces_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value


@@ -487,7 +592,8 @@ def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
"""
char_vocab = Vocabulary(min_freq=min_freq)
for word, index in vocab:
char_vocab.add_word_lst(list(word))
if not vocab._is_word_no_create_entry(word):
char_vocab.add_word_lst(list(word))
return char_vocab


@@ -495,24 +601,25 @@ class CNNCharEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.modules.CNNCharEmbedding` :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding`

Generates character embeddings with a CNN. The pipeline is CNN(x) -> activation(x) -> pool -> fc. The outputs of filters
with different kernel sizes are concatenated.
Generates character embeddings with a CNN. The pipeline is embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool
-> fc. The outputs of filters with different kernel sizes are concatenated.

Example::

>>>
>>> cnn_char_embed = CNNCharEmbedding(vocab)


:param vocab: 词表
:param embed_size: 该word embedding的大小,默认值为50.
:param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50.
:param dropout: 以多大的概率drop
:param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20].
:param kernels: kernel的大小. 默认值为[5, 3, 1].
:param kernel_sizes: kernel的大小. 默认值为[5, 3, 1].
:param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'.
:param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数.
:param min_char_freq: character的最少出现次数。默认值为2.
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50,
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5,
filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max',
activation='relu', min_char_freq: int=2):
super(CNNCharEmbedding, self).__init__(vocab)
@@ -521,6 +628,7 @@ class CNNCharEmbedding(TokenEmbedding):
assert kernel % 2 == 1, "Only odd kernel is allowed."

assert pool_method in ('max', 'avg')
self.dropout = nn.Dropout(dropout, inplace=True)
self.pool_method = pool_method
# activation function
if isinstance(activation, str):
@@ -544,13 +652,13 @@ class CNNCharEmbedding(TokenEmbedding):
self.char_pad_index = self.char_vocab.padding_idx
print(f"In total, there are {len(self.char_vocab)} distinct characters.")
# 对vocab进行index
self.max_word_len = max(map(lambda x: len(x[0]), vocab))
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len),
max_word_len = max(map(lambda x: len(x[0]), vocab))
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len),
fill_value=self.char_pad_index, dtype=torch.long),
requires_grad=False)
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
for word, index in vocab:
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。 修改为不区分pad, 这样所有的<pad>也是同一个embed
# if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的<pad>也是同一个embed
self.words_to_chars_embedding[index, :len(word)] = \
torch.LongTensor([self.char_vocab.to_index(c) for c in word])
self.word_lengths[index] = len(word)
@@ -561,6 +669,7 @@ class CNNCharEmbedding(TokenEmbedding):
for i in range(len(kernel_sizes))])
self._embed_size = embed_size
self.fc = nn.Linear(sum(filter_nums), embed_size)
self.init_param()

def forward(self, words):
"""
@@ -577,7 +686,7 @@ class CNNCharEmbedding(TokenEmbedding):
# 为1的地方为mask
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
self.dropout(chars)
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
@@ -613,30 +722,39 @@ class CNNCharEmbedding(TokenEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value

def init_param(self):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset
continue
if param.data.dim()>1:
nn.init.xavier_uniform_(param, 1)
else:
nn.init.uniform_(param, -1, 1)

class LSTMCharEmbedding(TokenEmbedding):
"""
别名::class:`fastNLP.modules.LSTMCharEmbedding` :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding`

Encodes characters with an LSTM.
Encodes characters with an LSTM: embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool

Example::

>>>
>>> lstm_char_embed = LSTMCharEmbedding(vocab)

:param vocab: 词表
:param embed_size: embedding的大小。默认值为50.
:param char_emb_size: character的embedding的大小。默认值为50.
:param dropout: 以多大概率drop
:param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50.
:param pool_method: 支持'max', 'avg'
:param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数.
:param min_char_freq: character的最小出现次数。默认值为2.
:param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。
"""
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50,
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, dropout:float=0.5, hidden_size=50,
pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True):
super(LSTMCharEmbedding, self).__init__(vocab)

@@ -644,7 +762,7 @@ class LSTMCharEmbedding(TokenEmbedding):

assert pool_method in ('max', 'avg')
self.pool_method = pool_method
self.dropout = nn.Dropout(dropout, inplace=True)
# activation function
if isinstance(activation, str):
if activation.lower() == 'relu':
@@ -701,7 +819,7 @@ class LSTMCharEmbedding(TokenEmbedding):
# 为mask的地方为1
chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size
chars = self.dropout(chars)
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
@@ -739,7 +857,7 @@ class LSTMCharEmbedding(TokenEmbedding):
def requires_grad(self, value):
for name, param in self.named_parameters():
if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中
pass
continue
param.requires_grad = value


@@ -751,7 +869,8 @@ class StackEmbedding(TokenEmbedding):

Example::

>>>
>>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
>>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)


:param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致


+ 21
- 7
fastNLP/modules/encoder/lstm.py View File

@@ -11,13 +11,15 @@ import torch.nn as nn
import torch.nn.utils.rnn as rnn

from ..utils import initial_parameter
from torch import autograd


class LSTM(nn.Module):
"""
别名::class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM`

LSTM module, a light wrapper around the PyTorch LSTM
LSTM module, a light wrapper around the PyTorch LSTM. When seq_len is provided, pack_padded_sequence is used automatically; the forget-gate bias
is initialized to 1 by default; and it avoids the issues of using an LSTM inside DataParallel.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度.
@@ -30,23 +32,35 @@ class LSTM(nn.Module):
"""
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None):
bidirectional=False, bias=True):
super(LSTM, self).__init__()
self.batch_first = batch_first
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
dropout=dropout, bidirectional=bidirectional)
initial_parameter(self, initial_method)
self.init_param()

def init_param(self):
for name, param in self.named_parameters():
if 'bias' in name:
# based on https://github.com/pytorch/pytorch/issues/750#issuecomment-280671871
param.data.fill_(0)
n = param.size(0)
start, end = n // 4, n // 2
param.data[start:end].fill_(1)
else:
nn.init.xavier_uniform_(param)
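The slice used above relies on how PyTorch lays out LSTM gate parameters: every bias vector is the concatenation [b_i | b_f | b_g | b_o], each block of length hidden_size, so indices n//4 to n//2 cover the forget gate. A quick check of that assumption:

import torch.nn as nn

lstm = nn.LSTM(input_size=8, hidden_size=16)
n = lstm.bias_ih_l0.size(0)                  # 4 * hidden_size = 64
forget_bias = lstm.bias_ih_l0[n // 4: n // 2]
print(forget_bias.shape)                     # torch.Size([16]) -> the forget-gate block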

def forward(self, x, seq_len=None, h0=None, c0=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全1向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全1向量. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列
和 [batch, hidden_size*num_direction] 最后时刻隐状态.
"""
batch_size, max_len, _ = x.size()
if h0 is not None and c0 is not None:
hx = (h0, c0)
else:
@@ -59,7 +73,7 @@ class LSTM(nn.Module):
x = x[:, sort_idx]
x = rnn.pack_padded_sequence(x, sort_lens, batch_first=self.batch_first)
output, hx = self.lstm(x, hx) # -> [N,L,C]
output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first)
output, _ = rnn.pad_packed_sequence(output, batch_first=self.batch_first, total_length=max_len)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
if self.batch_first:
output = output[unsort_idx]


+ 2
- 0
fastNLP/modules/utils.py View File

@@ -82,6 +82,8 @@ def get_embeddings(init_embed):
if isinstance(init_embed, tuple):
res = nn.Embedding(
num_embeddings=init_embed[0], embedding_dim=init_embed[1])
nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)),
b=np.sqrt(3/res.weight.data.size(1)))
elif isinstance(init_embed, nn.Module):
res = init_embed
elif isinstance(init_embed, torch.Tensor):
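A quick numeric sanity check of the bound chosen above (a sketch; the dimension is arbitrary): Uniform(-a, a) has variance a**2 / 3, so a = sqrt(3 / dim) gives a per-component variance of 1 / dim.

import math
import torch

dim = 300
a = math.sqrt(3 / dim)
w = torch.empty(100000, dim).uniform_(-a, a)
print(w.var().item(), 1 / dim)   # both roughly 0.0033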


+ 2
- 5
reproduction/Biaffine_parser/run.py View File

@@ -184,11 +184,8 @@ def train(path):
m.weight.requires_grad = True

# Trainer
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
**train_args.data,
optimizer=fastNLP.Adam(**optim_args.data),
save_path=path,
trainer = Trainer(train_data=train_data, model=model, optimizer=fastNLP.Adam(**optim_args.data), loss=ParserLoss(),
dev_data=dev_data, metrics=ParserMetric(), metric_key='UAS', save_path=path,
callbacks=[MyCallback()])

# Start training


+ 5
- 5
reproduction/POS_tagging/train_pos_tag.py View File

@@ -89,11 +89,11 @@ def train(train_data_path, dev_data_path, checkpoint=None, save=None):
model = torch.load(checkpoint)

# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dev_data, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save)
trainer = Trainer(dataset, model, loss=None, n_epochs=20, print_every=10, dev_data=dev_data,
metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"), metric_key="f", save_path=save,
use_tqdm=True)
trainer.train(load_best_model=True)

# save model & pipeline


+ 4
- 8
reproduction/Star_transformer/train.py View File

@@ -149,14 +149,10 @@ def train():
) if x.requires_grad and x.size(0) != len(word_v)]
optim_cfg = [{'params': model.enc.embedding.parameters(), 'lr': g_args.lr*0.1},
{'params': ex_param, 'lr': g_args.lr, 'weight_decay': g_args.w_decay}, ]
trainer = FN.Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=loss, metrics=metric, metric_key=metric_key,
optimizer=torch.optim.Adam(optim_cfg),
n_epochs=g_args.ep, batch_size=g_args.bsz, print_every=10, validate_every=3000,
device=device,
use_tqdm=False, prefetch=False,
save_path=g_args.log,
callbacks=[MyCallback()])
trainer = FN.Trainer(train_data=train_data, model=model, optimizer=torch.optim.Adam(optim_cfg), loss=loss,
batch_size=g_args.bsz, n_epochs=g_args.ep, print_every=10, dev_data=dev_data, metrics=metric,
metric_key=metric_key, validate_every=3000, save_path=g_args.log, use_tqdm=False,
device=device, callbacks=[MyCallback()])

trainer.train()
tester = FN.Tester(data=test_data, model=model, metrics=metric,


+ 326
- 0
reproduction/matching/data/MatchingDataLoader.py View File

@@ -0,0 +1,326 @@

import os

from typing import Union, Dict

from fastNLP.core.const import Const
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.io.base_loader import DataInfo
from fastNLP.io.dataset_loader import JsonLoader, DataSetLoader, CSVLoader
from fastNLP.io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from fastNLP.modules.encoder._bert import BertTokenizer


class MatchingLoader(DataSetLoader):
"""
别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader`

读取Matching任务的数据集
"""

def __init__(self, paths: dict=None):
"""
:param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名
"""
self.paths = paths

def _load(self, path):
"""
:param str path: 待读取数据集的路径名
:return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子
的原始字符串文本,第三个为标签
"""
raise NotImplementedError

def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None,
to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None,
cut_text: int = None, get_index=True, set_input: Union[list, str, bool]=True,
set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo:
"""
:param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹,
则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和
对应的全路径文件名。
:param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义
这个数据集的名字,如果不定义则默认为train。
:param bool to_lower: 是否将文本自动转为小写。默认值为False。
:param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` :
提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和
attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len
:param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径
:param int cut_text: 将长于cut_text的内容截掉。默认为None,即不截。
:param bool get_index: 是否需要根据词表将文本转为index
:param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False
则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input,
与此同时其他field不会被设置为input。默认值为True。
:param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。
:param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个<sep>。
如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果
传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]'].
:return:
"""
if isinstance(set_input, str):
set_input = [set_input]
if isinstance(set_target, str):
set_target = [set_target]
if isinstance(set_input, bool):
auto_set_input = set_input
else:
auto_set_input = False
if isinstance(set_target, bool):
auto_set_target = set_target
else:
auto_set_target = False
if isinstance(paths, str):
if os.path.isdir(paths):
path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()}
else:
path = {dataset_name if dataset_name is not None else 'train': paths}
else:
path = paths

data_info = DataInfo()
for data_name in path.keys():
data_info.datasets[data_name] = self._load(path[data_name])

for data_name, data_set in data_info.datasets.items():
if auto_set_input:
data_set.set_input(Const.INPUTS(0), Const.INPUTS(1))
if auto_set_target:
data_set.set_target(Const.TARGET)

if to_lower:
for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0),
is_input=auto_set_input)
data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1),
is_input=auto_set_input)

if bert_tokenizer is not None:
if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR:
PRETRAIN_URL = _get_base_url('bert')
model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer]
model_url = PRETRAIN_URL + model_name
model_dir = cached_path(model_url)
# 检查是否存在
elif os.path.isdir(bert_tokenizer):
model_dir = bert_tokenizer
else:
raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.")

tokenizer = BertTokenizer.from_pretrained(model_dir)

for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields,
is_input=auto_set_input)

if isinstance(concat, bool):
concat = 'default' if concat else None
if concat is not None:
if isinstance(concat, str):
CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'],
'default': ['', '<sep>', '', '']}
if concat.lower() in CONCAT_MAP:
concat = CONCAT_MAP[concat]
else:
concat = 4 * [concat]
assert len(concat) == 4, \
f'Please provide a list of 4 symbols: one before the first sentence, one after the first sentence, ' \
f'one before the second sentence, and one after the second sentence. ' \
f'Your input is {concat}'

for data_name, data_set in data_info.datasets.items():
data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] +
x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT)
data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT,
is_input=auto_set_input)

if seq_len_type is not None:
if seq_len_type == 'seq_len': #
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'mask':
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [1] * len(x[fields]),
new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN),
is_input=auto_set_input)
elif seq_len_type == 'bert':
for data_name, data_set in data_info.datasets.items():
if Const.INPUT not in data_set.get_field_names():
raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: '
f'got {data_set.get_field_names()}')
data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input)
data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]),
new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input)

if cut_text is not None:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')):
data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields,
is_input=auto_set_input)

data_set_list = [d for n, d in data_info.datasets.items()]
assert len(data_set_list) > 0, f'There are NO data sets in data info!'

if bert_tokenizer is not None:
words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]')
with open(os.path.join(model_dir, 'vocab.txt'), 'r', encoding='utf-8') as f:
lines = f.readlines()
lines = [line.strip() for line in lines]
words_vocab.add_word_lst(lines)
words_vocab.build_vocab()
else:
words_vocab = Vocabulary()
words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n],
field_name=[n for n in data_set_list[0].get_field_names()
if (Const.INPUT in n)],
no_create_entry_dataset=[d for n, d in data_info.datasets.items()
if 'train' not in n])
target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab = target_vocab.from_dataset(*data_set_list, field_name=Const.TARGET)
data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab}

if get_index:
for data_name, data_set in data_info.datasets.items():
for fields in data_set.get_field_names():
if Const.INPUT in fields:
data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields,
is_input=auto_set_input)

data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET,
is_input=auto_set_input, is_target=auto_set_target)

for data_name, data_set in data_info.datasets.items():
if isinstance(set_input, list):
data_set.set_input(*set_input)
if isinstance(set_target, list):
data_set.set_target(*set_target)

return data_info


class SNLILoader(MatchingLoader, JsonLoader):
"""
别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader`

读取SNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
"""

def __init__(self, paths: dict=None):
fields = {
'sentence1_binary_parse': Const.INPUTS(0),
'sentence2_binary_parse': Const.INPUTS(1),
'gold_label': Const.TARGET,
}
paths = paths if paths is not None else {
'train': 'snli_1.0_train.jsonl',
'dev': 'snli_1.0_dev.jsonl',
'test': 'snli_1.0_test.jsonl'}
MatchingLoader.__init__(self, paths=paths)
JsonLoader.__init__(self, fields=fields)

def _load(self, path):
ds = JsonLoader._load(self, path)

parentheses_table = str.maketrans({'(': None, ')': None})

ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(0))
ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(),
new_field_name=Const.INPUTS(1))
ds.drop(lambda x: x[Const.TARGET] == '-')
return ds
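For reference, a tiny standalone example of what the translate/strip/split step above does to an SNLI binary parse (the sentence is made up):

parentheses_table = str.maketrans({'(': None, ')': None})
parse = "( ( A dog ) ( is ( running outside ) ) )"
tokens = parse.translate(parentheses_table).strip().split()
print(tokens)   # ['A', 'dog', 'is', 'running', 'outside']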


class RTELoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader`

读取RTE数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv'  # test set has no labels
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'sentence1': Const.INPUTS(0),
'sentence2': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


class QNLILoader(MatchingLoader, CSVLoader):
"""
别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader`

读取QNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源:
"""

def __init__(self, paths: dict=None):
paths = paths if paths is not None else {
'train': 'train.tsv',
'dev': 'dev.tsv',
# 'test': 'test.tsv'  # test set has no labels
}
MatchingLoader.__init__(self, paths=paths)
self.fields = {
'question': Const.INPUTS(0),
'sentence': Const.INPUTS(1),
'label': Const.TARGET,
}
CSVLoader.__init__(self, sep='\t')

def _load(self, path):
ds = CSVLoader._load(self, path)

for k, v in self.fields.items():
ds.rename_field(k, v)
for fields in ds.get_all_fields():
if Const.INPUT in fields:
ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields)

return ds


+ 0
- 6
reproduction/matching/data/SNLIDataLoader.py View File

@@ -1,6 +0,0 @@

from fastNLP.io.dataset_loader import SNLILoader

# TODO: still in progress



+ 65
- 0
reproduction/matching/matching_esim.py View File

@@ -0,0 +1,65 @@

import argparse
import torch

from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding

from reproduction.matching.data.MatchingDataLoader import SNLILoader
from reproduction.matching.model.esim import ESIMModel

argument = argparse.ArgumentParser()
argument.add_argument('--embedding', choices=['glove', 'elmo'], default='glove')
argument.add_argument('--batch-size-per-gpu', type=int, default=128)
argument.add_argument('--n-epochs', type=int, default=100)
argument.add_argument('--lr', type=float, default=1e-4)
argument.add_argument('--seq-len-type', choices=['mask', 'seq_len'], default='seq_len')
argument.add_argument('--save-dir', type=str, default=None)
arg = argument.parse_args()

bert_dirs = 'path/to/bert/dir'

# load data set
data_info = SNLILoader().process(
paths='path/to/snli/data/dir', to_lower=True, seq_len_type=arg.seq_len_type, bert_tokenizer=None,
get_index=True, concat=False,
)

# load embedding
if arg.embedding == 'elmo':
embedding = ElmoEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
elif arg.embedding == 'glove':
embedding = StaticEmbedding(data_info.vocabs[Const.INPUT], requires_grad=True)
else:
raise ValueError('only ELMo or GloVe embeddings are supported for the ESIM model!')

# define model
model = ESIMModel(embedding)

# define trainer
trainer = Trainer(train_data=data_info.datasets['train'], model=model,
optimizer=Adam(lr=arg.lr, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
n_epochs=arg.n_epochs, print_every=-1,
dev_data=data_info.datasets['dev'],
metrics=AccuracyMetric(), metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1,
save_path=arg.save_dir)

# train model
trainer.train(load_best_model=True)

# define tester
tester = Tester(
data=data_info.datasets['test'],
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * arg.batch_size_per_gpu,
device=[i for i in range(torch.cuda.device_count())],
)

# test model
tester.test()



+ 197
- 0
reproduction/matching/model/esim.py View File

@@ -0,0 +1,197 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.nn import CrossEntropyLoss

from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.core.const import Const
from fastNLP.core.utils import seq_len_to_mask


class ESIMModel(BaseModel):
def __init__(self, init_embedding: TokenEmbedding, hidden_size=None, num_labels=3, dropout_rate=0.3,
dropout_embed=0.1):
super(ESIMModel, self).__init__()

self.embedding = init_embedding
self.dropout_embed = EmbedDropout(p=dropout_embed)
if hidden_size is None:
hidden_size = self.embedding.embed_size
self.rnn = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
# self.rnn = LSTM(self.embedding.embed_size, hidden_size, dropout=dropout_rate, bidirectional=True)

self.interfere = nn.Sequential(nn.Dropout(p=dropout_rate),
nn.Linear(8 * hidden_size, hidden_size),
nn.ReLU())
nn.init.xavier_uniform_(self.interfere[1].weight.data)
self.bi_attention = SoftmaxAttention()

self.rnn_high = BiRNN(self.embedding.embed_size, hidden_size, dropout_rate=dropout_rate)
# self.rnn_high = LSTM(hidden_size, hidden_size, dropout=dropout_rate, bidirectional=True,)

self.classifier = nn.Sequential(nn.Dropout(p=dropout_rate),
nn.Linear(8 * hidden_size, hidden_size),
nn.Tanh(),
nn.Dropout(p=dropout_rate),
nn.Linear(hidden_size, num_labels))

self.dropout_rnn = nn.Dropout(p=dropout_rate)

nn.init.xavier_uniform_(self.classifier[1].weight.data)
nn.init.xavier_uniform_(self.classifier[4].weight.data)

def forward(self, words1, words2, seq_len1, seq_len2, target=None):
"""
:param words1: [batch, seq_len]
:param words2: [batch, seq_len]
:param seq_len1: [batch]
:param seq_len2: [batch]
:param target:
:return:
"""
mask1 = seq_len_to_mask(seq_len1, words1.size(1))
mask2 = seq_len_to_mask(seq_len2, words2.size(1))
a0 = self.embedding(words1) # B * len * emb_dim
b0 = self.embedding(words2)
a0, b0 = self.dropout_embed(a0), self.dropout_embed(b0)
a = self.rnn(a0, mask1.byte()) # a: [B, PL, 2 * H]
b = self.rnn(b0, mask2.byte())
# a = self.dropout_rnn(self.rnn(a0, seq_len1)[0]) # a: [B, PL, 2 * H]
# b = self.dropout_rnn(self.rnn(b0, seq_len2)[0])

ai, bi = self.bi_attention(a, mask1, b, mask2)

a_ = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 8 * H]
b_ = torch.cat((b, bi, b - bi, b * bi), dim=2)
a_f = self.interfere(a_)
b_f = self.interfere(b_)

a_h = self.rnn_high(a_f, mask1.byte()) # ma: [B, PL, 2 * H]
b_h = self.rnn_high(b_f, mask2.byte())
# a_h = self.dropout_rnn(self.rnn_high(a_f, seq_len1)[0]) # ma: [B, PL, 2 * H]
# b_h = self.dropout_rnn(self.rnn_high(b_f, seq_len2)[0])

a_avg = self.mean_pooling(a_h, mask1, dim=1)
a_max, _ = self.max_pooling(a_h, mask1, dim=1)
b_avg = self.mean_pooling(b_h, mask2, dim=1)
b_max, _ = self.max_pooling(b_h, mask2, dim=1)

out = torch.cat((a_avg, a_max, b_avg, b_max), dim=1) # v: [B, 8 * H]
logits = torch.tanh(self.classifier(out))

if target is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits, target)

return {Const.LOSS: loss, Const.OUTPUT: logits}
else:
return {Const.OUTPUT: logits}

def predict(self, **kwargs):
return self.forward(**kwargs)

# input [batch_size, len , hidden]
# mask [batch_size, len] (111...00)
@staticmethod
def mean_pooling(input, mask, dim=1):
masks = mask.view(mask.size(0), mask.size(1), -1).float()
return torch.sum(input * masks, dim=dim) / torch.sum(masks, dim=1)

@staticmethod
def max_pooling(input, mask, dim=1):
my_inf = 10e12
masks = mask.view(mask.size(0), mask.size(1), -1)
masks = masks.expand(-1, -1, input.size(2)).float()
return torch.max(input + masks.le(0.5).float() * -my_inf, dim=dim)


class EmbedDropout(nn.Dropout):

def forward(self, sequences_batch):
ones = sequences_batch.data.new_ones(sequences_batch.shape[0], sequences_batch.shape[-1])
dropout_mask = nn.functional.dropout(ones, self.p, self.training, inplace=False)
return dropout_mask.unsqueeze(1) * sequences_batch


class BiRNN(nn.Module):
def __init__(self, input_size, hidden_size, dropout_rate=0.3):
super(BiRNN, self).__init__()
self.dropout_rate = dropout_rate
self.rnn = nn.LSTM(input_size, hidden_size,
num_layers=1,
bidirectional=True,
batch_first=True)

def forward(self, x, x_mask):
# Sort x
lengths = x_mask.data.eq(1).long().sum(1).squeeze()
_, idx_sort = torch.sort(lengths, dim=0, descending=True)
_, idx_unsort = torch.sort(idx_sort, dim=0)
lengths = list(lengths[idx_sort])

x = x.index_select(0, idx_sort)
# Pack it up
rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
# Apply dropout to input
if self.dropout_rate > 0:
dropout_input = F.dropout(rnn_input.data, p=self.dropout_rate, training=self.training)
rnn_input = nn.utils.rnn.PackedSequence(dropout_input, rnn_input.batch_sizes)
output = self.rnn(rnn_input)[0]
# Unpack everything
output = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)[0]
output = output.index_select(0, idx_unsort)
if output.size(1) != x_mask.size(1):
padding = torch.zeros(output.size(0),
x_mask.size(1) - output.size(1),
output.size(2)).type(output.data.type())
output = torch.cat([output, padding], 1)
return output


def masked_softmax(tensor, mask):
tensor_shape = tensor.size()
reshaped_tensor = tensor.view(-1, tensor_shape[-1])

# Reshape the mask so it matches the size of the input tensor.
while mask.dim() < tensor.dim():
mask = mask.unsqueeze(1)
mask = mask.expand_as(tensor).contiguous().float()
reshaped_mask = mask.view(-1, mask.size()[-1])
result = F.softmax(reshaped_tensor * reshaped_mask, dim=-1)
result = result * reshaped_mask
# 1e-13 is added to avoid divisions by zero.
result = result / (result.sum(dim=-1, keepdim=True) + 1e-13)
return result.view(*tensor_shape)


def weighted_sum(tensor, weights, mask):
w_sum = weights.bmm(tensor)
while mask.dim() < w_sum.dim():
mask = mask.unsqueeze(1)
mask = mask.transpose(-1, -2)
mask = mask.expand_as(w_sum).contiguous().float()
return w_sum * mask


class SoftmaxAttention(nn.Module):

def forward(self, premise_batch, premise_mask, hypothesis_batch, hypothesis_mask):
similarity_matrix = premise_batch.bmm(hypothesis_batch.transpose(2, 1)
.contiguous())

prem_hyp_attn = masked_softmax(similarity_matrix, hypothesis_mask)
hyp_prem_attn = masked_softmax(similarity_matrix.transpose(1, 2)
.contiguous(),
premise_mask)

attended_premises = weighted_sum(hypothesis_batch,
prem_hyp_attn,
premise_mask)
attended_hypotheses = weighted_sum(premise_batch,
hyp_prem_attn,
hypothesis_mask)

return attended_premises, attended_hypotheses

+ 0
- 97
reproduction/matching/snli.py View File

@@ -1,97 +0,0 @@
import os

import torch

from fastNLP.core import Vocabulary, DataSet, Trainer, Tester, Const, Adam, AccuracyMetric

from reproduction.matching.data.SNLIDataLoader import SNLILoader
from legacy.component.bert_tokenizer import BertTokenizer
from reproduction.matching.model.bert import BertForNLI


def preprocess_data(data: DataSet, bert_dir):
"""
preprocess data set to bert-need data set.
:param data:
:param bert_dir:
:return:
"""
tokenizer = BertTokenizer.from_pretrained(os.path.join(bert_dir, 'vocab.txt'))

vocab = Vocabulary(padding=None, unknown=None)
with open(os.path.join(bert_dir, 'vocab.txt')) as f:
lines = f.readlines()
vocab_list = []
for line in lines:
vocab_list.append(line.strip())
vocab.add_word_lst(vocab_list)
vocab.build_vocab()
vocab.padding = '[PAD]'
vocab.unknown = '[UNK]'

for i in range(2):
data.apply(lambda x: tokenizer.tokenize(" ".join(x[Const.INPUTS(i)])),
new_field_name=Const.INPUTS(i))
data.apply(lambda x: ['[CLS]'] + x[Const.INPUTS(0)] + ['[SEP]'] + x[Const.INPUTS(1)] + ['[SEP]'],
new_field_name=Const.INPUT)
data.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1),
new_field_name=Const.INPUT_LENS(0))
data.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), new_field_name=Const.INPUT_LENS(1))

max_len = 512
data.apply(lambda x: x[Const.INPUT][: max_len], new_field_name=Const.INPUT)
data.apply(lambda x: [vocab.to_index(w) for w in x[Const.INPUT]], new_field_name=Const.INPUT)
data.apply(lambda x: x[Const.INPUT_LENS(0)][: max_len], new_field_name=Const.INPUT_LENS(0))
data.apply(lambda x: x[Const.INPUT_LENS(1)][: max_len], new_field_name=Const.INPUT_LENS(1))

target_vocab = Vocabulary(padding=None, unknown=None)
target_vocab.add_word_lst(['neutral', 'contradiction', 'entailment'])
target_vocab.build_vocab()
data.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET)

data.set_input(Const.INPUT, Const.INPUT_LENS(0), Const.INPUT_LENS(1), Const.TARGET)
data.set_target(Const.TARGET)

return data


bert_dirs = 'path/to/bert/dir'

# load raw data set
train_data = SNLILoader().load('./data/snli/snli_1.0_train.jsonl')
dev_data = SNLILoader().load('./data/snli/snli_1.0_dev.jsonl')
test_data = SNLILoader().load('./data/snli/snli_1.0_test.jsonl')

print('successfully load data sets!')

train_data = preprocess_data(train_data, bert_dirs)
dev_data = preprocess_data(dev_data, bert_dirs)
test_data = preprocess_data(test_data, bert_dirs)

model = BertForNLI(bert_dir=bert_dirs)

trainer = Trainer(
train_data=train_data,
model=model,
optimizer=Adam(lr=2e-5, model_params=model.parameters()),
batch_size=torch.cuda.device_count() * 12,
n_epochs=4,
print_every=-1,
dev_data=dev_data,
metrics=AccuracyMetric(),
metric_key='acc',
device=[i for i in range(torch.cuda.device_count())],
check_code_level=-1
)
trainer.train(load_best_model=True)

tester = Tester(
data=test_data,
model=model,
metrics=AccuracyMetric(),
batch_size=torch.cuda.device_count() * 12,
device=[i for i in range(torch.cuda.device_count())],
)
tester.test()



+ 2
- 2
reproduction/matching/test/test_snlidataloader.py View File

@@ -1,10 +1,10 @@
import unittest
from ..data import SNLIDataLoader
from ..data import MatchingDataLoader
from fastNLP.core.vocabulary import Vocabulary


class TestCWSDataLoader(unittest.TestCase):
def test_case1(self):
snli_loader = SNLIDataLoader()
snli_loader = MatchingDataLoader()
# TODO: still in progress


+ 3
- 7
reproduction/seqence_labelling/cws/train_shift_relay.py View File

@@ -57,12 +57,8 @@ callbacks = [clipper]
# if pretrain:
# fixer = FixEmbedding([model.char_embedding, model.bigram_embedding], fix_until=fix_until)
# callbacks.append(fixer)
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None,
batch_size=32, sampler=sampler, update_every=5,
n_epochs=3, print_every=5,
dev_data=data.datasets['dev'], metrics=RelayMetric(), metric_key='f',
validate_every=-1, save_path=None,
prefetch=True, use_tqdm=True, device=device,
callbacks=callbacks,
trainer = Trainer(data.datasets['train'], model, optimizer=optimizer, loss=None, batch_size=32, sampler=sampler,
update_every=5, n_epochs=3, print_every=5, dev_data=data.datasets['dev'], metrics=RelayMetric(),
metric_key='f', validate_every=-1, save_path=None, use_tqdm=True, device=device, callbacks=callbacks,
check_code_level=0)
trainer.train()

+ 0
- 0
reproduction/seqence_labelling/ner/__init__.py View File


+ 93
- 0
reproduction/seqence_labelling/ner/data/Conll2003Loader.py View File

@@ -0,0 +1,93 @@

from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2


class Conll2003DataLoader(DataSetLoader):
def __init__(self, task:str='ner', encoding_type:str='bioes'):
"""
Loads English corpora in the Conll2003 format; information about the dataset can be found at https://www.clips.uantwerpen.be/conll2003/ner/.
When task is pos, the target of the returned DataSet comes from column 2; for chunk it comes from column 3; for ner it comes
from column 4. All "-DOCSTART- -X- O O" lines are ignored, which makes the data count smaller than the numbers reported in
many papers, but since "-DOCSTART- -X- O O" is only a document separator and should not be predicted, lines starting with -DOCSTART- are dropped.
For the ner and chunk tasks the targets are converted to encoding_type; for pos the target is simply the pos column.

:param task: the labelling task, one of ner, pos, chunk
"""
assert task in ('ner', 'pos', 'chunk')
index = {'ner':3, 'pos':1, 'chunk':2}[task]
self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index])
self._tag_converters = None
if task in ('ner', 'chunk'):
self._tag_converters = [iob2]
if encoding_type == 'bioes':
self._tag_converters.append(iob2bioes)

def load(self, path: str):
dataset = self._loader.load(path)
def convert_tag_schema(tags):
for converter in self._tag_converters:
tags = converter(tags)
return tags
if self._tag_converters:
dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET)
return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=True):
"""
读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否将所有字母转为小写
:return:
"""
# 读取数据
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words',
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data

if __name__ == '__main__':
pass
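A usage sketch for this loader (the directory path is hypothetical; it should contain the CoNLL-2003 train/dev/test files):

# a sketch; '/path/to/conll2003' is a hypothetical directory
loader = Conll2003DataLoader(task='ner', encoding_type='bioes')
data = loader.process('/path/to/conll2003', lower=True)
print(data.datasets['train'][:2])
print(len(data.vocabs[Const.INPUT]), len(data.vocabs[Const.TARGET]))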

+ 152
- 0
reproduction/seqence_labelling/ner/data/OntoNoteLoader.py View File

@@ -0,0 +1,152 @@
from fastNLP.core.vocabulary import VocabularyOption
from fastNLP.io.base_loader import DataSetLoader, DataInfo
from typing import Union, Dict
from fastNLP import DataSet
from fastNLP import Vocabulary
from fastNLP import Const
from reproduction.utils import check_dataloader_paths

from fastNLP.io.dataset_loader import ConllLoader
from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2

class OntoNoteNERDataLoader(DataSetLoader):
"""
用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。

"""
def __init__(self, encoding_type:str='bioes'):
assert encoding_type in ('bioes', 'bio')
self.encoding_type = encoding_type
if encoding_type=='bioes':
self.encoding_method = iob2bioes
else:
self.encoding_method = iob2

def load(self, path:str)->DataSet:
"""
给定一个文件路径,读取数据。返回的DataSet包含以下的field
raw_words: List[str]
target: List[str]

:param path:
:return:
"""
dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path)
def convert_to_bio(tags):
bio_tags = []
flag = None
for tag in tags:
label = tag.strip("()*")
if '(' in tag:
bio_label = 'B-' + label
flag = label
elif flag:
bio_label = 'I-' + flag
else:
bio_label = 'O'
if ')' in tag:
flag = None
bio_tags.append(bio_label)
return self.encoding_method(bio_tags)

def convert_word(words):
converted_words = []
for word in words:
word = word.replace('/.', '.') # 有些结尾的.是/.形式的
if not word.startswith('-'):
converted_words.append(word)
continue
# 以下是由于这些符号被转义了,再转回来
tfrs = {'-LRB-':'(',
'-RRB-': ')',
'-LSB-': '[',
'-RSB-': ']',
'-LCB-': '{',
'-RCB-': '}'
}
if word in tfrs:
converted_words.append(tfrs[word])
else:
converted_words.append(word)
return converted_words

dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words')
dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target')

return dataset

def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None,
lower:bool=True)->DataInfo:
"""
读取并处理数据。返回的DataInfo包含以下的内容
vocabs:
word: Vocabulary
target: Vocabulary
datasets:
train: DataSet
words: List[int], 被设置为input
target: int. label,被同时设置为input和target
seq_len: int. 句子的长度,被同时设置为input和target
raw_words: List[str]
xxx(根据传入的paths可能有所变化)

:param paths:
:param word_vocab_opt: vocabulary的初始化值
:param lower: 是否使用小写
:return:
"""
paths = check_dataloader_paths(paths)
data = DataInfo()
input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN]
target_fields = [Const.TARGET, Const.INPUT_LEN]
for name, path in paths.items():
dataset = self.load(path)
dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT)
if lower:
dataset.words.lower()
data.datasets[name] = dataset

# 对construct vocab
word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt)
word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT,
no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train'])
word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT)
data.vocabs[Const.INPUT] = word_vocab

# cap words
cap_word_vocab = Vocabulary()
cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words')
cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words')
input_fields.append('cap_words')
data.vocabs['cap_words'] = cap_word_vocab

# 对target建vocab
target_vocab = Vocabulary(unknown=None, padding=None)
target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET)
target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET)
data.vocabs[Const.TARGET] = target_vocab

for name, dataset in data.datasets.items():
dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN)
dataset.set_input(*input_fields)
dataset.set_target(*target_fields)

return data


if __name__ == '__main__':
loader = OntoNoteNERDataLoader()
dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt')
print(dataset.target.value_count())
print(dataset[:4])


"""
train 115812 2200752
development 15680 304684
test 12217 230111

train 92403 1901772
valid 13606 279180
test 10258 204135
"""

+ 49
- 0
reproduction/seqence_labelling/ner/data/utils.py View File

@@ -0,0 +1,49 @@
from typing import List

def iob2(tags:List[str])->List[str]:
"""
Check that the tags are valid IOB data; IOB1 tags are automatically converted to IOB2.

:param tags: the tags to convert
"""
for i, tag in enumerate(tags):
if tag == "O":
continue
split = tag.split("-")
if len(split) != 2 or split[0] not in ["I", "B"]:
raise TypeError("The encoding schema is not a valid IOB type.")
if split[0] == "B":
continue
elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
elif tags[i - 1][1:] == tag[1:]:
continue
else: # conversion IOB1 to IOB2
tags[i] = "B" + tag[1:]
return tags

def iob2bioes(tags:List[str])->List[str]:
"""
Convert IOB tags to the BIOES encoding.
:param tags:
:return:
"""
new_tags = []
for i, tag in enumerate(tags):
if tag == 'O':
new_tags.append(tag)
else:
split = tag.split('-')[0]
if split == 'B':
if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('B-', 'S-'))
elif split == 'I':
if i + 1<len(tags) and tags[i+1].split('-')[0] == 'I':
new_tags.append(tag)
else:
new_tags.append(tag.replace('I-', 'E-'))
else:
raise TypeError("Invalid IOB format.")
return new_tags
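
A quick demonstration of the two converters above (the tag values mirror the unit test further down; the module path follows this file's location):

from reproduction.seqence_labelling.ner.data.utils import iob2, iob2bioes

tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O']  # IOB1-style input
tags = iob2(tags)       # -> ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
print(iob2bioes(tags))  # -> ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O']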

+ 56
- 0
reproduction/seqence_labelling/ner/model/lstm_cnn_crf.py

@@ -0,0 +1,56 @@

import torch
from torch import nn
from fastNLP import seq_len_to_mask
from fastNLP.modules import Embedding
from fastNLP.modules import LSTM
from fastNLP.modules import ConditionalRandomField, allowed_transitions
import torch.nn.functional as F
from fastNLP import Const

class CNNBiLSTMCRF(nn.Module):
def __init__(self, embed, char_embed, hidden_size, num_layers, tag_vocab, dropout=0.5, encoding_type='bioes'):
super().__init__()

self.embedding = Embedding(embed, dropout=0.5, dropout_word=0)
self.char_embedding = Embedding(char_embed, dropout=0.5, dropout_word=0.01)
self.lstm = LSTM(input_size=self.embedding.embedding_dim+self.char_embedding.embedding_dim,
hidden_size=hidden_size//2, num_layers=num_layers,
bidirectional=True, batch_first=True)
self.fc = nn.Linear(hidden_size, len(tag_vocab))

transitions = allowed_transitions(tag_vocab.idx2word, encoding_type=encoding_type, include_start_end=True)
self.crf = ConditionalRandomField(len(tag_vocab), include_start_end_trans=True, allowed_transitions=transitions)

self.dropout = nn.Dropout(dropout, inplace=True)

for name, param in self.named_parameters():
if 'fc' in name:
if param.data.dim()>1:
nn.init.xavier_uniform_(param)
else:
nn.init.constant_(param, 0)
if 'crf' in name:
nn.init.zeros_(param)

def _forward(self, words, cap_words, seq_len, target=None):
words = self.embedding(words)
chars = self.char_embedding(cap_words)
words = torch.cat([words, chars], dim=-1)
outputs, _ = self.lstm(words, seq_len)
self.dropout(outputs)  # nn.Dropout was built with inplace=True, so outputs is modified in place

logits = F.log_softmax(self.fc(outputs), dim=-1)

if target is not None:
loss = self.crf(logits, target, seq_len_to_mask(seq_len))
return {Const.LOSS: loss}
else:
pred, _ = self.crf.viterbi_decode(logits, seq_len_to_mask(seq_len))
return {Const.OUTPUT: pred}

def forward(self, words, cap_words, seq_len, target):
return self._forward(words, cap_words, seq_len, target)

def predict(self, words, cap_words, seq_len):
return self._forward(words, cap_words, seq_len, None)
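
As the code above shows, forward() returns {'loss': ...} for training while predict() returns {'pred': ...} for evaluation, and both paths mask padded positions with fastNLP's seq_len_to_mask. A tiny, self-contained illustration of that mask (independent of the model itself):

import torch
from fastNLP import seq_len_to_mask

seq_len = torch.tensor([7, 5])
mask = seq_len_to_mask(seq_len)  # shape (2, 7); the last two positions of the second row are False
print(mask)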

+ 0
- 0
reproduction/seqence_labelling/ner/test/__init__.py


+ 33
- 0
reproduction/seqence_labelling/ner/test/test.py

@@ -0,0 +1,33 @@

from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
from reproduction.seqence_labelling.ner.data.Conll2003Loader import iob2, iob2bioes
import unittest

class TestTagSchemaConverter(unittest.TestCase):
def test_iob2(self):
tags = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
golden = ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
self.assertListEqual(golden, iob2(tags))

tags = ['I-ORG', 'O']
golden = ['B-ORG', 'O']
self.assertListEqual(golden, iob2(tags))

tags = ['I-MISC', 'I-MISC', 'O', 'I-PER', 'I-PER', 'O']
golden = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
self.assertListEqual(golden, iob2(tags))

def test_iob2bemso(self):
tags = ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O']
golden = ['B-MISC', 'E-MISC', 'O', 'B-PER', 'E-PER', 'O']
self.assertListEqual(golden, iob2bioes(tags))


def test_conll2003_loader():
path = '/hdd/fudanNLP/fastNLP/others/data/conll2003/train.txt'
loader = Conll2003DataLoader().load(path)
print(loader[:3])


if __name__ == '__main__':
test_conll2003_loader()

+ 70
- 0
reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py

@@ -0,0 +1,70 @@


from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, LSTMCharEmbedding
from fastNLP.core.vocabulary import VocabularyOption

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader

encoding_type = 'bioes'

data = Conll2003DataLoader(encoding_type=encoding_type).process('../../../../others/data/conll2003',
word_vocab_opt=VocabularyOption(min_freq=2),
lower=False)
print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
kernel_sizes=[3])
# char_embed = LSTMCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30 ,char_emb_size=30)
word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/wiki_en_100_50_case_2.txt',
requires_grad=True)
word_embed.embedding.weight.data = word_embed.embedding.weight.data/word_embed.embedding.weight.data.std()

# import joblib
# raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/conll_with_data.joblib')
# def convert_to_ids(raw_words):
# ids = []
# for word in raw_words:
# id = raw_data['word_to_id'][word]
# id = raw_data['id_to_emb_map'][id]
# ids.append(id)
# return ids
# word_embed = raw_data['emb_matrix']
# for name, dataset in data.datasets.items():
# dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

# word_embed = ElmoEmbedding(vocab=data.vocabs['cap_words'],
# model_dir_or_name='/hdd/fudanNLP/fastNLP/others/pretrained_models/elmo_en',
# requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
encoding_type=encoding_type)

callbacks = [
GradientClipCallback(clip_type='value', clip_value=5)
, FitlogCallback({'test':data.datasets['test']}, verbose=1)
]
# optimizer = Adam(model.parameters(), lr=0.005)
optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = SGD(model.parameters(), lr=0.008, momentum=0.9)
# scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
# callbacks.append(scheduler)

trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(),
device=1, dev_data=data.datasets['dev'], batch_size=10,
metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()

+ 65
- 0
reproduction/seqence_labelling/ner/train_ontonote.py

@@ -0,0 +1,65 @@
import sys

sys.path.append('../../..')

from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding

from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler
from fastNLP import Const
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import LambdaLR
from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler
from reproduction.seqence_labelling.ner.model.swats import SWATS

import fitlog
fitlog.debug()

from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader

encoding_type = 'bioes'

data = OntoNoteNERDataLoader(encoding_type=encoding_type).process('/hdd/fudanNLP/fastNLP/others/data/v4/english',
lower=True)

import joblib
raw_data = joblib.load('/hdd/fudanNLP/fastNLP/others/NER-with-LS/data/ontonotes_with_data.joblib')
def convert_to_ids(raw_words):
ids = []
for word in raw_words:
id = raw_data['word_to_id'][word]
id = raw_data['id_to_emb_map'][id]
ids.append(id)
return ids
word_embed = raw_data['emb_matrix']
for name, dataset in data.datasets.items():
dataset.apply_field(convert_to_ids, field_name='raw_words', new_field_name=Const.INPUT)

print(data)
char_embed = CNNCharEmbedding(vocab=data.vocabs['cap_words'], embed_size=30, char_emb_size=30, filter_nums=[30],
kernel_sizes=[3])
# word_embed = StaticEmbedding(vocab=data.vocabs[Const.INPUT],
# model_dir_or_name='/hdd/fudanNLP/pretrain_vectors/glove.6B.100d.txt',
# requires_grad=True)

model = CNNBiLSTMCRF(word_embed, char_embed, hidden_size=1200, num_layers=1, tag_vocab=data.vocabs[Const.TARGET],
encoding_type=encoding_type)

callbacks = [GradientClipCallback(clip_value=5, clip_type='value'),
FitlogCallback(data.datasets['test'], verbose=1)]

optimizer = SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = LRScheduler(LambdaLR(optimizer, lr_lambda=lambda epoch: 1 / (1 + 0.05 * epoch)))
callbacks.append(scheduler)
# optimizer = SWATS(model.parameters(), verbose=True)
# optimizer = Adam(model.parameters(), lr=0.005)


trainer = Trainer(train_data=data.datasets['train'], model=model, optimizer=optimizer, sampler=BucketSampler(num_buckets=100),
device=0, dev_data=data.datasets['dev'], batch_size=10,
metrics=SpanFPreRecMetric(tag_vocab=data.vocabs[Const.TARGET], encoding_type=encoding_type),
callbacks=callbacks, num_workers=1, n_epochs=100)
trainer.train()

+ 3
- 2
reproduction/utils.py

@@ -13,7 +13,8 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
}
If paths is invalid, the corresponding error is raised directly.

:param paths: the path
:param paths: the path(s). A single file path is treated as the train file; a directory is searched for train.txt,
test.txt and dev.txt; a dict maps user-defined split names to the corresponding file paths.
:return:
"""
if isinstance(paths, str):
@@ -24,7 +25,7 @@ def check_dataloader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]:
if not os.path.isfile(train_fp):
raise FileNotFoundError(f"train.txt is not found in folder {paths}.")
files = {'train': train_fp}
for filename in ['test.txt', 'dev.txt']:
for filename in ['dev.txt', 'test.txt']:
fp = os.path.join(paths, filename)
if os.path.isfile(fp):
files[filename.split('.')[0]] = fp
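
To make the accepted forms of paths concrete, here is a hedged sketch (temporary files are created only for the demo; the single-file and dict forms are shown as comments, assuming the module path reproduction.utils from this diff):

import os
import tempfile
from reproduction.utils import check_dataloader_paths

tmp = tempfile.mkdtemp()
for name in ('train.txt', 'dev.txt'):  # no test.txt: missing optional splits are simply skipped
    open(os.path.join(tmp, name), 'w').close()
print(check_dataloader_paths(tmp))     # directory form -> {'train': ..., 'dev': ...}
# single-file form (treated as the train split): check_dataloader_paths('.../train.txt')
# dict form (user-defined split names to paths): check_dataloader_paths({'train': ..., 'test': ...})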


+ 1
- 1
setup.py

@@ -13,7 +13,7 @@ with open('requirements.txt', encoding='utf-8') as f:

setup(
name='FastNLP',
version='0.4.0',
version='dev0.5.0',
description='fastNLP: Deep Learning Toolkit for NLP, developed by Fudan FastNLP Team',
long_description=readme,
long_description_content_type='text/markdown',


+ 10
- 10
test/core/test_batch.py

@@ -3,7 +3,7 @@ import unittest
import numpy as np
import torch

from fastNLP import Batch
from fastNLP import DataSetIter
from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import SequentialSampler
@@ -57,7 +57,7 @@ class TestCase1(unittest.TestCase):
dataset = construct_dataset(
[["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
dataset.set_target()
batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
batch = DataSetIter(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
cnt = 0
for _, _ in batch:
@@ -68,7 +68,7 @@ class TestCase1(unittest.TestCase):
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
self.assertEqual(len(x["x"]), 4)
@@ -81,7 +81,7 @@ class TestCase1(unittest.TestCase):
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
@@ -91,7 +91,7 @@ class TestCase1(unittest.TestCase):
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))
@@ -101,7 +101,7 @@ class TestCase1(unittest.TestCase):
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -113,7 +113,7 @@ class TestCase1(unittest.TestCase):
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -125,7 +125,7 @@ class TestCase1(unittest.TestCase):
[Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
@@ -137,7 +137,7 @@ class TestCase1(unittest.TestCase):
[Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
iter = DataSetIter(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
print(x, y)
@@ -146,7 +146,7 @@ class TestCase1(unittest.TestCase):
num_samples = 1000
dataset = generate_fake_dataset(num_samples)
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
pass


+ 27
- 73
test/core/test_callbacks.py

@@ -40,89 +40,50 @@ class TestCallback(unittest.TestCase):
def test_gradient_clip(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=20, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)], check_code_level=2)
trainer.train()
def test_early_stop(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.01),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[EarlyStopCallback(5)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.01), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=20, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[EarlyStopCallback(5)], check_code_level=2)
trainer.train()
def test_lr_scheduler(self):
data_set, model = prepare_env()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=optimizer,
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))])
trainer = Trainer(data_set, model, optimizer=optimizer, loss=BCELoss(pred="predict", target="y"), batch_size=32,
n_epochs=5, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))],
check_code_level=2)
trainer.train()
def test_KeyBoardInterrupt(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[ControlC(False)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, use_tqdm=False, callbacks=[ControlC(False)],
check_code_level=2)
trainer.train()
def test_LRFinder(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)], check_code_level=2)
trainer.train()
def test_TensorboardCallback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[TensorboardCallback("loss", "metric")])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=5, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False,
callbacks=[TensorboardCallback("loss", "metric")], check_code_level=2)
trainer.train()
def test_readonly_property(self):
@@ -141,16 +102,9 @@ class TestCallback(unittest.TestCase):
print(self.optimizer)
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=total_epochs,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[MyCallback()])
trainer = Trainer(data_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=total_epochs, print_every=50, dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"), use_tqdm=False, callbacks=[MyCallback()],
check_code_level=2)
trainer.train()
assert passed_epochs == list(range(1, total_epochs + 1))

+ 9
- 1
test/core/test_metrics.py

@@ -161,7 +161,15 @@ class TestAccuracyMetric(unittest.TestCase):
print(e)
return
self.assertTrue(True, False), "No exception catches."

def test_duplicate(self):
# potential bug in 0.4.1: duplicated argument names must not break the metric
metric = AccuracyMetric(pred='predictions', target='targets')
pred_dict = {"predictions": torch.zeros(4, 3, 2), "seq_len": torch.ones(4) * 3, 'pred':0}
target_dict = {'targets':torch.zeros(4, 3), 'target': 0}
metric(pred_dict=pred_dict, target_dict=target_dict)


def test_seq_len(self):
N = 256
seq_len = torch.zeros(N).long()


+ 11
- 48
test/core/test_trainer.py

@@ -46,18 +46,10 @@ class TrainerTestGround(unittest.TestCase):
model = NaiveClassifier(2, 1)
trainer = Trainer(train_set, model,
loss=BCELoss(pred="predict", target="y"),
metrics=AccuracyMetric(pred="predict", target="y"),
n_epochs=10,
batch_size=32,
print_every=50,
validate_every=-1,
dev_data=dev_set,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=True,
save_path=None)
trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCELoss(pred="predict", target="y"),
batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
use_tqdm=True, check_code_level=2)
trainer.train()
"""
# should run correctly
@@ -83,10 +75,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(RuntimeError):
trainer = Trainer(
train_data=dataset,
model=model
)
trainer = Trainer(train_data=dataset, model=model)
"""
# the expected error message
NameError:
@@ -116,12 +105,7 @@ class TrainerTestGround(unittest.TestCase):
return {'loss': loss}
model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
trainer.train()
"""
# should run correctly
@@ -147,12 +131,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
trainer.train()
def test_trainer_suggestion4(self):
@@ -175,12 +154,7 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
def test_trainer_suggestion5(self):
# check that the error message correctly informs the user
@@ -203,12 +177,7 @@ class TrainerTestGround(unittest.TestCase):
return {'loss': loss}
model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=False,
print_every=2
)
trainer = Trainer(train_data=dataset, model=model, print_every=2, use_tqdm=False)
def test_trainer_suggestion6(self):
# check that the error message correctly informs the user
@@ -233,14 +202,8 @@ class TrainerTestGround(unittest.TestCase):
model = Model()
with self.assertRaises(NameError):
trainer = Trainer(
train_data=dataset,
model=model,
dev_data=dataset,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric(),
use_tqdm=False,
print_every=2)
trainer = Trainer(train_data=dataset, model=model, loss=CrossEntropyLoss(), print_every=2, dev_data=dataset,
metrics=AccuracyMetric(), use_tqdm=False)
"""
def test_trainer_multiprocess(self):


+ 9
- 0
test/core/test_utils.py

@@ -237,6 +237,10 @@ class TestSeqLenToMask(unittest.TestCase):
with self.assertRaises(AssertionError):
mask = seq_len_to_mask(seq_len)

# 3. pad to a specified length
seq_len = np.random.randint(1, 10, size=(10,))
mask = seq_len_to_mask(seq_len, 100)
self.assertEqual(100, mask.shape[1])

def test_pytorch_seq_len(self):
# 1. random test
@@ -250,3 +254,8 @@ class TestSeqLenToMask(unittest.TestCase):
seq_len = torch.randn(3, 4)
with self.assertRaises(AssertionError):
mask = seq_len_to_mask(seq_len)

# 3. pad to a specified length
seq_len = torch.randint(1, 10, size=(10, ))
mask = seq_len_to_mask(seq_len, 100)
self.assertEqual(100, mask.size(1))

+ 18
- 0
test/core/test_vocabulary.py

@@ -70,6 +70,24 @@ class TestAdd(unittest.TestCase):
self.assertEqual(vocab.to_index(chr(start_char + i)), i + 2)
vocab.index_dataset(dataset, field_name='char')

def test_from_dataset_no_entry(self):
# check that no_create_entry is set correctly
dataset = DataSet()
start_char = 65
num_samples = 10
test_dataset = DataSet()
for i in range(num_samples):
char = [chr(start_char + i)] * 6
ins = Instance(char=char)
dataset.append(ins)
ins = Instance(char=[c+c for c in char])
test_dataset.append(ins)
vocab = Vocabulary()
vocab.from_dataset(dataset, field_name='char', no_create_entry_dataset=test_dataset)
vocab.index_dataset(dataset, field_name='char')
for i in range(num_samples):
self.assertEqual(True, vocab._is_word_no_create_entry(chr(start_char + i)+chr(start_char + i)))


class TestIndexing(unittest.TestCase):
def test_len(self):


+ 2
- 5
test/models/model_runner.py

@@ -130,11 +130,8 @@ class ModelRunner():
tester = Tester(data=data, model=model, metrics=metrics,
batch_size=BATCH_SIZE, verbose=0)
before_train = tester.test()
trainer = Trainer(model=model, train_data=data, dev_data=None,
n_epochs=N_EPOCHS, batch_size=BATCH_SIZE,
loss=loss,
save_path=None,
use_tqdm=False)
trainer = Trainer(train_data=data, model=model, loss=loss, batch_size=BATCH_SIZE, n_epochs=N_EPOCHS,
dev_data=None, save_path=None, use_tqdm=False)
trainer.train(load_best_model=False)
after_train = tester.test()
for metric_name, v1 in before_train.items():


+ 0
- 1
test/models/test_biaffine_parser.py

@@ -1,6 +1,5 @@
import unittest

import fastNLP
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
from .model_runner import *



+ 5
- 5
test/modules/decoder/test_CRF.py

@@ -10,14 +10,14 @@ class TestCRF(unittest.TestCase):
id2label = {0: 'B', 1: 'I', 2:'O'}
expected_res = {(0, 0), (0, 1), (0, 2), (0, 4), (1, 0), (1, 1), (1, 2), (1, 4), (2, 0), (2, 2),
(2, 4), (3, 0), (3, 2)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True)))

id2label = {0: 'B', 1:'M', 2:'E', 3:'S'}
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 5), (3, 0), (3, 3), (3, 5), (4, 0), (4, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True)))

id2label = {0: 'B', 1: 'I', 2:'O', 3: '<pad>', 4:"<unk>"}
allowed_transitions(id2label)
allowed_transitions(id2label, include_start_end=True)

labels = ['O']
for label in ['X', 'Y']:
@@ -27,7 +27,7 @@ class TestCRF(unittest.TestCase):
expected_res = {(0, 0), (0, 1), (0, 3), (0, 6), (1, 0), (1, 1), (1, 2), (1, 3), (1, 6), (2, 0), (2, 1),
(2, 2), (2, 3), (2, 6), (3, 0), (3, 1), (3, 3), (3, 4), (3, 6), (4, 0), (4, 1), (4, 3),
(4, 4), (4, 6), (5, 0), (5, 1), (5, 3)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label)))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, include_start_end=True)))

labels = []
for label in ['X', 'Y']:
@@ -37,7 +37,7 @@ class TestCRF(unittest.TestCase):
expected_res = {(0, 1), (0, 2), (1, 1), (1, 2), (2, 0), (2, 3), (2, 4), (2, 7), (2, 9), (3, 0), (3, 3), (3, 4),
(3, 7), (3, 9), (4, 5), (4, 6), (5, 5), (5, 6), (6, 0), (6, 3), (6, 4), (6, 7), (6, 9), (7, 0),
(7, 3), (7, 4), (7, 7), (7, 9), (8, 0), (8, 3), (8, 4), (8, 7)}
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES')))
self.assertSetEqual(expected_res, set(allowed_transitions(id2label, encoding_type='BMES', include_start_end=True)))

def test_case2(self):
# check that the CRF avoids decoding illegal transitions; verified against allennlp.


+ 9
- 18
test/test_tutorials.py

@@ -60,10 +60,10 @@ class TestTutorial(unittest.TestCase):
print(test_data[0])

# these data-preprocessing utilities can also be used for projects such as reinforcement learning or GANs
from fastNLP.core.batch import Batch
from fastNLP.core.batch import DataSetIter
from fastNLP.core.sampler import RandomSampler

batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
print("batch_x has: ", batch_x)
print("batch_y has: ", batch_y)
@@ -80,23 +80,19 @@ class TestTutorial(unittest.TestCase):
test_data.rename_field('label', 'label_seq')

loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")
metric = AccuracyMetric(target="label_seq")

# instantiate a Trainer with the model and data, then train
# first overfit on test_data (to make sure the model implementation is correct)
copy_model = deepcopy(model)
overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
loss=loss,
metrics=metric,
save_path=None,
batch_size=32,
n_epochs=5)
overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
dev_data=test_data, metrics=metric, save_path=None)
overfit_trainer.train()

# train on train_data, validate on test_data
trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"),
metrics=AccuracyMetric(target="label_seq"),
save_path=None,
batch_size=32,
n_epochs=5)
@@ -106,7 +102,7 @@ class TestTutorial(unittest.TestCase):
# use Tester to evaluate on test_data
from fastNLP import Tester

tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)
@@ -147,13 +143,8 @@ class TestTutorial(unittest.TestCase):

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam

trainer = Trainer(model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
optimizer= Adam(),
metrics=AccuracyMetric(target='target')
)
trainer = Trainer(train_data=train_data, model=model, optimizer=Adam(), loss=CrossEntropyLoss(),
dev_data=dev_data, metrics=AccuracyMetric(target='target'))
trainer.train()
print('Train finished!')


