
- add document

tags/v0.4.10
yunfan · 5 years ago
commit 001586fa3e
18 changed files with 655 additions and 327 deletions
  1. fastNLP/api/api.py (+2, -2)
  2. fastNLP/core/__init__.py (+1, -1)
  3. fastNLP/core/batch.py (+36, -26)
  4. fastNLP/core/dataset.py (+1, -1)
  5. fastNLP/core/sampler.py (+18, -41)
  6. fastNLP/core/trainer.py (+3, -3)
  7. fastNLP/core/vocabulary.py (+92, -40)
  8. fastNLP/io/dataset_loader.py (+80, -40)
  9. fastNLP/io/file_reader.py (+12, -9)
  10. fastNLP/models/biaffine_parser.py (+135, -53)
  11. fastNLP/models/star_transformer.py (+127, -28)
  12. fastNLP/modules/encoder/lstm.py (+23, -13)
  13. fastNLP/modules/encoder/star_transformer.py (+26, -21)
  14. fastNLP/modules/encoder/transformer.py (+9, -8)
  15. fastNLP/modules/encoder/variational_rnn.py (+72, -23)
  16. test/core/test_sampler.py (+1, -7)
  17. test/models/test_biaffine_parser.py (+10, -10)
  18. test/test_tutorials.py (+7, -1)

fastNLP/api/api.py (+2, -2)

@@ -47,7 +47,7 @@ from fastNLP.core.dataset import DataSet

from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from fastNLP.io.dataset_loader import cut_long_sentence, ConllLoader
from fastNLP.io.dataset_loader import _cut_long_sentence, ConllLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
@@ -107,7 +107,7 @@ class ConllCWSReader(object):
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
sents = _cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:


fastNLP/core/__init__.py (+1, -1)

@@ -5,7 +5,7 @@ from .instance import Instance
from .losses import LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward
from .metrics import AccuracyMetric
from .optimizer import Optimizer, SGD, Adam
from .sampler import SequentialSampler, BucketSampler, RandomSampler, BaseSampler
from .sampler import SequentialSampler, BucketSampler, RandomSampler, Sampler
from .tester import Tester
from .trainer import Trainer
from .vocabulary import Vocabulary


fastNLP/core/batch.py (+36, -26)

@@ -2,7 +2,7 @@ import numpy as np
import torch
import atexit

from fastNLP.core.sampler import RandomSampler
from fastNLP.core.sampler import RandomSampler, Sampler
import torch.multiprocessing as mp

_python_is_exit = False
@@ -12,19 +12,25 @@ def _set_python_is_exit():
atexit.register(_set_python_is_exit)

class Batch(object):
"""Batch is an iterable object which iterates over mini-batches.

Example::

for batch_x, batch_y in Batch(data_set, batch_size=16, sampler=SequentialSampler()):
# ...

:param DataSet dataset: a DataSet object
:param int batch_size: the size of the batch
:param Sampler sampler: a Sampler object. If None, use fastNLP.sampler.RandomSampler
:param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors.
:param bool prefetch: If True, use multiprocessing to fetch next batch when training.
:param str or torch.device device: the batch's device, if as_numpy is True, device is ignored.
"""
Batch 用于从 `DataSet` 中按一定的顺序, 依次按 ``batch_size`` 的大小将数据取出.
组成 `x` 和 `y`

Example::

batch = Batch(data_set, batch_size=16, sampler=SequentialSampler())
num_batch = len(batch)
for batch_x, batch_y in batch:
# do stuff ...

:param DataSet dataset: `DataSet` 对象, 数据集
:param int batch_size: 取出的batch大小
:param Sampler sampler: 规定使用的 Sample 方式. 若为 ``None`` , 使用 RandomSampler.
Default: ``None``
:param bool as_numpy: 若为 ``True`` , 输出batch为 numpy.array. 否则为 torch.Tensor.
Default: ``False``
:param bool prefetch: 若为 ``True`` 使用多进程预先取出下一batch.
Default: ``False``
"""

def __init__(self, dataset, batch_size, sampler=None, as_numpy=False, prefetch=False):
@@ -41,7 +47,7 @@ class Batch(object):
self.prefetch = prefetch
self.lengths = 0

def fetch_one(self):
def _fetch_one(self):
if self.curidx >= len(self.idx_list):
return None
else:
@@ -55,7 +61,7 @@ class Batch(object):
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy and field.padder is not None:
batch = to_tensor(batch, field.dtype)
batch = _to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
if field.is_input:
@@ -70,17 +76,17 @@ class Batch(object):
:return:
"""
if self.prefetch:
return run_batch_iter(self)
return _run_batch_iter(self)
def batch_iter():
self.init_iter()
self._init_iter()
while 1:
res = self.fetch_one()
res = self._fetch_one()
if res is None:
break
yield res
return batch_iter()

def init_iter(self):
def _init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
@@ -89,10 +95,14 @@ class Batch(object):
return self.num_batches

def get_batch_indices(self):
"""取得当前batch在DataSet中所在的index下标序列

:return list(int) indexes: 下标序列
"""
return self.cur_batch_indices


def to_tensor(batch, dtype):
def _to_tensor(batch, dtype):
try:
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
@@ -103,12 +113,12 @@ def to_tensor(batch, dtype):
return batch


def run_fetch(batch, q):
def _run_fetch(batch, q):
global _python_is_exit
batch.init_iter()
batch._init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
res = batch._fetch_one()
# print('fetch one')
while 1:
try:
@@ -124,9 +134,9 @@ def run_fetch(batch, q):
# print('fetch exit')


def run_batch_iter(batch):
def _run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=run_fetch, args=(batch, q))
fetch_p = mp.Process(target=_run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
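To make the renamed private helpers easier to follow, here is a minimal usage sketch of the Batch class as documented above; the toy DataSet and field names are illustrative, not part of this commit::

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.batch import Batch
    from fastNLP.core.sampler import SequentialSampler

    # toy DataSet: 'x' is marked as input, 'y' as target
    data_set = DataSet({'x': [[1, 2], [3, 4], [5, 6], [7, 8]], 'y': [0, 1, 0, 1]})
    data_set.set_input('x')
    data_set.set_target('y')

    batch = Batch(data_set, batch_size=2, sampler=SequentialSampler(), as_numpy=False)
    num_batch = len(batch)                  # number of mini-batches per epoch
    for batch_x, batch_y in batch:          # dicts: {'x': tensor}, {'y': tensor}
        pass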


fastNLP/core/dataset.py (+1, -1)

@@ -482,7 +482,7 @@ class DataSet(object):

"""
import warnings
warnings.warn('read_csv is deprecated, use CSVLoader instead',
warnings.warn('DataSet.read_csv is deprecated, use CSVLoader instead',
category=DeprecationWarning)
with open(csv_path, "r", encoding='utf-8') as f:
start_idx = 0


fastNLP/core/sampler.py (+18, -41)

@@ -3,72 +3,49 @@ from itertools import chain
import numpy as np
import torch

class Sampler(object):
""" `Sampler` 类的基类. 规定以何种顺序取出data中的元素

def convert_to_torch_tensor(data_list, use_cuda):
"""Convert lists into (cuda) Tensors.

:param data_list: 2-level lists
:param use_cuda: bool, whether to use GPU or not
:return data_list: PyTorch Tensor of shape [batch_size, max_seq_len]
"""
data_list = torch.Tensor(data_list).long()
if torch.cuda.is_available() and use_cuda:
data_list = data_list.cuda()
return data_list


class BaseSampler(object):
"""The base class of all samplers.

Sub-classes must implement the ``__call__`` method.
``__call__`` takes a DataSet object and returns a list of int - the sampling indices.
子类必须实现 ``__call__`` 方法. 输入 `DataSet` 对象, 返回其中元素的下标序列
"""

def __call__(self, *args, **kwargs):
def __call__(self, data_set):
"""
:param DataSet data_set: `DataSet` 对象, 需要Sample的数据
:return result: list(int) 其中元素的下标序列, ``data_set`` 中元素会按 ``result`` 中顺序取出
"""
raise NotImplementedError


class SequentialSampler(BaseSampler):
"""Sample data in the original order.
class SequentialSampler(Sampler):
"""顺序取出元素的 `Sampler`

"""
def __call__(self, data_set):
"""

:param DataSet data_set:
:return result: a list of integers.
"""
return list(range(len(data_set)))


class RandomSampler(BaseSampler):
"""Sample data in random permutation order.
class RandomSampler(Sampler):
"""随机化取元素的 `Sampler`

"""
def __call__(self, data_set):
"""

:param DataSet data_set:
:return result: a list of integers.
"""
return list(np.random.permutation(len(data_set)))


class BucketSampler(BaseSampler):
"""

:param int num_buckets: the number of buckets to use.
:param int batch_size: batch size per epoch.
:param str seq_lens_field_name: the field name indicating the field about sequence length.
class BucketSampler(Sampler):
"""带Bucket的 `Random Sampler`. 可以随机地取出长度相似的元素

:param int num_buckets: bucket的数量
:param int batch_size: batch的大小
:param str seq_lens_field_name: 对应序列长度的 `field` 的名字
"""
def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_lens'):
def __init__(self, num_buckets=10, batch_size=32, seq_lens_field_name='seq_len'):
self.num_buckets = num_buckets
self.batch_size = batch_size
self.seq_lens_field_name = seq_lens_field_name

def __call__(self, data_set):

seq_lens = data_set.get_all_fields()[self.seq_lens_field_name].content
total_sample_num = len(seq_lens)
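The new base class only requires ``__call__``; a hedged sketch of a custom sampler built on that contract (the ``SortedSampler`` name and the ``'seq_len'`` field are hypothetical)::

    import numpy as np
    from fastNLP.core.sampler import Sampler

    class SortedSampler(Sampler):
        # hypothetical example: yield indices sorted by the 'seq_len' field, shortest first
        def __call__(self, data_set):
            seq_lens = data_set.get_all_fields()['seq_len'].content
            return list(np.argsort(seq_lens))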



fastNLP/core/trainer.py (+3, -3)

@@ -18,7 +18,7 @@ from fastNLP.core.dataset import DataSet
from fastNLP.core.losses import _prepare_losser
from fastNLP.core.metrics import _prepare_metrics
from fastNLP.core.optimizer import Adam
from fastNLP.core.sampler import BaseSampler
from fastNLP.core.sampler import Sampler
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.tester import Tester
@@ -57,7 +57,7 @@ class Trainer(object):
smaller, add "-" in front of the string. For example::

metric_key="-PPL" # language model gets better as perplexity gets smaller
:param BaseSampler sampler: method used to generate batch data.
:param Sampler sampler: method used to generate batch data.
:param prefetch: bool, 是否使用额外的进程对产生batch数据。
:param bool use_tqdm: whether to use tqdm to show train progress.
:param callbacks: List[Callback]. 用于在train过程中起调节作用的回调函数。比如early stop,negative sampling等可以
@@ -102,7 +102,7 @@ class Trainer(object):
losser = _prepare_losser(loss)

# sampler check
if sampler is not None and not isinstance(sampler, BaseSampler):
if sampler is not None and not isinstance(sampler, Sampler):
raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

if check_code_level > -1:
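A small sketch of what the type check above now accepts, using the renamed ``BucketSampler`` with its new default ``seq_lens_field_name='seq_len'``::

    from fastNLP.core.sampler import Sampler, BucketSampler

    sampler = BucketSampler(num_buckets=10, batch_size=32, seq_lens_field_name='seq_len')
    assert isinstance(sampler, Sampler)   # passes the check in Trainer.__init__ above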


fastNLP/core/vocabulary.py (+92, -40)

@@ -1,3 +1,4 @@
from functools import wraps
from collections import Counter
from fastNLP.core.dataset import DataSet

@@ -5,7 +6,7 @@ def check_build_vocab(func):
"""A decorator to make sure the indexing is built before used.

"""
@wraps(func) # to solve missing docstring
def _wrapper(self, *args, **kwargs):
if self.word2idx is None or self.rebuild is True:
self.build_vocab()
@@ -18,7 +19,7 @@ def check_build_status(func):
"""A decorator to check whether the vocabulary updates after the last build.

"""
@wraps(func) # to solve missing docstring
def _wrapper(self, *args, **kwargs):
if self.rebuild is False:
self.rebuild = True
@@ -32,23 +33,28 @@ def check_build_status(func):


class Vocabulary(object):
"""Use for word and index one to one mapping
"""
用于构建, 存储和使用 `str` 到 `int` 的一一映射

Example::

vocab = Vocabulary()
word_list = "this is a word list".split()
vocab.update(word_list)
vocab["word"]
vocab.to_word(5)

:param int max_size: set the max number of words in Vocabulary. Default: None
:param int min_freq: set the min occur frequency of words in Vocabulary. Default: None
:param padding: str, padding的字符,默认为<pad>。如果设置为None,则vocabulary中不考虑padding,为None的情况多在为label建立
Vocabulary的情况。
:param unknown: str, unknown的字符,默认为<unk>。如果设置为None,则vocabulary中不考虑unknown,为None的情况多在为label建立
Vocabulary的情况。

vocab["word"] # str to int
vocab.to_word(5) # int to str

:param int max_size: `Vocabulary` 的最大大小, 即能存储词的最大数量
若为 ``None`` , 则不限制大小. Default: ``None``
:param int min_freq: 能被记录下的词在文本中的最小出现频率, 应大于或等于 1.
若小于该频率, 词语将被视为 `unknown`. 若为 ``None`` , 所有文本中的词都被记录. Default: ``None``
:param str padding: padding的字符. 如果设置为 ``None`` ,
则vocabulary中不考虑padding, 也不计入词表大小,为 ``None`` 的情况多在为label建立Vocabulary的情况.
Default: '<pad>'
:param str unknown: unknown的字符,所有未被记录的词在转为 `int` 时将被视为unknown.
如果设置为 ``None`` ,则vocabulary中不考虑unknown, 也不计入词表大小.
为 ``None`` 的情况多在为label建立Vocabulary的情况.
Default: '<unk>'
"""

def __init__(self, max_size=None, min_freq=None, padding='<pad>', unknown='<unk>'):
@@ -63,7 +69,7 @@ class Vocabulary(object):

@check_build_status
def update(self, word_lst):
"""Add a list of words into the vocabulary.
"""依次增加序列中词在词典中的出现频率

:param list word_lst: a list of strings
"""
@@ -71,32 +77,35 @@ class Vocabulary(object):

@check_build_status
def add(self, word):
"""Add a single word into the vocabulary.
"""
增加一个新词在词典中的出现频率

:param str word: a word or token.
:param str word: 新词
"""
self.word_count[word] += 1

@check_build_status
def add_word(self, word):
"""Add a single word into the vocabulary.

:param str word: a word or token.
"""
增加一个新词在词典中的出现频率

:param str word: 新词
"""
self.add(word)

@check_build_status
def add_word_lst(self, word_lst):
"""Add a list of words into the vocabulary.

:param list word_lst: a list of strings
"""
依次增加序列中词在词典中的出现频率

:param list(str) word_lst: 词的序列
"""
self.update(word_lst)

def build_vocab(self):
"""Build a mapping from word to index, and filter the word using ``max_size`` and ``min_freq``.
"""
根据已经出现的词和出现频率构建词典. 注意: 重复构建可能会改变词典的大小,
但已经记录在词典中的词, 不会改变对应的 `int`

"""
self.word2idx = {}
@@ -117,7 +126,8 @@ class Vocabulary(object):
self.rebuild = False

def build_reverse_vocab(self):
"""Build "index to word" dict based on "word to index" dict.
"""
基于 "word to index" dict, 构建 "index to word" dict.

"""
self.idx2word = {i: w for w, i in self.word2idx.items()}
@@ -128,7 +138,8 @@ class Vocabulary(object):

@check_build_vocab
def __contains__(self, item):
"""Check if a word in vocabulary.
"""
检查词是否被记录

:param item: the word
:return: True or False
@@ -136,11 +147,24 @@ class Vocabulary(object):
return item in self.word2idx

def has_word(self, w):
"""
检查词是否被记录

Example::

has_abc = vocab.has_word('abc')
# equals to
has_abc = 'abc' in vocab

:param item: the word
:return: ``True`` or ``False``
"""
return self.__contains__(w)

@check_build_vocab
def __getitem__(self, w):
"""To support usage like::
"""
To support usage like::

vocab[w]
"""
@@ -154,14 +178,19 @@ class Vocabulary(object):
@check_build_vocab
def index_dataset(self, *datasets, field_name, new_field_name=None):
"""
example:
# remember to use `field_name`
vocab.index_dataset(tr_data, dev_data, te_data, field_name='words')
将DataSet中对应field的词转为数字.
Example::

:param datasets: fastNLP Dataset type. you can pass multiple datasets
:param field_name: str, what field to index. Only support 0,1,2 dimension.
:param new_field_name: str. What the indexed field should be named, default is to overwrite field_name
:return:
# remember to use `field_name`
vocab.index_dataset(train_data, dev_data, test_data, field_name='words')

:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个
:param str field_name: 需要转index的field, 若有多个 DataSet, 每个DataSet都必须有此 field.
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))``
:param str new_field_name: 保存结果的field_name. 若为 ``None`` , 将覆盖原field.
Default: ``None``
:return self:
"""
def index_instance(ins):
"""
@@ -194,11 +223,18 @@ class Vocabulary(object):

def from_dataset(self, *datasets, field_name):
"""
Construct vocab from dataset.
使用dataset的对应field中词构建词典

Example::

# remember to use `field_name`
vocab.from_dataset(train_data1, train_data2, field_name='words')

:param datasets: DataSet.
:param field_name: str, what field is used to construct dataset.
:return:
:param DataSet datasets: 需要转index的 DataSet, 支持一个或多个.
:param str field_name: 构建词典所使用的 field.
若有多个 DataSet, 每个DataSet都必须有此 field.
目前仅支持 ``str`` , ``list(str)`` , ``list(list(str))``
:return self:
"""
def construct_vocab(ins):
field = ins[field_name]
@@ -223,15 +259,27 @@ class Vocabulary(object):
return self

def to_index(self, w):
""" Turn a word to an index. If w is not in Vocabulary, return the unknown label.
"""
将词转为数字. 若词不在词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出
``ValueError``

Example::

index = vocab.to_index('abc')
# equals to
index = vocab['abc']

:param str w: a word
:return int index: the number
"""
return self.__getitem__(w)

@property
@check_build_vocab
def unknown_idx(self):
"""
unknown 对应的数字.
"""
if self.unknown is None:
return None
return self.word2idx[self.unknown]
@@ -239,16 +287,20 @@ class Vocabulary(object):
@property
@check_build_vocab
def padding_idx(self):
"""
padding 对应的数字
"""
if self.padding is None:
return None
return self.word2idx[self.padding]

@check_build_vocab
def to_word(self, idx):
"""given a word's index, return the word itself
"""
给定一个数字, 将其转为对应的词.

:param int idx: the index
:return str word: the indexed word
:return str word: the word
"""
return self.idx2word[idx]
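Putting the documented methods together, a minimal sketch of the Vocabulary workflow; the toy DataSet and the ``'words'`` field name are illustrative::

    from fastNLP.core.dataset import DataSet
    from fastNLP.core.vocabulary import Vocabulary

    train_data = DataSet({'words': [['this', 'is', 'a', 'word', 'list'],
                                    ['another', 'list']]})

    vocab = Vocabulary(max_size=None, min_freq=1, padding='<pad>', unknown='<unk>')
    vocab.from_dataset(train_data, field_name='words')    # count words from the DataSet
    vocab.index_dataset(train_data, field_name='words')   # str fields -> int fields in place
    idx = vocab.to_index('word')                          # same as vocab['word']
    word = vocab.to_word(idx)                             # int -> str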



fastNLP/io/dataset_loader.py (+80, -40)

@@ -4,7 +4,7 @@ from nltk.tree import Tree

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.io.file_reader import read_csv, read_json, read_conll
from fastNLP.io.file_reader import _read_csv, _read_json, _read_conll


def _download_from_url(url, path):
@@ -55,12 +55,12 @@ def _uncompress(src, dst):


class DataSetLoader:
"""Interface for all DataSetLoaders.
"""所有`DataSetLoader`的接口

"""

def load(self, path):
"""Load data from a given file.
"""从指定 ``path`` 的文件中读取数据,返回DataSet

:param str path: file path
:return: a DataSet object
@@ -68,7 +68,7 @@ class DataSetLoader:
raise NotImplementedError

def convert(self, data):
"""Optional operation to build a DataSet.
"""用Python数据对象创建DataSet

:param data: inner data structure (user-defined) to represent the data.
:return: a DataSet object
@@ -77,7 +77,7 @@ class DataSetLoader:


class PeopleDailyCorpusLoader(DataSetLoader):
"""人民日报数据集
"""读取人民日报数据集
"""
def __init__(self):
super(PeopleDailyCorpusLoader, self).__init__()
@@ -154,8 +154,35 @@ class PeopleDailyCorpusLoader(DataSetLoader):
return data_set


class ConllLoader:
class ConllLoader(DataSetLoader):
"""
读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html

列号从0开始, 每列对应内容为::

Column Type
0 Document ID
1 Part number
2 Word number
3 Word itself
4 Part-of-Speech
5 Parse bit
6 Predicate lemma
7 Predicate Frameset ID
8 Word sense
9 Speaker/Author
10 Named Entities
11:N Predicate Arguments
N Coreference

:param headers: 每一列数据的名称,需为List or Tuple of str。``headers`` 与 ``indexs`` 一一对应
:param indexs: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None``
:param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True``
"""
def __init__(self, headers, indexs=None, dropna=True):
super(ConllLoader, self).__init__()
if not isinstance(headers, (list, tuple)):
raise TypeError('invalid headers: {}, should be list of strings'.format(headers))
self.headers = headers
self.dropna = dropna
if indexs is None:
@@ -167,24 +194,17 @@ class ConllLoader:

def load(self, path):
ds = DataSet()
for idx, data in read_conll(path, indexes=self.indexs, dropna=self.dropna):
ins = {h:data[idx] for h, idx in zip(self.headers, self.indexs)}
for idx, data in _read_conll(path, indexes=self.indexs, dropna=self.dropna):
ins = {h:data[i] for i, h in enumerate(self.headers)}
ds.append(Instance(**ins))
return ds

def get_one(self, sample):
sample = list(map(list, zip(*sample)))
for field in sample:
if len(field) <= 0:
return None
return sample


class Conll2003Loader(ConllLoader):
"""Loader for conll2003 dataset
"""读取Conll2003数据
More information about the given dataset cound be found on
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
关于数据集的更多信息,参考:
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
"""
def __init__(self):
headers = [
@@ -193,9 +213,10 @@ class Conll2003Loader(ConllLoader):
super(Conll2003Loader, self).__init__(headers=headers)


def cut_long_sentence(sent, max_sample_length=200):
def _cut_long_sentence(sent, max_sample_length=200):
"""
将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。所以截取的句子可能长于或者短于max_sample_length
将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。
所以截取的句子可能长于或者短于max_sample_length

:param sent: str.
:param max_sample_length: int.
@@ -223,8 +244,15 @@ def cut_long_sentence(sent, max_sample_length=200):


class SSTLoader(DataSetLoader):
"""load SST data in PTB tree format
data source: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip
"""读取SST数据集, DataSet包含fields::

words: list(str) 需要分类的文本
target: str 文本的标签

数据来源: https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip

:param subtree: 是否将数据展开为子树,扩充数据量. Default: ``False``
:param fine_grained: 是否使用SST-5标准,若 ``False`` , 使用SST-2。Default: ``False``
"""
def __init__(self, subtree=False, fine_grained=False):
self.subtree = subtree
@@ -247,14 +275,14 @@ class SSTLoader(DataSetLoader):
datas = []
for l in f:
datas.extend([(s, self.tag_v[t])
for s, t in self.get_one(l, self.subtree)])
for s, t in self._get_one(l, self.subtree)])
ds = DataSet()
for words, tag in datas:
ds.append(Instance(words=words, raw_tag=tag))
ds.append(Instance(words=words, target=tag))
return ds

@staticmethod
def get_one(data, subtree):
def _get_one(data, subtree):
tree = Tree.fromstring(data)
if subtree:
return [(t.leaves(), t.label()) for t in tree.subtrees()]
@@ -262,11 +290,17 @@ class SSTLoader(DataSetLoader):


class JsonLoader(DataSetLoader):
"""Load json-format data,
every line contains a json obj, like a dict
fields is the dict key that need to be load
"""
def __init__(self, dropna=False, fields=None):
读取json格式数据.数据必须按行存储,每行是一个包含各类属性的json对象

:param dict fields: 需要读入的json属性名称, 和读入后在DataSet中存储的field_name
``fields`` 的`key`必须是json对象的属性名. ``fields`` 的`value`为读入后在DataSet存储的`field_name`,
`value`也可为 ``None`` , 这时读入后的`field_name`与json对象对应属性同名
``fields`` 可为 ``None`` , 这时,json对象所有属性都保存在DataSet中. Default: ``None``
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``False``
"""
def __init__(self, fields=None, dropna=False):
super(JsonLoader, self).__init__()
self.dropna = dropna
self.fields = None
@@ -279,7 +313,7 @@ class JsonLoader(DataSetLoader):

def load(self, path):
ds = DataSet()
for idx, d in read_json(path, fields=self.fields_list, dropna=self.dropna):
for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
ins = {self.fields[k]:v for k,v in d.items()}
ds.append(Instance(**ins))
return ds
@@ -287,7 +321,13 @@ class JsonLoader(DataSetLoader):

class SNLILoader(JsonLoader):
"""
data source: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
读取SNLI数据集,读取的DataSet包含fields::

words1: list(str),第一句文本, premise
words2: list(str), 第二句文本, hypothesis
target: str, 真实标签

数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip
"""
def __init__(self):
fields = {
@@ -309,14 +349,14 @@ class SNLILoader(JsonLoader):


class CSVLoader(DataSetLoader):
"""Load data from a CSV file and return a DataSet object.

:param str csv_path: path to the CSV file
:param List[str] or Tuple[str] headers: headers of the CSV file
:param str sep: delimiter in CSV file. Default: ","
:param bool dropna: If True, drop rows that have less entries than headers.
:return dataset: the read data set
"""
读取CSV格式的数据集。返回 ``DataSet``

:param List[str] headers: CSV文件的文件头.定义每一列的属性名称,即返回的DataSet中`field`的名称
若为 ``None`` ,则将读入文件的第一行视作 ``headers`` . Default: ``None``
:param str sep: CSV文件中列与列之间的分隔符. Default: ","
:param bool dropna: 是否忽略非法数据,若 ``True`` 则忽略,若 ``False`` ,在遇到非法数据时,抛出 ``ValueError`` .
Default: ``True``
"""
def __init__(self, headers=None, sep=",", dropna=True):
self.headers = headers
@@ -325,8 +365,8 @@ class CSVLoader(DataSetLoader):

def load(self, path):
ds = DataSet()
for idx, data in read_csv(path, headers=self.headers,
sep=self.sep, dropna=self.dropna):
for idx, data in _read_csv(path, headers=self.headers,
sep=self.sep, dropna=self.dropna):
ds.append(Instance(**data))
return ds
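A hedged sketch of the loader interfaces documented above; the file paths, headers, and column indices are placeholders, not part of this commit::

    from fastNLP.io.dataset_loader import ConllLoader, CSVLoader, SSTLoader

    # ConllLoader: keep only the word (column 3) and POS (column 4) columns
    conll_ds = ConllLoader(headers=['words', 'pos'], indexs=[3, 4]).load('train.conll')

    # CSVLoader: the first line of the file is used as headers when headers=None
    csv_ds = CSVLoader(headers=None, sep=',', dropna=True).load('train.csv')

    # SSTLoader: produces a DataSet with 'words' and 'target' fields
    sst_ds = SSTLoader(subtree=False, fine_grained=False).load('trees/train.txt')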



fastNLP/io/file_reader.py (+12, -9)

@@ -1,15 +1,16 @@
import json


def read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
"""
Construct a generator to read csv items
Construct a generator to read csv items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param headers: file's headers, if None, make file's first line as headers. default: None
:param sep: separator for each column. default: ','
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, csv item)
"""
with open(path, 'r', encoding=encoding) as f:
@@ -35,14 +36,15 @@ def read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True):
yield line_idx, _dict


def read_json(path, encoding='utf-8', fields=None, dropna=True):
def _read_json(path, encoding='utf-8', fields=None, dropna=True):
"""
Construct a generator to read json items
Construct a generator to read json items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param fields: json object's fields that needed, if None, all fields are needed. default: None
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, json item)
"""
if fields:
@@ -65,14 +67,15 @@ def read_json(path, encoding='utf-8', fields=None, dropna=True):
yield line_idx, _res


def read_conll(path, encoding='utf-8', indexes=None, dropna=True):
def _read_conll(path, encoding='utf-8', indexes=None, dropna=True):
"""
Construct a generator to read conll items
Construct a generator to read conll items.

:param path: file path
:param encoding: file's encoding, default: utf-8
:param indexes: conll object's column indexes that needed, if None, all columns are needed. default: None
:param dropna: whether to ignore and drop invalid data,
if False, raise ValueError when reading invalid data. default: True
:if False, raise ValueError when reading invalid data. default: True
:return: generator, every time yield (line number, conll item)
"""
def parse_conll(sample):
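These readers are now private helpers consumed by the loaders; a sketch of how their generators are driven directly (the path is a placeholder)::

    from fastNLP.io.file_reader import _read_csv

    for line_idx, item in _read_csv('data.csv', headers=None, sep=',', dropna=True):
        # item is a dict mapping each header to the value in that row
        print(line_idx, item)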


fastNLP/models/biaffine_parser.py (+135, -53)

@@ -16,7 +16,7 @@ from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.utils import seq_mask


def mst(scores):
def _mst(scores):
"""
with some modification to support parser output for MST decoding
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692
@@ -120,12 +120,22 @@ def _find_cycle(vertices, edges):


class GraphParser(BaseModel):
"""Graph based Parser helper class, support greedy decoding and MST(Maximum Spanning Tree) decoding
"""
基于图的parser base class, 支持贪婪解码和最大生成树解码
"""
def __init__(self):
super(GraphParser, self).__init__()

def _greedy_decoder(self, arc_matrix, mask=None):
@staticmethod
def greedy_decoder(arc_matrix, mask=None):
"""
贪心解码方式, 输入图, 输出贪心解码的parsing结果, 不保证合法的构成树

:param arc_matrix: [batch, seq_len, seq_len] 输入图矩阵
:param mask: [batch, seq_len] 输入图的padding mask, 有内容的部分为 1, 否则为 0.
若为 ``None`` 时, 默认为全1向量. Default: ``None``
:return heads: [batch, seq_len] 每个元素在树中对应的head(parent)预测结果
"""
_, seq_len, _ = arc_matrix.shape
matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf))
flip_mask = (mask == 0).byte()
@@ -135,22 +145,34 @@ class GraphParser(BaseModel):
heads *= mask.long()
return heads

def _mst_decoder(self, arc_matrix, mask=None):
@staticmethod
def mst_decoder(arc_matrix, mask=None):
"""
用最大生成树算法, 计算parsing结果, 保证输出合法的树结构

:param arc_matrix: [batch, seq_len, seq_len] 输入图矩阵
:param mask: [batch, seq_len] 输入图的padding mask, 有内容的部分为 1, 否则为 0.
若为 ``None`` 时, 默认为全1向量. Default: ``None``
:return heads: [batch, seq_len] 每个元素在树中对应的head(parent)预测结果
"""
batch_size, seq_len, _ = arc_matrix.shape
matrix = arc_matrix.clone()
ans = matrix.new_zeros(batch_size, seq_len).long()
lens = (mask.long()).sum(1) if mask is not None else torch.zeros(batch_size) + seq_len
batch_idx = torch.arange(batch_size, dtype=torch.long, device=lens.device)
for i, graph in enumerate(matrix):
len_i = lens[i]
ans[i, :len_i] = torch.as_tensor(mst(graph.detach()[:len_i, :len_i].cpu().numpy()), device=ans.device)
ans[i, :len_i] = torch.as_tensor(_mst(graph.detach()[:len_i, :len_i].cpu().numpy()), device=ans.device)
if mask is not None:
ans *= mask.long()
return ans


class ArcBiaffine(nn.Module):
"""helper module for Biaffine Dependency Parser predicting arc
"""
Biaffine Dependency Parser 的子模块, 用于构建预测边的图

:param hidden_size: 输入的特征维度
:param bias: 是否使用bias. Default: ``True``
"""
def __init__(self, hidden_size, bias=True):
super(ArcBiaffine, self).__init__()
@@ -164,10 +186,10 @@ class ArcBiaffine(nn.Module):

def forward(self, head, dep):
"""
:param head arc-head tensor = [batch, length, emb_dim]
:param dep arc-dependent tensor = [batch, length, emb_dim]

:return output tensor = [bacth, length, length]
:param head: arc-head tensor [batch, length, hidden]
:param dep: arc-dependent tensor [batch, length, hidden]
:return output: tensor [batch, length, length]
"""
output = dep.matmul(self.U)
output = output.bmm(head.transpose(-1, -2))
@@ -177,7 +199,13 @@ class ArcBiaffine(nn.Module):


class LabelBilinear(nn.Module):
"""helper module for Biaffine Dependency Parser predicting label
"""
Biaffine Dependency Parser 的子模块, 用于构建预测边类别的图

:param in1_features: 输入的特征1维度
:param in2_features: 输入的特征2维度
:param num_label: 边类别的个数
:param bias: 是否使用bias. Default: ``True``
"""
def __init__(self, in1_features, in2_features, num_label, bias=True):
super(LabelBilinear, self).__init__()
@@ -185,14 +213,34 @@ class LabelBilinear(nn.Module):
self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False)

def forward(self, x1, x2):
"""

:param x1: [batch, seq_len, hidden] 输入特征1, 即label-head
:param x2: [batch, seq_len, hidden] 输入特征2, 即label-dep
:return output: [batch, seq_len, num_cls] 每个元素对应类别的概率图
"""
output = self.bilinear(x1, x2)
output += self.lin(torch.cat([x1, x2], dim=2))
return output

class BiaffineParser(GraphParser):
"""Biaffine Dependency Parser implemantation.
refer to ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
"""Biaffine Dependency Parser 实现.
论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
<https://arxiv.org/abs/1611.01734>`_ .

:param word_vocab_size: 单词词典大小
:param word_emb_dim: 单词词嵌入向量的维度
:param pos_vocab_size: part-of-speech 词典大小
:param pos_emb_dim: part-of-speech 向量维度
:param num_label: 边的类别个数
:param rnn_layers: rnn encoder的层数
:param rnn_hidden_size: rnn encoder 的隐状态维度
:param arc_mlp_size: 边预测的MLP维度
:param label_mlp_size: 类别预测的MLP维度
:param dropout: dropout概率.
:param encoder: encoder类别, 可选 ('lstm', 'var-lstm', 'transformer'). Default: lstm
:param use_greedy_infer: 是否在inference时使用贪心算法.
若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False``
"""
def __init__(self,
word_vocab_size,
@@ -207,7 +255,6 @@ class BiaffineParser(GraphParser):
dropout=0.3,
encoder='lstm',
use_greedy_infer=False):

super(BiaffineParser, self).__init__()
rnn_out_size = 2 * rnn_hidden_size
word_hid_dim = pos_hid_dim = rnn_hidden_size
@@ -275,27 +322,31 @@ class BiaffineParser(GraphParser):
for p in m.parameters():
nn.init.normal_(p, 0, 0.1)

def forward(self, word_seq, pos_seq, seq_lens, gold_heads=None):
"""
:param word_seq: [batch_size, seq_len] sequence of word's indices
:param pos_seq: [batch_size, seq_len] sequence of word's indices
:param seq_lens: [batch_size, seq_len] sequence of length masks
:param gold_heads: [batch_size, seq_len] sequence of golden heads
:return dict: parsing results
arc_pred: [batch_size, seq_len, seq_len]
label_pred: [batch_size, seq_len, seq_len]
mask: [batch_size, seq_len]
head_pred: [batch_size, seq_len] if gold_heads is not provided, predicting the heads
def forward(self, words1, words2, seq_len, gold_heads=None):
"""模型forward阶段

:param words1: [batch_size, seq_len] 输入word序列
:param words2: [batch_size, seq_len] 输入pos序列
:param seq_len: [batch_size, seq_len] 输入序列长度
:param gold_heads: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效,
用于训练label分类器. 若为 ``None`` , 使用预测的heads输入到label分类器
Default: ``None``
:return dict: parsing结果::

arc_pred: [batch_size, seq_len, seq_len] 边预测logits
label_pred: [batch_size, seq_len, num_label] label预测logits
mask: [batch_size, seq_len] 预测结果的mask
head_pred: [batch_size, seq_len] heads的预测结果, 在 ``gold_heads=None`` 时预测
"""
# prepare embeddings
batch_size, seq_len = word_seq.shape
batch_size, length = words1.shape
# print('forward {} {}'.format(batch_size, seq_len))

# get sequence mask
mask = seq_mask(seq_lens, seq_len).long()
mask = seq_mask(seq_len, length).long()

word = self.word_embedding(word_seq) # [N,L] -> [N,L,C_0]
pos = self.pos_embedding(pos_seq) # [N,L] -> [N,L,C_1]
word = self.word_embedding(words1) # [N,L] -> [N,L,C_0]
pos = self.pos_embedding(words2) # [N,L] -> [N,L,C_1]

word, pos = self.word_fc(word), self.pos_fc(pos)
word, pos = self.word_norm(word), self.pos_norm(pos)
@@ -303,7 +354,7 @@ class BiaffineParser(GraphParser):

# encoder, extract features
if self.encoder_name.endswith('lstm'):
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.encoder(x) # -> [N,L,C]
@@ -329,20 +380,20 @@ class BiaffineParser(GraphParser):
if gold_heads is None or not self.training:
# use greedy decoding in training
if self.training or self.use_greedy_infer:
heads = self._greedy_decoder(arc_pred, mask)
heads = self.greedy_decoder(arc_pred, mask)
else:
heads = self._mst_decoder(arc_pred, mask)
heads = self.mst_decoder(arc_pred, mask)
head_pred = heads
else:
assert self.training # must be training mode
if gold_heads is None:
heads = self._greedy_decoder(arc_pred, mask)
heads = self.greedy_decoder(arc_pred, mask)
head_pred = heads
else:
head_pred = None
heads = gold_heads

batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=word_seq.device).unsqueeze(1)
batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1)
label_head = label_head[batch_range, heads].contiguous()
label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label]
res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'mask': mask}
@@ -355,11 +406,11 @@ class BiaffineParser(GraphParser):
"""
Compute loss.

:param arc_pred: [batch_size, seq_len, seq_len]
:param label_pred: [batch_size, seq_len, n_tags]
:param arc_true: [batch_size, seq_len]
:param label_true: [batch_size, seq_len]
:param mask: [batch_size, seq_len]
:param arc_pred: [batch_size, seq_len, seq_len] 边预测logits
:param label_pred: [batch_size, seq_len, num_label] label预测logits
:param arc_true: [batch_size, seq_len] 真实边的标注
:param label_true: [batch_size, seq_len] 真实类别的标注
:param mask: [batch_size, seq_len] 预测结果的mask
:return: loss value
"""

@@ -381,16 +432,23 @@ class BiaffineParser(GraphParser):
label_nll = -label_loss.mean()
return arc_nll + label_nll

def predict(self, word_seq, pos_seq, seq_lens):
"""

:param word_seq:
:param pos_seq:
:param seq_lens:
:return: arc_pred: [B, L]
label_pred: [B, L]
def predict(self, words1, words2, seq_len):
"""模型预测API

:param words1: [batch_size, seq_len] 输入word序列
:param words2: [batch_size, seq_len] 输入pos序列
:param seq_len: [batch_size, seq_len] 输入序列长度
:return dict: parsing结果::

    arc_pred: [batch_size, seq_len] heads的预测结果
    label_pred: [batch_size, seq_len] label的预测结果
"""
res = self(word_seq, pos_seq, seq_lens)
res = self(words1, words2, seq_len)
output = {}
output['arc_pred'] = res.pop('head_pred')
_, label_pred = res.pop('label_pred').max(2)
@@ -399,6 +457,16 @@ class BiaffineParser(GraphParser):


class ParserLoss(LossFunc):
"""
计算parser的loss

:param arc_pred: [batch_size, seq_len, seq_len] 边预测logits
:param label_pred: [batch_size, seq_len, num_label] label预测logits
:param arc_true: [batch_size, seq_len] 真实边的标注
:param label_true: [batch_size, seq_len] 真实类别的标注
:param mask: [batch_size, seq_len] 预测结果的mask
:return loss: scalar
"""
def __init__(self, arc_pred=None, label_pred=None, arc_true=None, label_true=None):
super(ParserLoss, self).__init__(BiaffineParser.loss,
arc_pred=arc_pred,
@@ -408,12 +476,26 @@ class ParserLoss(LossFunc):


class ParserMetric(MetricBase):
"""
评估parser的性能

:param arc_pred: 边预测logits
:param label_pred: label预测logits
:param arc_true: 真实边的标注
:param label_true: 真实类别的标注
:param seq_len: 序列长度
:return dict: 评估结果::

UAS: 不带label时, 边预测的准确率
LAS: 同时预测边和label的准确率
"""
def __init__(self, arc_pred=None, label_pred=None,
arc_true=None, label_true=None, seq_lens=None):
arc_true=None, label_true=None, seq_len=None):

super().__init__()
self._init_param_map(arc_pred=arc_pred, label_pred=label_pred,
arc_true=arc_true, label_true=label_true,
seq_lens=seq_lens)
seq_len=seq_len)
self.num_arc = 0
self.num_label = 0
self.num_sample = 0
@@ -424,13 +506,13 @@ class ParserMetric(MetricBase):
self.num_sample = self.num_label = self.num_arc = 0
return res

def evaluate(self, arc_pred, label_pred, arc_true, label_true, seq_lens=None):
def evaluate(self, arc_pred, label_pred, arc_true, label_true, seq_len=None):
"""Evaluate the performance of prediction.
"""
if seq_lens is None:
if seq_len is None:
seq_mask = arc_pred.new_ones(arc_pred.size(), dtype=torch.long)
else:
seq_mask = seq_lens_to_masks(seq_lens.long(), float=False).long()
seq_mask = seq_lens_to_masks(seq_len.long(), float=False).long()
# mask out <root> tag
seq_mask[:,0] = 0
head_pred_correct = (arc_pred == arc_true).long() * seq_mask
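Since the decoders are now static methods, they can be exercised without building a full parser; a sketch with random scores, following the shapes in the docstrings above::

    import torch
    from fastNLP.models.biaffine_parser import GraphParser

    arc_scores = torch.randn(2, 5, 5)      # [batch, seq_len, seq_len] arc score matrix
    mask = torch.ones(2, 5).long()         # 1 = real token, 0 = padding
    heads_greedy = GraphParser.greedy_decoder(arc_scores, mask)  # [batch, seq_len], not necessarily a tree
    heads_mst = GraphParser.mst_decoder(arc_scores, mask)        # [batch, seq_len], guaranteed valid tree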


fastNLP/models/star_transformer.py (+127, -28)

@@ -7,6 +7,21 @@ import torch.nn.functional as F


class StarTransEnc(nn.Module):
"""
带word embedding的Star-Transformer Encoder

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param hidden_size: 模型中特征维度.
:param num_layers: 模型层数.
:param num_head: 模型中multi-head的head个数.
:param head_dim: 模型中multi-head中每个head特征维度.
:param max_len: 模型能接受的最大输入长度.
:param emb_dropout: 词嵌入的dropout概率.
:param dropout: 模型除词嵌入外的dropout概率.
"""
def __init__(self, vocab_size, emb_dim,
hidden_size,
num_layers,
@@ -27,15 +42,23 @@ class StarTransEnc(nn.Module):
max_len=max_len)

def forward(self, x, mask):
"""
:param FloatTensor data: [batch, length, hidden] 输入的序列
:param ByteTensor mask: [batch, length] 输入序列的padding mask, 在没有内容(padding 部分) 为 0,
否则为 1
:return: [batch, length, hidden] 编码后的输出序列

[batch, hidden] 全局 relay 节点, 详见论文
"""
x = self.embedding(x)
x = self.emb_fc(self.emb_drop(x))
nodes, relay = self.encoder(x, mask)
return nodes, relay


class Cls(nn.Module):
class _Cls(nn.Module):
def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
super(Cls, self).__init__()
super(_Cls, self).__init__()
self.fc = nn.Sequential(
nn.Linear(in_dim, hid_dim),
nn.LeakyReLU(),
@@ -48,9 +71,9 @@ class Cls(nn.Module):
return h


class NLICls(nn.Module):
class _NLICls(nn.Module):
def __init__(self, in_dim, num_cls, hid_dim, dropout=0.1):
super(NLICls, self).__init__()
super(_NLICls, self).__init__()
self.fc = nn.Sequential(
nn.Dropout(dropout),
nn.Linear(in_dim*4, hid_dim), #4
@@ -65,7 +88,19 @@ class NLICls(nn.Module):
return h

class STSeqLabel(nn.Module):
"""star-transformer model for sequence labeling
"""用于序列标注的Star-Transformer模型

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""
def __init__(self, vocab_size, emb_dim, num_cls,
hidden_size=300,
@@ -86,23 +121,47 @@ class STSeqLabel(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = Cls(hidden_size, num_cls, cls_hidden_size)
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

def forward(self, words, seq_len):
"""

def forward(self, word_seq, seq_lens):
mask = seq_lens_to_masks(seq_lens)
nodes, _ = self.enc(word_seq, mask)
:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, num_cls, seq_len] 输出序列中每个元素的分类的概率
"""
mask = seq_lens_to_masks(seq_len)
nodes, _ = self.enc(words, mask)
output = self.cls(nodes)
output = output.transpose(1,2) # make hidden to be dim 1
return {'output': output} # [bsz, n_cls, seq_len]

def predict(self, word_seq, seq_lens):
y = self.forward(word_seq, seq_lens)
def predict(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, seq_len] 输出序列中每个元素的分类
"""
y = self.forward(words, seq_len)
_, pred = y['output'].max(1)
return {'output': pred, 'seq_lens': seq_lens}
return {'output': pred}


class STSeqCls(nn.Module):
"""star-transformer model for sequence classification
"""用于分类任务的Star-Transformer

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""

def __init__(self, vocab_size, emb_dim, num_cls,
@@ -124,23 +183,47 @@ class STSeqCls(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = Cls(hidden_size, num_cls, cls_hidden_size)
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

def forward(self, word_seq, seq_lens):
mask = seq_lens_to_masks(seq_lens)
nodes, relay = self.enc(word_seq, mask)
def forward(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch, num_cls] 输出序列的分类的概率
"""
mask = seq_lens_to_masks(seq_len)
nodes, relay = self.enc(words, mask)
y = 0.5 * (relay + nodes.max(1)[0])
output = self.cls(y) # [bsz, n_cls]
return {'output': output}

def predict(self, word_seq, seq_lens):
y = self.forward(word_seq, seq_lens)
def predict(self, words, seq_len):
"""

:param words: [batch, seq_len] 输入序列
:param seq_len: [batch,] 输入序列的长度
:return output: [batch,] 输出序列的分类
"""
y = self.forward(words, seq_len)
_, pred = y['output'].max(1)
return {'output': pred}


class STNLICls(nn.Module):
"""star-transformer model for NLI
"""用于自然语言推断(NLI)的Star-Transformer

:param vocab_size: 词嵌入的词典大小
:param emb_dim: 每个词嵌入的特征维度
:param num_cls: 输出类别个数
:param hidden_size: 模型中特征维度. Default: 300
:param num_layers: 模型层数. Default: 4
:param num_head: 模型中multi-head的head个数. Default: 8
:param head_dim: 模型中multi-head中每个head特征维度. Default: 32
:param max_len: 模型能接受的最大输入长度. Default: 512
:param cls_hidden_size: 分类器隐层维度. Default: 600
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1
"""

def __init__(self, vocab_size, emb_dim, num_cls,
@@ -162,20 +245,36 @@ class STNLICls(nn.Module):
max_len=max_len,
emb_dropout=emb_dropout,
dropout=dropout)
self.cls = NLICls(hidden_size, num_cls, cls_hidden_size)
self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size)

def forward(self, words1, words2, seq_len1, seq_len2):
"""

def forward(self, word_seq1, word_seq2, seq_lens1, seq_lens2):
mask1 = seq_lens_to_masks(seq_lens1)
mask2 = seq_lens_to_masks(seq_lens2)
:param words1: [batch, seq_len] 输入序列1
:param words2: [batch, seq_len] 输入序列2
:param seq_len1: [batch,] 输入序列1的长度
:param seq_len2: [batch,] 输入序列2的长度
:return output: [batch, num_cls] 输出分类的概率
"""
mask1 = seq_lens_to_masks(seq_len1)
mask2 = seq_lens_to_masks(seq_len2)
def enc(seq, mask):
nodes, relay = self.enc(seq, mask)
return 0.5 * (relay + nodes.max(1)[0])
y1 = enc(word_seq1, mask1)
y2 = enc(word_seq2, mask2)
y1 = enc(words1, mask1)
y2 = enc(words2, mask2)
output = self.cls(y1, y2) # [bsz, n_cls]
return {'output': output}

def predict(self, word_seq1, word_seq2, seq_lens1, seq_lens2):
y = self.forward(word_seq1, word_seq2, seq_lens1, seq_lens2)
def predict(self, words1, words2, seq_len1, seq_len2):
"""

:param words1: [batch, seq_len] 输入序列1
:param words2: [batch, seq_len] 输入序列2
:param seq_len1: [batch,] 输入序列1的长度
:param seq_len2: [batch,] 输入序列2的长度
:return output: [batch,] 输出分类
"""
y = self.forward(words1, words2, seq_len1, seq_len2)
_, pred = y['output'].max(1)
return {'output': pred}
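A minimal sketch of the classification model with the renamed ``words``/``seq_len`` arguments; the vocabulary size, embedding size, and batch shapes are illustrative::

    import torch
    from fastNLP.models.star_transformer import STSeqCls

    model = STSeqCls(vocab_size=1000, emb_dim=64, num_cls=5)
    words = torch.randint(0, 1000, (4, 20))           # [batch, seq_len]
    seq_len = torch.full((4,), 20, dtype=torch.long)  # [batch,]
    scores = model(words, seq_len)['output']          # [batch, num_cls] unnormalized class scores
    pred = model.predict(words, seq_len)['output']    # [batch,] predicted class per sentence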

fastNLP/modules/encoder/lstm.py (+23, -13)

@@ -6,17 +6,17 @@ from fastNLP.modules.utils import initial_parameter


class LSTM(nn.Module):
"""Long Short Term Memory
"""LSTM 模块, 轻量封装的Pytorch LSTM

:param int input_size:
:param int hidden_size:
:param int num_layers:
:param float dropout:
:param bool batch_first:
:param bool bidirectional:
:param bool bias:
:param str initial_method:
:param bool get_hidden:
:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param dropout: 层间dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param get_hidden: 是否返回隐状态 `h` . Default: ``False``
"""
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None, get_hidden=False):
@@ -27,14 +27,24 @@ class LSTM(nn.Module):
self.get_hidden = get_hidden
initial_parameter(self, initial_method)

def forward(self, x, seq_lens=None, h0=None, c0=None):
def forward(self, x, seq_len=None, h0=None, c0=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param seq_len: [batch, ] 序列长度, 若为 ``None``, 所有输入看做一样长. Default: ``None``
:param h0: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:param c0: [batch, hidden_size] 初始Cell状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht) 或 output: 若 ``get_hidden=True`` [batch, seq_len, hidden_size*num_direction] 输出序列
:和 [batch, hidden_size*num_direction] 最后时刻隐状态.
:若 ``get_hidden=False`` 仅返回输出序列.
"""
if h0 is not None and c0 is not None:
hx = (h0, c0)
else:
hx = None
if seq_lens is not None and not isinstance(x, rnn.PackedSequence):
if seq_len is not None and not isinstance(x, rnn.PackedSequence):
print('padding')
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
if self.batch_first:
x = x[sort_idx]
else:
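A hedged usage sketch of the wrapped LSTM with the renamed ``seq_len`` argument; sizes are illustrative::

    import torch
    from fastNLP.modules.encoder.lstm import LSTM

    lstm = LSTM(input_size=50, hidden_size=100, num_layers=1, batch_first=True)
    x = torch.randn(4, 20, 50)                        # [batch, seq_len, input_size]
    seq_len = torch.full((4,), 20, dtype=torch.long)  # all sequences the same length here
    output = lstm(x, seq_len=seq_len)                 # [batch, seq_len, hidden_size], since get_hidden=False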


fastNLP/modules/encoder/star_transformer.py (+26, -21)

@@ -5,16 +5,19 @@ import numpy as NP


class StarTransformer(nn.Module):
"""Star-Transformer Encoder part。
"""
Star-Transformer 的encoder部分。 输入3d的文本输入, 返回相同长度的文本编码

paper: https://arxiv.org/abs/1902.09113
:param hidden_size: int, 输入维度的大小。同时也是输出维度的大小。
:param num_layers: int, star-transformer的层数
:param num_head: int,head的数量。
:param head_dim: int, 每个head的维度大小。
:param dropout: float dropout 概率
:param max_len: int or None, 如果为int,输入序列的最大长度,
模型会为属于序列加上position embedding。
若为None,忽略加上position embedding的步骤

:param int hidden_size: 输入维度的大小。同时也是输出维度的大小。
:param int num_layers: star-transformer的层数
:param int num_head: head的数量。
:param int head_dim: 每个head的维度大小。
:param float dropout: dropout 概率. Default: 0.1
:param int max_len: int or None, 如果为int,输入序列的最大长度,
模型会为输入序列加上position embedding。
若为`None`,忽略加上position embedding的步骤. Default: `None`
"""
def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None):
super(StarTransformer, self).__init__()
@@ -22,11 +25,11 @@ class StarTransformer(nn.Module):

self.norm = nn.ModuleList([nn.LayerNorm(hidden_size) for _ in range(self.iters)])
self.ring_att = nn.ModuleList(
[MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
[_MSA1(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
self.star_att = nn.ModuleList(
[MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])
[_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=dropout)
for _ in range(self.iters)])

if max_len is not None:
self.pos_emb = self.pos_emb = nn.Embedding(max_len, hidden_size)
@@ -35,10 +38,12 @@ class StarTransformer(nn.Module):

def forward(self, data, mask):
"""
:param FloatTensor data: [batch, length, hidden] the input sequence
:param ByteTensor mask: [batch, length] the padding mask for input, in which padding pos is 0
:return: [batch, length, hidden] the output sequence
[batch, hidden] the global relay node
:param FloatTensor data: [batch, length, hidden] 输入的序列
:param ByteTensor mask: [batch, length] 输入序列的padding mask, 在没有内容(padding 部分) 为 0,
否则为 1
:return: [batch, length, hidden] 编码后的输出序列

[batch, hidden] 全局 relay 节点, 详见论文
"""
def norm_func(f, x):
# B, H, L, 1
@@ -70,9 +75,9 @@ class StarTransformer(nn.Module):
return nodes, relay.view(B, H)


class MSA1(nn.Module):
class _MSA1(nn.Module):
def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1):
super(MSA1, self).__init__()
super(_MSA1, self).__init__()
# Multi-head Self Attention Case 1, doing self-attention for small regions
# Due to the architecture of GPU, using hadamard production and summation are faster than dot production when unfold_size is very small
self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1)
@@ -113,10 +118,10 @@ class MSA1(nn.Module):
return ret


class MSA2(nn.Module):
class _MSA2(nn.Module):
def __init__(self, nhid, nhead=10, head_dim=10, dropout=0.1):
# Multi-head Self Attention Case 2, a broadcastable query for a sequence key and value
super(MSA2, self).__init__()
super(_MSA2, self).__init__()
self.WQ = nn.Conv2d(nhid, nhead * head_dim, 1)
self.WK = nn.Conv2d(nhid, nhead * head_dim, 1)
self.WV = nn.Conv2d(nhid, nhead * head_dim, 1)
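A small sketch of the encoder itself, following the documented shapes; the hyperparameters are illustrative::

    import torch
    from fastNLP.modules.encoder.star_transformer import StarTransformer

    enc = StarTransformer(hidden_size=64, num_layers=2, num_head=8, head_dim=8,
                          dropout=0.1, max_len=128)
    data = torch.randn(4, 20, 64)      # [batch, length, hidden]
    mask = torch.ones(4, 20).byte()    # 1 = content, 0 = padding
    nodes, relay = enc(data, mask)     # [batch, length, hidden], [batch, hidden]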


fastNLP/modules/encoder/transformer.py (+9, -8)

@@ -7,13 +7,13 @@ from ..dropout import TimestepDropout
class TransformerEncoder(nn.Module):
"""transformer的encoder模块,不包含embedding层

:param num_layers: int, transformer的层数
:param model_size: int, 输入维度的大小。同时也是输出维度的大小。
:param inner_size: int, FFN层的hidden大小
:param key_size: int, 每个head的维度大小。
:param value_size: int,每个head中value的维度。
:param num_head: int,head的数量。
:param dropout: float。
:param int num_layers: transformer的层数
:param int model_size: 输入维度的大小。同时也是输出维度的大小。
:param int inner_size: FFN层的hidden大小
:param int key_size: 每个head的维度大小。
:param int value_size: 每个head中value的维度。
:param int num_head: head的数量。
:param float dropout: dropout概率. Default: 0.1
"""
class SubLayer(nn.Module):
def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1):
@@ -48,7 +48,8 @@ class TransformerEncoder(nn.Module):
def forward(self, x, seq_mask=None):
"""
:param x: [batch, seq_len, model_size] 输入序列
:param seq_mask: [batch, seq_len] 输入序列的padding mask
:param seq_mask: [batch, seq_len] 输入序列的padding mask, 若为 ``None`` , 生成全1向量.
Default: ``None``
:return: [batch, seq_len, model_size] 输出序列
"""
output = x
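For completeness, a sketch of the Transformer encoder built from the documented parameters; it assumes the constructor accepts them as keyword arguments, which this hunk does not show::

    import torch
    from fastNLP.modules.encoder.transformer import TransformerEncoder

    enc = TransformerEncoder(num_layers=2, model_size=64, inner_size=256,
                             key_size=16, value_size=16, num_head=4, dropout=0.1)
    x = torch.randn(4, 20, 64)   # [batch, seq_len, model_size]
    out = enc(x)                 # seq_mask=None -> an all-ones mask is used; output [batch, seq_len, model_size]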


fastNLP/modules/encoder/variational_rnn.py (+72, -23)

@@ -28,11 +28,11 @@ class VarRnnCellWrapper(nn.Module):
"""
:param PackedSequence input_x: [seq_len, batch_size, input_size]
:param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size]
for other RNN, h_0, [batch_size, hidden_size]
:for other RNN, h_0, [batch_size, hidden_size]
:param mask_x: [batch_size, input_size] dropout mask for input
:param mask_h: [batch_size, hidden_size] dropout mask for hidden
:return PackedSequence output: [seq_len, bacth_size, hidden_size]
hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
:hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
for other RNN, h_n, [batch_size, hidden_size]
"""
def get_hi(hi, h0, size):
@@ -84,9 +84,21 @@ class VarRnnCellWrapper(nn.Module):


class VarRNNBase(nn.Module):
"""Implementation of Variational Dropout RNN network.
refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
"""Variational Dropout RNN 实现.
论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
https://arxiv.org/abs/1512.05287`.

:param mode: rnn 模式, (lstm or not)
:param Cell: rnn cell 类型, (lstm, gru, etc)
:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
"""

def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
@@ -120,36 +132,43 @@ class VarRNNBase(nn.Module):
output_x, hidden_x = cell(input, hi, mask_x, mask_h, is_reversed=(n_direction == 1))
return output_x, hidden_x

def forward(self, input, hx=None):
def forward(self, x, hx=None):
"""

:param x: [batch, seq_len, input_size] 输入序列
:param hx: [batch, hidden_size] 初始隐状态, 若为 ``None`` , 设为全0向量. Default: ``None``
:return (output, ht): [batch, seq_len, hidden_size*num_direction] 输出序列
:和 [batch, hidden_size*num_direction] 最后时刻隐状态
"""
is_lstm = self.is_lstm
is_packed = isinstance(input, PackedSequence)
is_packed = isinstance(x, PackedSequence)
if not is_packed:
seq_len = input.size(1) if self.batch_first else input.size(0)
max_batch_size = input.size(0) if self.batch_first else input.size(1)
seq_len = x.size(1) if self.batch_first else x.size(0)
max_batch_size = x.size(0) if self.batch_first else x.size(1)
seq_lens = torch.LongTensor([seq_len for _ in range(max_batch_size)])
input, batch_sizes = pack_padded_sequence(input, seq_lens, batch_first=self.batch_first)
x, batch_sizes = pack_padded_sequence(x, seq_lens, batch_first=self.batch_first)
else:
max_batch_size = int(input.batch_sizes[0])
input, batch_sizes = input
max_batch_size = int(x.batch_sizes[0])
x, batch_sizes = x

if hx is None:
hx = input.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size, requires_grad=True)
hx = x.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size, requires_grad=True)
if is_lstm:
hx = (hx, hx.new_zeros(hx.size(), requires_grad=True))

mask_x = input.new_ones((max_batch_size, self.input_size))
mask_out = input.new_ones((max_batch_size, self.hidden_size * self.num_directions))
mask_h_ones = input.new_ones((max_batch_size, self.hidden_size))
mask_x = x.new_ones((max_batch_size, self.input_size))
mask_out = x.new_ones((max_batch_size, self.hidden_size * self.num_directions))
mask_h_ones = x.new_ones((max_batch_size, self.hidden_size))
nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True)

hidden = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
hidden = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size))
if is_lstm:
cellstate = input.new_zeros((self.num_layers*self.num_directions, max_batch_size, self.hidden_size))
cellstate = x.new_zeros((self.num_layers * self.num_directions, max_batch_size, self.hidden_size))
for layer in range(self.num_layers):
output_list = []
input_seq = PackedSequence(input, batch_sizes)
input_seq = PackedSequence(x, batch_sizes)
mask_h = nn.functional.dropout(mask_h_ones, p=self.hidden_dropout, training=self.training, inplace=False)
for direction in range(self.num_directions):
output_x, hidden_x = self._forward_one(layer, direction, input_seq, hx,
@@ -161,22 +180,32 @@ class VarRNNBase(nn.Module):
cellstate[idx] = hidden_x[1]
else:
hidden[idx] = hidden_x
input = torch.cat(output_list, dim=-1)
x = torch.cat(output_list, dim=-1)

if is_lstm:
hidden = (hidden, cellstate)

if is_packed:
output = PackedSequence(input, batch_sizes)
output = PackedSequence(x, batch_sizes)
else:
input = PackedSequence(input, batch_sizes)
output, _ = pad_packed_sequence(input, batch_first=self.batch_first)
x = PackedSequence(x, batch_sizes)
output, _ = pad_packed_sequence(x, batch_first=self.batch_first)

return output, hidden


class VarLSTM(VarRNNBase):
"""Variational Dropout LSTM.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False``
"""

def __init__(self, *args, **kwargs):
@@ -185,6 +214,16 @@ class VarLSTM(VarRNNBase):

class VarRNN(VarRNNBase):
"""Variational Dropout RNN.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
"""

def __init__(self, *args, **kwargs):
@@ -193,6 +232,16 @@ class VarRNN(VarRNNBase):

class VarGRU(VarRNNBase):
"""Variational Dropout GRU.

:param input_size: 输入 `x` 的特征维度
:param hidden_size: 隐状态 `h` 的特征维度
:param num_layers: rnn的层数. Default: 1
:param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
:param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
:(batch, seq, feature). Default: ``False``
:param input_dropout: 对输入的dropout概率. Default: 0
:param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
:param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False``
"""

def __init__(self, *args, **kwargs):
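A usage sketch of the variational-dropout LSTM with the parameters documented above; sizes and dropout values are illustrative::

    import torch
    from fastNLP.modules.encoder.variational_rnn import VarLSTM

    lstm = VarLSTM(input_size=50, hidden_size=100, num_layers=1, batch_first=True,
                   input_dropout=0.2, hidden_dropout=0.2, bidirectional=False)
    x = torch.randn(4, 20, 50)       # [batch, seq_len, input_size]
    output, (h_n, c_n) = lstm(x)     # output: [batch, seq_len, hidden_size]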


test/core/test_sampler.py (+1, -7)

@@ -4,17 +4,11 @@ import unittest
import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import convert_to_torch_tensor, SequentialSampler, RandomSampler, \
from fastNLP.core.sampler import SequentialSampler, RandomSampler, \
k_means_1d, k_means_bucketing, simple_sort_bucketing, BucketSampler


class TestSampler(unittest.TestCase):
def test_convert_to_torch_tensor(self):
data = [[1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [1, 3, 4, 5, 2]]
ans = convert_to_torch_tensor(data, False)
assert isinstance(ans, torch.Tensor)
assert tuple(ans.shape) == (3, 5)

def test_sequential_sampler(self):
sampler = SequentialSampler()
data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]


test/models/test_biaffine_parser.py (+10, -10)

@@ -44,34 +44,34 @@ data_file = """

def init_data():
ds = fastNLP.DataSet()
v = {'word_seq': fastNLP.Vocabulary(),
'pos_seq': fastNLP.Vocabulary(),
v = {'words1': fastNLP.Vocabulary(),
'words2': fastNLP.Vocabulary(),
'label_true': fastNLP.Vocabulary()}
data = []
for line in data_file.split('\n'):
line = line.split()
if len(line) == 0 and len(data) > 0:
data = list(zip(*data))
ds.append(fastNLP.Instance(word_seq=data[1],
pos_seq=data[4],
ds.append(fastNLP.Instance(words1=data[1],
words2=data[4],
arc_true=data[6],
label_true=data[7]))
data = []
elif len(line) > 0:
data.append(line)

for name in ['word_seq', 'pos_seq', 'label_true']:
for name in ['words1', 'words2', 'label_true']:
ds.apply(lambda x: ['<st>'] + list(x[name]), new_field_name=name)
ds.apply(lambda x: v[name].add_word_lst(x[name]))

for name in ['word_seq', 'pos_seq', 'label_true']:
for name in ['words1', 'words2', 'label_true']:
ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)

ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
return ds, v['word_seq'], v['pos_seq'], v['label_true']
ds.apply(lambda x: len(x['words1']), new_field_name='seq_len')
ds.set_input('words1', 'words2', 'seq_len', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_len', flag=True)
return ds, v['words1'], v['words2'], v['label_true']


class TestBiaffineParser(unittest.TestCase):


test/test_tutorials.py (+7, -1)

@@ -437,4 +437,10 @@ class TestTutorial(unittest.TestCase):
)
tester.test()

os.chdir("../..")
def setUp(self):
import os
self._init_wd = os.path.abspath(os.curdir)

def tearDown(self):
import os
os.chdir(self._init_wd)
