
Merge pull request #10 from fastnlp/master

update
tags/v0.3.0
lyhuang18 (GitHub) committed 5 years ago
commit 7d0efebb1d
50 changed files with 1800 additions and 1421 deletions
  1. +3 -3 README.md
  2. +12 -6 docs/source/fastNLP.core.rst
  3. +0 -36 docs/source/fastNLP.modules.aggregation.rst
  4. +36 -0 docs/source/fastNLP.modules.aggregator.rst
  5. +0 -5 docs/source/fastNLP.modules.interaction.rst
  6. +5 -0 docs/source/fastNLP.modules.interactor.rst
  7. +2 -2 docs/source/fastNLP.modules.rst
  8. +76 -222 fastNLP/core/dataset.py
  9. +38 -0 fastNLP/core/field.py
  10. +29 -1 fastNLP/core/instance.py
  11. +14 -0 fastNLP/core/metrics.py
  12. +3 -2 fastNLP/core/predictor.py
  13. +0 -262 fastNLP/core/preprocess.py
  14. +8 -0 fastNLP/core/tester.py
  15. +14 -2 fastNLP/core/trainer.py
  16. +77 -27 fastNLP/core/vocabulary.py
  17. +7 -8 fastNLP/fastnlp.py
  18. +3 -2 fastNLP/loader/config_loader.py
  19. +196 -9 fastNLP/loader/dataset_loader.py
  20. +61 -26 fastNLP/loader/embed_loader.py
  21. +5 -65 fastNLP/models/base_model.py
  22. +364 -0 fastNLP/models/biaffine_parser.py
  23. +1 -1 fastNLP/models/char_language_model.py
  24. +161 -0 fastNLP/models/snli.py
  25. +10 -9 fastNLP/modules/decoder/MLP.py
  26. +15 -0 fastNLP/modules/dropout.py
  27. +7 -21 fastNLP/modules/encoder/char_embedding.py
  28. +4 -1 fastNLP/modules/encoder/linear.py
  29. +13 -6 fastNLP/modules/encoder/lstm.py
  30. +123 -354 fastNLP/modules/encoder/variational_rnn.py
  31. +37 -0 reproduction/Biaffine_parser/cfg.cfg
  32. +260 -0 reproduction/Biaffine_parser/run.py
  33. +1 -17 reproduction/Char-aware_NLM/main.py
  34. +1 -1 reproduction/chinese_word_segment/run.py
  35. +1 -12 test/core/test_batch.py
  36. +6 -195 test/core/test_dataset.py
  37. +6 -7 test/core/test_predictor.py
  38. +0 -72 test/core/test_preprocess.py
  39. +2 -2 test/core/test_tester.py
  40. +2 -2 test/core/test_trainer.py
  41. +25 -0 test/data_for_tests/config
  42. +12 -0 test/data_for_tests/glove.6B.50d_test.txt
  43. +12 -2 test/loader/test_dataset_loader.py
  44. +33 -0 test/loader/test_embed_loader.py
  45. +15 -3 test/model/seq_labeling.py
  46. +25 -0 test/model/test_char_language_model.py
  47. +19 -13 test/model/test_cws.py
  48. +14 -8 test/model/test_seq_label.py
  49. +28 -0 test/modules/test_char_embedding.py
  50. +14 -17 test/modules/test_variational_rnn.py

+3 -3 README.md

@@ -8,8 +8,8 @@

fastNLP is a modular Natural Language Processing system based on PyTorch, built for fast development of NLP tools. It divides deep-learning NLP models into different modules that fall into four categories: encoder, interaction, aggregation and decoder, and each category contains several implemented modules. Encoder modules encode the input into an abstract representation, interaction modules let the information in the representation interact with itself, aggregation modules aggregate and reduce information, and decoder modules decode the representation into the output. Most current NLP models can be built from these modules, which vastly simplifies the process of developing NLP models. The architecture of fastNLP is shown in the figure below:

![](https://github.com/fastnlp/fastNLP/raw/master/fastnlp-architecture.jpg)
![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/procedures.PNG)
![](https://github.com/fastnlp/fastNLP/raw/master/docs/source/figures/text_classification.png)

## Requirements

@@ -62,4 +62,4 @@ pip install fastNLP
<td><b> fastNLP.fastnlp </b></td>
<td> a high-level interface for prediction </td>
</tr>
</table>
</table>
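
The README paragraph above describes fastNLP's four module categories. A generic PyTorch sketch of that decomposition (plain torch.nn, not fastNLP's actual module classes; the interaction stage is omitted for brevity) may help map the categories onto code:

import torch
from torch import nn

class TinyClassifier(nn.Module):
    """Toy text classifier organised along fastNLP's encoder / aggregation / decoder split."""
    def __init__(self, vocab_size=1000, emb_dim=50, hidden=64, num_classes=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden, batch_first=True)   # encoder: input -> representation
        self.decoder = nn.Linear(hidden, num_classes)                # decoder: representation -> output

    def forward(self, word_idx):
        hidden_states, _ = self.encoder(self.embed(word_idx))
        pooled = hidden_states.mean(dim=1)                           # aggregation: reduce over the sequence
        return self.decoder(pooled)

logits = TinyClassifier()(torch.randint(0, 1000, (4, 7)))           # batch of 4 sequences of length 7
print(logits.shape)                                                  # torch.Size([4, 2])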

+12 -6 docs/source/fastNLP.core.rst

@@ -1,12 +1,6 @@
fastNLP.core
=============

fastNLP.core.action
--------------------

.. automodule:: fastNLP.core.action
:members:

fastNLP.core.batch
-------------------

@@ -61,6 +55,12 @@ fastNLP.core.preprocess
.. automodule:: fastNLP.core.preprocess
:members:

fastNLP.core.sampler
---------------------

.. automodule:: fastNLP.core.sampler
:members:

fastNLP.core.tester
--------------------

@@ -73,6 +73,12 @@ fastNLP.core.trainer
.. automodule:: fastNLP.core.trainer
:members:

fastNLP.core.vocabulary
------------------------

.. automodule:: fastNLP.core.vocabulary
:members:


.. automodule:: fastNLP.core
:members:

+0 -36 docs/source/fastNLP.modules.aggregation.rst

@@ -1,36 +0,0 @@
fastNLP.modules.aggregation
============================

fastNLP.modules.aggregation.attention
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.attention
:members:

fastNLP.modules.aggregation.avg\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.avg_pool
:members:

fastNLP.modules.aggregation.kmax\_pool
---------------------------------------

.. automodule:: fastNLP.modules.aggregation.kmax_pool
:members:

fastNLP.modules.aggregation.max\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregation.max_pool
:members:

fastNLP.modules.aggregation.self\_attention
--------------------------------------------

.. automodule:: fastNLP.modules.aggregation.self_attention
:members:


.. automodule:: fastNLP.modules.aggregation
:members:

+36 -0 docs/source/fastNLP.modules.aggregator.rst

@@ -0,0 +1,36 @@
fastNLP.modules.aggregator
===========================

fastNLP.modules.aggregator.attention
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.attention
:members:

fastNLP.modules.aggregator.avg\_pool
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.avg_pool
:members:

fastNLP.modules.aggregator.kmax\_pool
--------------------------------------

.. automodule:: fastNLP.modules.aggregator.kmax_pool
:members:

fastNLP.modules.aggregator.max\_pool
-------------------------------------

.. automodule:: fastNLP.modules.aggregator.max_pool
:members:

fastNLP.modules.aggregator.self\_attention
-------------------------------------------

.. automodule:: fastNLP.modules.aggregator.self_attention
:members:


.. automodule:: fastNLP.modules.aggregator
:members:

+0 -5 docs/source/fastNLP.modules.interaction.rst

@@ -1,5 +0,0 @@
fastNLP.modules.interaction
============================

.. automodule:: fastNLP.modules.interaction
:members:

+5 -0 docs/source/fastNLP.modules.interactor.rst

@@ -0,0 +1,5 @@
fastNLP.modules.interactor
===========================

.. automodule:: fastNLP.modules.interactor
:members:

+2 -2 docs/source/fastNLP.modules.rst

@@ -3,10 +3,10 @@ fastNLP.modules

.. toctree::

fastNLP.modules.aggregation
fastNLP.modules.aggregator
fastNLP.modules.decoder
fastNLP.modules.encoder
fastNLP.modules.interaction
fastNLP.modules.interactor

fastNLP.modules.other\_modules
-------------------------------


+76 -222 fastNLP/core/dataset.py

@@ -6,91 +6,45 @@ from copy import deepcopy
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import POSDataSetLoader, ClassDataSetLoader


def create_dataset_from_lists(str_lists: list, word_vocab: dict, has_target: bool = False, label_vocab: dict = None):
if has_target is True:
if label_vocab is None:
raise RuntimeError("Must provide label vocabulary to transform labels.")
return create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab)
else:
return create_unlabeled_dataset_from_lists(str_lists, word_vocab)


def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab):
"""Create an DataSet instance that contains labels.

:param str_lists: list of list of strings, [num_examples, 2, *].
::
[
[[word_11, word_12, ...], [label_11, label_12, ...]],
...
]

:param word_vocab: dict of (str: int), which means (word: index).
:param label_vocab: dict of (str: int), which means (word: index).
:return data_set: a DataSet instance.

"""
data_set = DataSet()
for example in str_lists:
word_seq, label_seq = example[0], example[1]
x = TextField(word_seq, is_target=False)
y = TextField(label_seq, is_target=True)
data_set.append(Instance(word_seq=x, label_seq=y))
data_set.index_field("word_seq", word_vocab)
data_set.index_field("label_seq", label_vocab)
return data_set


def create_unlabeled_dataset_from_lists(str_lists, word_vocab):
"""Create an DataSet instance that contains no labels.

:param str_lists: list of list of strings, [num_examples, *].
::
[
[word_11, word_12, ...],
...
]

:param word_vocab: dict of (str: int), which means (word: index).
:return data_set: a DataSet instance.

"""
data_set = DataSet()
for word_seq in str_lists:
x = TextField(word_seq, is_target=False)
data_set.append(Instance(word_seq=x))
data_set.index_field("word_seq", word_vocab)
return data_set

_READERS = {}

class DataSet(list):
"""A DataSet object is a list of Instance objects.

"""

def __init__(self, name="", instances=None, load_func=None):
def __init__(self, name="", instances=None):
"""

:param name: str, the name of the dataset. (default: "")
:param instances: list of Instance objects. (default: None)
:param load_func: a function that takes the dataset path (string) as input and returns multi-level lists.
"""
list.__init__([])
self.name = name
self.origin_len = None
if instances is not None:
self.extend(instances)
self.data_set_load_func = load_func

def index_all(self, vocab):
for ins in self:
ins.index_all(vocab)
return self

def index_field(self, field_name, vocab):
for ins in self:
ins.index_field(field_name, vocab)
if isinstance(field_name, str):
field_list = [field_name]
vocab_list = [vocab]
else:
classes = (list, tuple)
assert isinstance(field_name, classes) and isinstance(vocab, classes) and len(field_name) == len(vocab)
field_list = field_name
vocab_list = vocab

for name, vocabs in zip(field_list, vocab_list):
for ins in self:
ins.index_field(name, vocabs)
return self

def to_tensor(self, idx: int, padding_length: dict):
"""Convert an instance in a dataset to tensor.
@@ -102,7 +56,7 @@ class DataSet(list):

"""
ins = self[idx]
return ins.to_tensor(padding_length)
return ins.to_tensor(padding_length, self.origin_len)

def get_length(self):
"""Fetch lengths of all fields in all instances in a dataset.
@@ -117,42 +71,9 @@ class DataSet(list):
lengths[field_name].append(field_length)
return lengths

def convert(self, data):
"""Convert lists of strings into Instances with Fields, creating Vocabulary for labeled data. Used in Training."""
raise NotImplementedError

def convert_with_vocabs(self, data, vocabs):
"""Convert lists of strings into Instances with Fields, using existing Vocabulary, with labels. Used in Testing."""
raise NotImplementedError

def convert_for_infer(self, data, vocabs):
"""Convert lists of strings into Instances with Fields, using existing Vocabulary, without labels. Used in predicting."""

def load(self, data_path, vocabs=None, infer=False):
"""Load data from the given files.

:param data_path: str, the path to the data
:param infer: bool. If True, there is no label information in the data. Default: False.
:param vocabs: dict of (name: Vocabulary object), used to index data. If not provided, a new vocabulary will be constructed.

"""
raw_data = self.data_set_load_func(data_path)
if infer is True:
self.convert_for_infer(raw_data, vocabs)
else:
if vocabs is not None:
self.convert_with_vocabs(raw_data, vocabs)
else:
self.convert(raw_data)

def load_raw(self, raw_data, vocabs):
"""Load raw data without loader. Used in FastNLP class.

:param raw_data:
:param vocabs:
:return:
"""
self.convert_for_infer(raw_data, vocabs)
def shuffle(self):
random.shuffle(self)
return self

def split(self, ratio, shuffle=True):
"""Train/dev splitting
@@ -165,7 +86,7 @@ class DataSet(list):
"""
assert 0 < ratio < 1
if shuffle:
random.shuffle(self)
self.shuffle()
split_idx = int(len(self) * ratio)
dev_set = deepcopy(self)
train_set = deepcopy(self)
@@ -173,134 +94,67 @@ class DataSet(list):
del dev_set[split_idx:]
return train_set, dev_set


class SeqLabelDataSet(DataSet):
def __init__(self, instances=None, load_func=POSDataSetLoader().load):
super(SeqLabelDataSet, self).__init__(name="", instances=instances, load_func=load_func)
self.word_vocab = Vocabulary()
self.label_vocab = Vocabulary()

def convert(self, data):
"""Convert lists of strings into Instances with Fields.

:param data: 3-level lists. Entries are strings.
def rename_field(self, old_name, new_name):
"""rename a field
"""
bar = ProgressBar(total=len(data))
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
self.word_vocab.update(word_seq)
self.label_vocab.update(label_seq)
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
y = TextField(label_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
bar.move()
self.index_field("word_seq", self.word_vocab)
self.index_field("truth", self.label_vocab)
# no need to index "word_seq_origin_len"

def convert_with_vocabs(self, data, vocabs):
for example in data:
word_seq, label_seq = example[0], example[1]
# list, list
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
y = TextField(label_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("truth", y)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
self.index_field("truth", vocabs["label_vocab"])
# no need to index "word_seq_origin_len"

def convert_for_infer(self, data, vocabs):
for word_seq in data:
# list
x = TextField(word_seq, is_target=False)
x_len = LabelField(len(word_seq), is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("word_seq_origin_len", x_len)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
# no need to index "word_seq_origin_len"


class TextClassifyDataSet(DataSet):
def __init__(self, instances=None, load_func=ClassDataSetLoader().load):
super(TextClassifyDataSet, self).__init__(name="", instances=instances, load_func=load_func)
self.word_vocab = Vocabulary()
self.label_vocab = Vocabulary(need_default=False)

def convert(self, data):
for example in data:
word_seq, label = example[0], example[1]
# list, str
self.word_vocab.update(word_seq)
self.label_vocab.update(label)
x = TextField(word_seq, is_target=False)
y = LabelField(label, is_target=True)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("label", y)
self.append(instance)
self.index_field("word_seq", self.word_vocab)
self.index_field("label", self.label_vocab)

def convert_with_vocabs(self, data, vocabs):
for example in data:
word_seq, label = example[0], example[1]
# list, str
x = TextField(word_seq, is_target=False)
y = LabelField(label, is_target=True)
instance = Instance()
instance.add_field("word_seq", x)
instance.add_field("label", y)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
self.index_field("label", vocabs["label_vocab"])
for ins in self:
ins.rename_field(old_name, new_name)
return self

def convert_for_infer(self, data, vocabs):
for word_seq in data:
# list
x = TextField(word_seq, is_target=False)
instance = Instance()
instance.add_field("word_seq", x)
self.append(instance)
self.index_field("word_seq", vocabs["word_vocab"])
def set_target(self, **fields):
"""Change the flag of `is_target` for all instance. For fields not set here, leave their `is_target` unchanged.

:param key-value pairs for field-name and `is_target` value(True, False or None).
"""
for ins in self:
ins.set_target(**fields)
return self

def change_field_is_target(data_set, field_name, new_target):
"""Change the flag of is_target in a field.
def update_vocab(self, **name_vocab):
"""using certain field data to update vocabulary.

:param data_set: a DataSet object
:param field_name: str, the name of the field
:param new_target: one of (True, False, None), representing this field is batch_x / is batch_y / neither.
e.g. ::

"""
for inst in data_set:
inst.fields[field_name].is_target = new_target
# update word vocab and label vocab separately
dataset.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
"""
for field_name, vocab in name_vocab.items():
for ins in self:
vocab.update(ins[field_name].contents())
return self

def set_origin_len(self, origin_field, origin_len_name=None):
"""make dataset tensor output contain origin_len field.

class ProgressBar:
e.g. ::

def __init__(self, count=0, total=0, width=100):
self.count = count
self.total = total
self.width = width
# output "word_seq_origin_len", lengths based on "word_seq" field
dataset.set_origin_len("word_seq")
"""
if origin_field is None:
self.origin_len = None
else:
self.origin_len = (origin_field + "_origin_len", origin_field) \
if origin_len_name is None else (origin_len_name, origin_field)
return self

def __getattribute__(self, name):
if name in _READERS:
# add read_*data() support
def _read(*args, **kwargs):
data = _READERS[name]().load(*args, **kwargs)
self.extend(data)
return self
return _read
else:
return object.__getattribute__(self, name)

def move(self):
self.count += 1
progress = self.width * self.count // self.total
sys.stdout.write('{0:3}/{1:3}: '.format(self.count, self.total))
sys.stdout.write('#' * progress + '-' * (self.width - progress) + '\r')
if progress == self.width:
sys.stdout.write('\n')
sys.stdout.flush()
@classmethod
def set_reader(cls, method_name):
"""decorator to add dataloader support
"""
assert isinstance(method_name, str)
def wrapper(read_cls):
_READERS[method_name] = read_cls
return read_cls
return wrapper
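
The rewritten DataSet replaces the old convert_*/load pipeline with small composable methods. A minimal sketch of how they chain together, assuming the API added in this file (the toy data is made up):

from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary

ds = DataSet()
for words, tags in [(["I", "like", "NLP"], ["O", "O", "B"])]:        # toy example
    ins = Instance()
    ins.add_field("word_seq", TextField(words, is_target=False)) \
       .add_field("truth", TextField(tags, is_target=True))
    ds.append(ins)

word_vocab, label_vocab = Vocabulary(), Vocabulary()
ds.update_vocab(word_seq=word_vocab, truth=label_vocab)              # count tokens field by field
ds.index_field(["word_seq", "truth"], [word_vocab, label_vocab])     # index several fields at once
ds.set_target(truth=True)                                            # mark "truth" as part of batch_y
ds.set_origin_len("word_seq")                                        # emit "word_seq_origin_len" in tensors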

+38 -0 fastNLP/core/field.py

@@ -18,6 +18,8 @@ class Field(object):
def to_tensor(self, padding_length):
raise NotImplementedError

def contents(self):
raise NotImplementedError

class TextField(Field):
def __init__(self, text, is_target):
@@ -57,6 +59,8 @@ class TextField(Field):
pads = [0] * (padding_length - self.get_length())
return torch.LongTensor(self._index + pads)

def contents(self):
return self.text.copy()

class LabelField(Field):
"""The Field representing a single label. Can be a string or integer.
@@ -92,6 +96,40 @@ class LabelField(Field):
else:
return torch.LongTensor([self._index])

def contents(self):
return [self.label]

class SeqLabelField(Field):
def __init__(self, label_seq, is_target=True):
super(SeqLabelField, self).__init__(is_target)
self.label_seq = label_seq
self._index = None

def get_length(self):
return len(self.label_seq)

def index(self, vocab):
if self._index is None:
self._index = [vocab[c] for c in self.label_seq]
return self._index

def to_tensor(self, padding_length):
pads = [0] * (padding_length - self.get_length())
if self._index is None:
if self.get_length() == 0:
return torch.LongTensor(pads)
elif isinstance(self.label_seq[0], int):
return torch.LongTensor(self.label_seq + pads)
elif isinstance(self.label_seq[0], str):
raise RuntimeError("Field {} not indexed. Call index method.".format(self.label))
else:
raise RuntimeError(
"Not support type for SeqLabelField. Expect str or int, got {}.".format(type(self.label)))
else:
return torch.LongTensor(self._index + pads)

def contents(self):
return self.label_seq.copy()

if __name__ == "__main__":
tf = TextField("test the code".split(), is_target=False)
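
The contents() hooks above are what DataSet.update_vocab reads, and SeqLabelField pads integer label sequences with zeros. A small behaviour sketch, assuming this commit's code:

from fastNLP.core.field import SeqLabelField, TextField

words = TextField("a tiny test".split(), is_target=False)
print(words.contents())                    # ['a', 'tiny', 'test'] - raw tokens for vocabulary building

tags = SeqLabelField([1, 0, 1], is_target=True)
print(tags.get_length())                   # 3
print(tags.to_tensor(padding_length=5))    # tensor([1, 0, 1, 0, 0]) - integer labels padded with 0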

+29 -1 fastNLP/core/instance.py

@@ -1,3 +1,5 @@
import torch

class Instance(object):
"""An instance which consists of Fields is an example in the DataSet.

@@ -10,6 +12,28 @@ class Instance(object):

def add_field(self, field_name, field):
self.fields[field_name] = field
return self

def rename_field(self, old_name, new_name):
if old_name in self.fields:
self.fields[new_name] = self.fields.pop(old_name)
if old_name in self.indexes:
self.indexes[new_name] = self.indexes.pop(old_name)
else:
raise KeyError("error, no such field: {}".format(old_name))
return self

def set_target(self, **fields):
for name, val in fields.items():
if name in self.fields:
self.fields[name].is_target = val
return self

def __getitem__(self, name):
if name in self.fields:
return self.fields[name]
else:
raise KeyError("{} not found".format(name))

def get_length(self):
"""Fetch the length of all fields in the instance.
@@ -24,6 +48,7 @@ class Instance(object):
"""use `vocab` to index certain field
"""
self.indexes[field_name] = self.fields[field_name].index(vocab)
return self

def index_all(self, vocab):
"""use `vocab` to index all fields
@@ -35,7 +60,7 @@ class Instance(object):
self.indexes = indexes
return indexes

def to_tensor(self, padding_length: dict):
def to_tensor(self, padding_length: dict, origin_len=None):
"""Convert instance to tensor.

:param padding_length: dict of (str: int), which means (field name: padding_length of this field)
@@ -53,4 +78,7 @@ class Instance(object):
else:
# is_target is None
continue
if origin_len is not None:
name, field_name = origin_len
tensor_x[name] = torch.LongTensor([self.fields[field_name].get_length()])
return tensor_x, tensor_y
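
Instance now supports chained add_field, rename_field and keyword-based set_target, which the new DataSet methods rely on. A short sketch with made-up fields:

from fastNLP.core.field import LabelField, TextField
from fastNLP.core.instance import Instance

ins = Instance()
ins.add_field("word_seq", TextField(["fast", "NLP"], is_target=False)) \
   .add_field("label", LabelField("positive", is_target=False))
ins.rename_field("label", "truth")        # the field and any cached index move to the new name
ins.set_target(truth=True)                # flip is_target by keyword; names not present are ignored
print(ins["truth"].is_target)             # True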

+14 -0 fastNLP/core/metrics.py

@@ -57,6 +57,20 @@ class SeqLabelEvaluator(Evaluator):
return {"accuracy": float(accuracy)}


class SNLIEvaluator(Evaluator):
def __init__(self):
super(SNLIEvaluator, self).__init__()

def __call__(self, predict, truth):
y_prob = [torch.nn.functional.softmax(y_logit, dim=-1) for y_logit in predict]
y_prob = torch.cat(y_prob, dim=0)
y_pred = torch.argmax(y_prob, dim=-1)
truth = [t['truth'] for t in truth]
y_true = torch.cat(truth, dim=0).view(-1)
acc = float(torch.sum(y_pred == y_true)) / y_true.size(0)
return {"accuracy": acc}


def _conver_numpy(x):
"""convert input data to numpy array



+3 -2 fastNLP/core/predictor.py

@@ -2,9 +2,9 @@ import numpy as np
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import create_dataset_from_lists
from fastNLP.core.preprocess import load_pickle
from fastNLP.core.sampler import SequentialSampler
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq2tag_dataset, convert_seq_dataset


class Predictor(object):
@@ -79,7 +79,8 @@ class Predictor(object):
:return data_set: a DataSet instance.
"""
assert isinstance(data, list)
return create_dataset_from_lists(data, self.word_vocab, has_target=False)
data = convert_seq_dataset(data)
data.index_field("word_seq", self.word_vocab)


class SeqLabelInfer(Predictor):


+0 -262 fastNLP/core/preprocess.py

@@ -1,13 +1,6 @@
import _pickle
import os

import numpy as np

from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary


# the first vocab in dict with the index = 5

@@ -53,258 +46,3 @@ def pickle_exist(pickle_path, pickle_name):
return True
else:
return False


class Preprocessor(object):
"""Preprocessors are responsible for converting data of strings into data of indices.
During the pre-processing, the following pickle files will be built:

- "word2id.pkl", a Vocabulary object, mapping words to indices.
- "class2id.pkl", a Vocabulary object, mapping labels to indices.
- "data_train.pkl", a DataSet object for training
- "data_dev.pkl", a DataSet object for validation, if train_dev_split > 0.
- "data_test.pkl", a DataSet object for testing, if test_data is not None.

These four pickle files are expected to be saved in the given pickle directory once they are constructed.
Preprocessors will check if those files are already in the directory and will reuse them in future calls.
"""

def __init__(self, label_is_seq=False, share_vocab=False, add_char_field=False):
"""

:param label_is_seq: bool, whether label is a sequence. If True, label vocabulary will preserve
several special tokens for sequence processing.
:param share_vocab: bool, whether word sequence and label sequence share the same vocabulary. Typically, this
is only available when label_is_seq is True. Default: False.
:param add_char_field: bool, whether to add character representations to all TextFields. Default: False.
"""
print("Preprocessor is about to deprecate. Please use DataSet class.")
self.data_vocab = Vocabulary()
if label_is_seq is True:
if share_vocab is True:
self.label_vocab = self.data_vocab
else:
self.label_vocab = Vocabulary()
else:
self.label_vocab = Vocabulary(need_default=False)

self.character_vocab = Vocabulary(need_default=False)
self.add_char_field = add_char_field

@property
def vocab_size(self):
return len(self.data_vocab)

@property
def num_classes(self):
return len(self.label_vocab)

@property
def char_vocab_size(self):
if self.character_vocab is None:
self.build_char_dict()
return len(self.character_vocab)

def run(self, train_dev_data, test_data=None, pickle_path="./", train_dev_split=0, cross_val=False, n_fold=10):
"""Main pre-processing pipeline.

:param train_dev_data: three-level list, with either single label or multiple labels in a sample.
:param test_data: three-level list, with either single label or multiple labels in a sample. (optional)
:param pickle_path: str, the path to save the pickle files.
:param train_dev_split: float, between [0, 1]. The ratio of training data used as validation set.
:param cross_val: bool, whether to do cross validation.
:param n_fold: int, the number of folds of cross validation. Only useful when cross_val is True.
:return results: multiple datasets after pre-processing. If test_data is provided, return one more dataset.
If train_dev_split > 0, return one more dataset - the dev set. If cross_val is True, each dataset
is a list of DataSet objects; Otherwise, each dataset is a DataSet object.
"""
if pickle_exist(pickle_path, "word2id.pkl") and pickle_exist(pickle_path, "class2id.pkl"):
self.data_vocab = load_pickle(pickle_path, "word2id.pkl")
self.label_vocab = load_pickle(pickle_path, "class2id.pkl")
else:
self.data_vocab, self.label_vocab = self.build_dict(train_dev_data)
save_pickle(self.data_vocab, pickle_path, "word2id.pkl")
save_pickle(self.label_vocab, pickle_path, "class2id.pkl")

self.build_reverse_dict()

train_set = []
dev_set = []
if not cross_val:
if not pickle_exist(pickle_path, "data_train.pkl"):
if train_dev_split > 0 and not pickle_exist(pickle_path, "data_dev.pkl"):
split = int(len(train_dev_data) * train_dev_split)
data_dev = train_dev_data[: split]
data_train = train_dev_data[split:]
train_set = self.convert_to_dataset(data_train, self.data_vocab, self.label_vocab)
dev_set = self.convert_to_dataset(data_dev, self.data_vocab, self.label_vocab)

save_pickle(dev_set, pickle_path, "data_dev.pkl")
print("{} of the training data is split for validation. ".format(train_dev_split))
else:
train_set = self.convert_to_dataset(train_dev_data, self.data_vocab, self.label_vocab)
save_pickle(train_set, pickle_path, "data_train.pkl")
else:
train_set = load_pickle(pickle_path, "data_train.pkl")
if pickle_exist(pickle_path, "data_dev.pkl"):
dev_set = load_pickle(pickle_path, "data_dev.pkl")
else:
# cross_val is True
if not pickle_exist(pickle_path, "data_train_0.pkl"):
# cross validation
data_cv = self.cv_split(train_dev_data, n_fold)
for i, (data_train_cv, data_dev_cv) in enumerate(data_cv):
data_train_cv = self.convert_to_dataset(data_train_cv, self.data_vocab, self.label_vocab)
data_dev_cv = self.convert_to_dataset(data_dev_cv, self.data_vocab, self.label_vocab)
save_pickle(
data_train_cv, pickle_path,
"data_train_{}.pkl".format(i))
save_pickle(
data_dev_cv, pickle_path,
"data_dev_{}.pkl".format(i))
train_set.append(data_train_cv)
dev_set.append(data_dev_cv)
print("{}-fold cross validation.".format(n_fold))
else:
for i in range(n_fold):
data_train_cv = load_pickle(pickle_path, "data_train_{}.pkl".format(i))
data_dev_cv = load_pickle(pickle_path, "data_dev_{}.pkl".format(i))
train_set.append(data_train_cv)
dev_set.append(data_dev_cv)

# prepare test data if provided
test_set = []
if test_data is not None:
if not pickle_exist(pickle_path, "data_test.pkl"):
test_set = self.convert_to_dataset(test_data, self.data_vocab, self.label_vocab)
save_pickle(test_set, pickle_path, "data_test.pkl")

# return preprocessed results
results = [train_set]
if cross_val or train_dev_split > 0:
results.append(dev_set)
if test_data:
results.append(test_set)
if len(results) == 1:
return results[0]
else:
return tuple(results)

def build_dict(self, data):
for example in data:
word, label = example
self.data_vocab.update(word)
self.label_vocab.update(label)
return self.data_vocab, self.label_vocab

def build_char_dict(self):
char_collection = set()
for word in self.data_vocab.word2idx:
if len(word) == 0:
continue
for ch in word:
if ch not in char_collection:
char_collection.add(ch)
self.character_vocab.update(list(char_collection))

def build_reverse_dict(self):
self.data_vocab.build_reverse_vocab()
self.label_vocab.build_reverse_vocab()

def data_split(self, data, train_dev_split):
"""Split data into train and dev set."""
split = int(len(data) * train_dev_split)
data_dev = data[: split]
data_train = data[split:]
return data_train, data_dev

def cv_split(self, data, n_fold):
"""Split data for cross validation.

:param data: list of string
:param n_fold: int
:return data_cv:

::
[
(data_train, data_dev), # 1st fold
(data_train, data_dev), # 2nd fold
...
]

"""
data_copy = data.copy()
np.random.shuffle(data_copy)
fold_size = round(len(data_copy) / n_fold)
data_cv = []
for i in range(n_fold - 1):
start = i * fold_size
end = (i + 1) * fold_size
data_dev = data_copy[start:end]
data_train = data_copy[:start] + data_copy[end:]
data_cv.append((data_train, data_dev))
start = (n_fold - 1) * fold_size
data_dev = data_copy[start:]
data_train = data_copy[:start]
data_cv.append((data_train, data_dev))
return data_cv

def convert_to_dataset(self, data, vocab, label_vocab):
"""Convert list of indices into a DataSet object.

:param data: list. Entries are strings.
:param vocab: a dict, mapping string (token) to index (int).
:param label_vocab: a dict, mapping string (label) to index (int).
:return data_set: a DataSet object
"""
use_word_seq = False
use_label_seq = False
use_label_str = False

# construct a DataSet object and fill it with Instances
data_set = DataSet()
for example in data:
words, label = example[0], example[1]
instance = Instance()

if isinstance(words, list):
x = TextField(words, is_target=False)
instance.add_field("word_seq", x)
use_word_seq = True
else:
raise NotImplementedError("words is a {}".format(type(words)))

if isinstance(label, list):
y = TextField(label, is_target=True)
instance.add_field("label_seq", y)
use_label_seq = True
elif isinstance(label, str):
y = LabelField(label, is_target=True)
instance.add_field("label", y)
use_label_str = True
else:
raise NotImplementedError("label is a {}".format(type(label)))
data_set.append(instance)

# convert strings to indices
if use_word_seq:
data_set.index_field("word_seq", vocab)
if use_label_seq:
data_set.index_field("label_seq", label_vocab)
if use_label_str:
data_set.index_field("label", label_vocab)

return data_set


class SeqLabelPreprocess(Preprocessor):
def __init__(self):
print("[FastNLP warning] SeqLabelPreprocess is about to deprecate. Please use Preprocess directly.")
super(SeqLabelPreprocess, self).__init__()


class ClassPreprocess(Preprocessor):
def __init__(self):
print("[FastNLP warning] ClassPreprocess is about to deprecate. Please use Preprocess directly.")
super(ClassPreprocess, self).__init__()


+8 -0 fastNLP/core/tester.py

@@ -83,6 +83,7 @@ class Tester(object):
truth_list.append(batch_y)
eval_results = self.evaluate(output_list, truth_list)
print("[tester] {}".format(self.print_eval_results(eval_results)))
logger.info("[tester] {}".format(self.print_eval_results(eval_results)))

def mode(self, model, is_test=False):
"""Train mode or Test mode. This is for PyTorch currently.
@@ -131,3 +132,10 @@ class ClassificationTester(Tester):
print(
"[FastNLP Warning] ClassificationTester will be deprecated. Please use Tester directly.")
super(ClassificationTester, self).__init__(**test_args)


class SNLITester(Tester):
def __init__(self, **test_args):
print(
"[FastNLP Warning] SNLITester will be deprecated. Please use Tester directly.")
super(SNLITester, self).__init__(**test_args)

+14 -2 fastNLP/core/trainer.py

@@ -10,7 +10,7 @@ from fastNLP.core.loss import Loss
from fastNLP.core.metrics import Evaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.tester import SeqLabelTester, ClassificationTester
from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver

@@ -162,7 +162,7 @@ class Trainer(object):
if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
end = time.time()
diff = timedelta(seconds=round(end - kwargs["start"]))
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.2} time: {}".format(
print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
kwargs["epoch"], step, loss.data, diff)
print(print_output)
logger.info(print_output)
@@ -292,3 +292,15 @@ class ClassificationTrainer(Trainer):

def _create_validator(self, valid_args):
return ClassificationTester(**valid_args)


class SNLITrainer(Trainer):
"""Trainer for text SNLI."""

def __init__(self, **train_args):
print(
"[FastNLP Warning] SNLITrainer will be deprecated. Please use Trainer directly.")
super(SNLITrainer, self).__init__(**train_args)

def _create_validator(self, valid_args):
return SNLITester(**valid_args)

+77 -27 fastNLP/core/vocabulary.py

@@ -19,6 +19,17 @@ def isiterable(p_object):
return True


def check_build_vocab(func):
def _wrapper(self, *args, **kwargs):
if self.word2idx is None:
self.build_vocab()
self.build_reverse_vocab()
elif self.idx2word is None:
self.build_reverse_vocab()
return func(self, *args, **kwargs)
return _wrapper


class Vocabulary(object):
"""Use for word and index one to one mapping

@@ -30,13 +41,41 @@ class Vocabulary(object):
vocab["word"]
vocab.to_word(5)
"""

def __init__(self, need_default=True):
def __init__(self, need_default=True, max_size=None, min_freq=None):
"""
:param bool need_default: set if the Vocabulary has default labels reserved for sequences. Default: True.
:param int max_size: set the max number of words in Vocabulary. Default: None
:param int min_freq: set the min occur frequency of words in Vocabulary. Default: None
"""
self.max_size = max_size
self.min_freq = min_freq
self.word_count = {}
self.has_default = need_default
self.word2idx = None
self.idx2word = None

def update(self, word):
"""add word or list of words into Vocabulary

:param word: a list of string or a single string
"""
if need_default:
if not isinstance(word, str) and isiterable(word):
# it's a nested list
for w in word:
self.update(w)
else:
# it's a word to be added
if word not in self.word_count:
self.word_count[word] = 1
else:
self.word_count[word] += 1
self.word2idx = None
return self

def build_vocab(self):
"""build 'word to index' dict, and filter the word using `max_size` and `min_freq`
"""
if self.has_default:
self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
self.padding_label = DEFAULT_PADDING_LABEL
self.unknown_label = DEFAULT_UNKNOWN_LABEL
@@ -45,28 +84,28 @@ class Vocabulary(object):
self.padding_label = None
self.unknown_label = None

self.has_default = need_default
self.idx2word = None
words = sorted(self.word_count.items(), key=lambda kv: kv[1], reverse=True)
if self.min_freq is not None:
words = list(filter(lambda kv: kv[1] >= self.min_freq, words))
if self.max_size is not None and len(words) > self.max_size:
words = words[:self.max_size]
for w, _ in words:
self.word2idx[w] = len(self.word2idx)

def build_reverse_vocab(self):
"""build 'index to word' dict based on 'word to index' dict
"""
self.idx2word = {self.word2idx[w] : w for w in self.word2idx}

@check_build_vocab
def __len__(self):
return len(self.word2idx)

def update(self, word):
"""add word or list of words into Vocabulary
:param word: a list of string or a single string
"""
if not isinstance(word, str) and isiterable(word):
# it's a nested list
for w in word:
self.update(w)
else:
# it's a word to be added
if word not in self.word2idx:
self.word2idx[word] = len(self)
if self.idx2word is not None:
self.idx2word = None
@check_build_vocab
def has_word(self, w):
return w in self.word2idx

@check_build_vocab
def __getitem__(self, w):
"""To support usage like::

@@ -74,32 +113,35 @@ class Vocabulary(object):
"""
if w in self.word2idx:
return self.word2idx[w]
else:
elif self.has_default:
return self.word2idx[DEFAULT_UNKNOWN_LABEL]
else:
raise ValueError("word {} not in vocabulary".format(w))

@check_build_vocab
def to_index(self, w):
""" like to_index(w) function, turn a word to the index
if w is not in Vocabulary, return the unknown label
:param str w:
"""
return self[w]

@property
@check_build_vocab
def unknown_idx(self):
if self.unknown_label is None:
return None
return self.word2idx[self.unknown_label]

@property
@check_build_vocab
def padding_idx(self):
if self.padding_label is None:
return None
return self.word2idx[self.padding_label]

def build_reverse_vocab(self):
"""build 'index to word' dict based on 'word to index' dict
"""
self.idx2word = {self.word2idx[w]: w for w in self.word2idx}

@check_build_vocab
def to_word(self, idx):
"""given a word's index, return the word itself

@@ -122,3 +164,11 @@ class Vocabulary(object):
"""
self.__dict__.update(state)
self.idx2word = None

def __contains__(self, item):
"""Check if a word in vocabulary.

:param item: the word
:return: True or False
"""
return self.has_word(item)
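
Vocabulary now accumulates word counts first and builds the index lazily through check_build_vocab, so max_size and min_freq can be applied at build time. A short sketch of the new behaviour with toy tokens:

from fastNLP.core.vocabulary import Vocabulary

vocab = Vocabulary(max_size=10000, min_freq=2)
vocab.update(["the", "fox", "the", "jumps"])   # only frequency counts are updated here
print(vocab["the"])                            # first lookup triggers build_vocab(); "the" passes min_freq=2
print("fox" in vocab)                          # False: "fox" occurred once, below min_freq
print(vocab.to_word(vocab["the"]))             # 'the' - the reverse dict is also built on demand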

+7 -8 fastNLP/fastnlp.py

@@ -1,6 +1,7 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.core.predictor import SeqLabelInfer, ClassificationInfer
from fastNLP.core.preprocess import load_pickle
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
@@ -178,13 +179,11 @@ class FastNLP(object):
:param infer_input: 2-D lists of strings
:return data_set: a DataSet object
"""
if self.infer_type == "seq_label":
data_set = SeqLabelDataSet()
data_set.load_raw(infer_input, {"word_vocab": self.word_vocab})
return data_set
elif self.infer_type == "text_class":
data_set = TextClassifyDataSet()
data_set.load_raw(infer_input, {"word_vocab": self.word_vocab})
if self.infer_type in ["seq_label", "text_class"]:
data_set = convert_seq_dataset(infer_input)
data_set.index_field("word_seq", self.word_vocab)
if self.infer_type == "seq_label":
data_set.set_origin_len("word_seq")
return data_set
else:
raise RuntimeError("fail to make outputs with infer type {}".format(self.infer_type))


+3 -2 fastNLP/loader/config_loader.py

@@ -8,9 +8,10 @@ from fastNLP.loader.base_loader import BaseLoader
class ConfigLoader(BaseLoader):
"""loader for configuration files"""

def __int__(self, data_path):
def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
self.config = self.parse(super(ConfigLoader, self).load(data_path))
if data_path is not None:
self.config = self.parse(super(ConfigLoader, self).load(data_path))

@staticmethod
def parse(string):


+196 -9 fastNLP/loader/dataset_loader.py

@@ -1,6 +1,74 @@
import os

from fastNLP.loader.base_loader import BaseLoader
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.field import *


def convert_seq_dataset(data):
"""Create an DataSet instance that contains no labels.

:param data: list of list of strings, [num_examples, *].
::
[
[word_11, word_12, ...],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for word_seq in data:
x = TextField(word_seq, is_target=False)
dataset.append(Instance(word_seq=x))
return dataset


def convert_seq2tag_dataset(data):
"""Convert list of data into DataSet

:param data: list of list of strings, [num_examples, *].
::
[
[ [word_11, word_12, ...], label_1 ],
[ [word_21, word_22, ...], label_2 ],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for sample in data:
word_seq, label = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label", LabelField(label, is_target=True))
dataset.append(ins)
return dataset


def convert_seq2seq_dataset(data):
"""Convert list of data into DataSet

:param data: list of list of strings, [num_examples, *].
::
[
[ [word_11, word_12, ...], [label_1, label_1, ...] ],
[ [word_21, word_22, ...], [label_2, label_1, ...] ],
...
]

:return: a DataSet.
"""
dataset = DataSet()
for sample in data:
word_seq, label_seq = sample[0], sample[1]
ins = Instance()
ins.add_field("word_seq", TextField(word_seq, is_target=False)) \
.add_field("label_seq", TextField(label_seq, is_target=True))
dataset.append(ins)
return dataset


class DataSetLoader(BaseLoader):
@@ -10,9 +78,33 @@ class DataSetLoader(BaseLoader):
super(DataSetLoader, self).__init__()

def load(self, path):
""" load data in `path` into a dataset
"""
raise NotImplementedError

def convert(self, data):
"""convert list of data into dataset
"""
raise NotImplementedError


@DataSet.set_reader('read_raw')
class RawDataSetLoader(DataSetLoader):
def __init__(self):
super(RawDataSetLoader, self).__init__()

def load(self, data_path, split=None):
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
lines = lines if split is None else [l.split(split) for l in lines]
lines = list(filter(lambda x: len(x) > 0, lines))
return self.convert(lines)

def convert(self, data):
return convert_seq_dataset(data)


@DataSet.set_reader('read_pos')
class POSDataSetLoader(DataSetLoader):
"""Dataset Loader for POS Tag datasets.

@@ -48,7 +140,8 @@ class POSDataSetLoader(DataSetLoader):
"""
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return self.parse(lines)
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -75,7 +168,13 @@ class POSDataSetLoader(DataSetLoader):
data.append([words, labels])
return data

def convert(self, data):
"""Convert lists of strings into Instances with Fields.
"""
return convert_seq2seq_dataset(data)


@DataSet.set_reader('read_tokenize')
class TokenizeDataSetLoader(DataSetLoader):
"""
Data set loader for tokenization data sets
@@ -84,8 +183,7 @@ class TokenizeDataSetLoader(DataSetLoader):
def __init__(self):
super(TokenizeDataSetLoader, self).__init__()

@staticmethod
def load(data_path, max_seq_len=32):
def load(self, data_path, max_seq_len=32):
"""
load pku dataset for Chinese word segmentation
CWS (Chinese Word Segmentation) pku training dataset format:
@@ -130,9 +228,13 @@ class TokenizeDataSetLoader(DataSetLoader):
seq_words = words[start:end]
seq_labels = labels[start:end]
data.append([seq_words, seq_labels])
return data
return self.convert(data)

def convert(self, data):
return convert_seq2seq_dataset(data)


@DataSet.set_reader('read_class')
class ClassDataSetLoader(DataSetLoader):
"""Loader for classification data sets"""

@@ -143,7 +245,8 @@ class ClassDataSetLoader(DataSetLoader):
assert os.path.exists(data_path)
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return self.parse(lines)
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -166,16 +269,19 @@ class ClassDataSetLoader(DataSetLoader):
dataset.append(sentence)
return dataset

def convert(self, data):
return convert_seq2tag_dataset(data)


@DataSet.set_reader('read_conll')
class ConllLoader(DataSetLoader):
"""loader for conll format files"""

def __int__(self, data_path):
def __init__(self):
"""
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__()
self.data_set = self.parse(self.load(data_path))

def load(self, data_path):
"""
@@ -183,7 +289,8 @@ class ConllLoader(DataSetLoader):
"""
with open(data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return lines
data = self.parse(lines)
return self.convert(data)

@staticmethod
def parse(lines):
@@ -204,7 +311,11 @@ class ConllLoader(DataSetLoader):
tokens.append(line.split())
return sentences

def convert(self, data):
pass


@DataSet.set_reader('read_lm')
class LMDataSetLoader(DataSetLoader):
"""Language Model Dataset Loader

@@ -222,7 +333,8 @@ class LMDataSetLoader(DataSetLoader):
with open(data_path, "r", encoding="utf=8") as f:
text = " ".join(f.readlines())
tokens = text.strip().split()
return self.sentence_cut(tokens)
data = self.sentence_cut(tokens)
return self.convert(data)

def sentence_cut(self, tokens, sentence_length=15):
start_idx = 0
@@ -236,7 +348,11 @@ class LMDataSetLoader(DataSetLoader):
data_set.append([x, y])
return data_set

def convert(self, data):
pass


@DataSet.set_reader('read_people_daily')
class PeopleDailyCorpusLoader(DataSetLoader):
"""
People Daily Corpus: Chinese word segmentation, POS tag, NER
@@ -286,3 +402,74 @@ class PeopleDailyCorpusLoader(DataSetLoader):
ner_examples.append([sent_words, sent_ner])
return pos_tag_examples, ner_examples

def convert(self, data):
pass


class SNLIDataSetLoader(DataSetLoader):
"""A data set loader for SNLI data set.

"""

def __init__(self):
super(SNLIDataSetLoader, self).__init__()

def load(self, path_list):
"""

:param path_list: A list of file name, in the order of premise file, hypothesis file, and label file.
:return: data_set: A DataSet object.
"""
assert len(path_list) == 3
line_set = []
for file in path_list:
if not os.path.exists(file):
raise FileNotFoundError("file {} NOT found".format(file))

with open(file, 'r', encoding='utf-8') as f:
lines = f.readlines()
line_set.append(lines)

premise_lines, hypothesis_lines, label_lines = line_set
assert len(premise_lines) == len(hypothesis_lines) and len(premise_lines) == len(label_lines)

data_set = []
for premise, hypothesis, label in zip(premise_lines, hypothesis_lines, label_lines):
p = premise.strip().split()
h = hypothesis.strip().split()
l = label.strip()
data_set.append([p, h, l])

return self.convert(data_set)

def convert(self, data):
"""Convert a 3D list to a DataSet object.

:param data: A 3D tensor.
[
[ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ],
[ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ],
...
]
:return: data_set: A DataSet object.
"""

data_set = DataSet()

for example in data:
p, h, l = example
# list, list, str
x1 = TextField(p, is_target=False)
x2 = TextField(h, is_target=False)
x1_len = TextField([1] * len(p), is_target=False)
x2_len = TextField([1] * len(h), is_target=False)
y = LabelField(l, is_target=True)
instance = Instance()
instance.add_field("premise", x1)
instance.add_field("hypothesis", x2)
instance.add_field("premise_len", x1_len)
instance.add_field("hypothesis_len", x2_len)
instance.add_field("truth", y)
data_set.append(instance)

return data_set
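
dataset_loader.py now owns the list-to-DataSet conversion helpers, and @DataSet.set_reader exposes each loader as a read_* method on DataSet itself. A minimal sketch; the file paths are placeholders, not part of this commit:

from fastNLP.core.dataset import DataSet
from fastNLP.loader.dataset_loader import SNLIDataSetLoader, convert_seq2seq_dataset

# direct conversion of already-parsed [word_seq, label_seq] pairs
ds = convert_seq2seq_dataset([[["Jack", "London"], ["B-PER", "I-PER"]]])

# loaders registered with @DataSet.set_reader('read_pos') are reachable through DataSet
pos_data = DataSet().read_pos("path/to/pos_corpus.txt")                               # placeholder path

# the SNLI loader takes the premise, hypothesis and label files in that order
snli_data = SNLIDataSetLoader().load(["premise.txt", "hypothesis.txt", "label.txt"])  # placeholder files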

+61 -26 fastNLP/loader/embed_loader.py

@@ -1,50 +1,85 @@
import _pickle
import os

import numpy as np
import torch

from fastNLP.loader.base_loader import BaseLoader
from fastNLP.core.vocabulary import Vocabulary


class EmbedLoader(BaseLoader):
"""docstring for EmbedLoader"""

def __init__(self, data_path):
super(EmbedLoader, self).__init__(data_path)
def __init__(self):
super(EmbedLoader, self).__init__()

@staticmethod
def load_embedding(emb_dim, emb_file, word_dict, emb_pkl):
def _load_glove(emb_file):
"""Read file as a glove embedding

file format:
embeddings are split by line,
for one embedding, word and numbers split by space
Example::

word_1 float_1 float_2 ... float_emb_dim
word_2 float_1 float_2 ... float_emb_dim
...
"""
emb = {}
with open(emb_file, 'r', encoding='utf-8') as f:
for line in f:
line = list(filter(lambda w: len(w)>0, line.strip().split(' ')))
if len(line) > 0:
emb[line[0]] = torch.Tensor(list(map(float, line[1:])))
return emb
@staticmethod
def _load_pretrain(emb_file, emb_type):
"""Read txt data from embedding file and convert to np.array as pre-trained embedding

:param emb_file: str, the pre-trained embedding file path
:param emb_type: str, the pre-trained embedding data format
:return dict: {str: np.array}
"""
if emb_type == 'glove':
return EmbedLoader._load_glove(emb_file)
else:
raise Exception("embedding type {} not support yet".format(emb_type))

@staticmethod
def load_embedding(emb_dim, emb_file, emb_type, vocab, emb_pkl):
"""Load the pre-trained embedding and combine with the given dictionary.

:param emb_file: str, the pre-trained embedding.
The embedding file should have the following format:
Each line is a word embedding, where a word string is followed by multiple floats.
Floats are separated by space. The word and the first float are separated by space.
:param word_dict: dict, a mapping from word to index.
:param emb_dim: int, the dimension of the embedding. Should be the same as pre-trained embedding.
:param emb_file: str, the pre-trained embedding file path.
:param emb_type: str, the pre-trained embedding format, support glove now
:param vocab: Vocabulary, a mapping from word to index, can be provided by user or built from pre-trained embedding
:param emb_pkl: str, the embedding pickle file.
:return embedding_np: numpy array of shape (len(word_dict), emb_dim)

:return embedding_tensor: Tensor of shape (len(word_dict), emb_dim)
vocab: input vocab or vocab built by pre-train
TODO: fragile code
"""
# If the embedding pickle exists, load it and return.
if os.path.exists(emb_pkl):
with open(emb_pkl, "rb") as f:
embedding_np = _pickle.load(f)
return embedding_np
embedding_tensor, vocab = _pickle.load(f)
return embedding_tensor, vocab
# Otherwise, load the pre-trained embedding.
with open(emb_file, "r", encoding="utf-8") as f:
# begin with a random embedding
embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
for line in f:
line = line.strip().split()
if len(line) != emb_dim + 1:
# skip this line if two embedding dimension not match
continue
if line[0] in word_dict:
# find the word and replace its embedding with a pre-trained one
embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
if vocab is None:
# build vocabulary from pre-trained embedding
vocab = Vocabulary()
for w in pretrain.keys():
vocab.update(w)
embedding_tensor = torch.randn(len(vocab), emb_dim)
for w, v in pretrain.items():
if len(v.shape) > 1 or emb_dim != v.shape[0]:
raise ValueError('pretrian embedding dim is {}, dismatching required {}'.format(v.shape, (emb_dim,)))
if vocab.has_word(w):
embedding_tensor[vocab[w]] = v

# save and return the result
with open(emb_pkl, "wb") as f:
_pickle.dump(embedding_np, f)
return embedding_np
_pickle.dump((embedding_tensor, vocab), f)
return embedding_tensor, vocab
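
EmbedLoader.load_embedding now takes an embedding format and a Vocabulary (or None to build one from the pre-trained file) and returns a (tensor, vocab) pair, caching both in emb_pkl. A usage sketch with placeholder file names:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.embed_loader import EmbedLoader

vocab = Vocabulary()
vocab.update(["the", "quick", "fox"])
embedding, vocab = EmbedLoader.load_embedding(
    emb_dim=50,
    emb_file="glove.6B.50d.txt",      # placeholder path to a GloVe-format text file
    emb_type="glove",
    vocab=vocab,                      # pass None to build the vocabulary from the embedding file
    emb_pkl="embedding_cache.pkl",    # written on the first call, reused afterwards
)
print(embedding.shape)                # (len(vocab), 50); words missing from the file keep random vectors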

+5 -65 fastNLP/models/base_model.py

@@ -1,5 +1,7 @@
import torch

from fastNLP.core.trainer import Trainer


class BaseModel(torch.nn.Module):
"""Base PyTorch model for all models.
@@ -8,68 +10,6 @@ class BaseModel(torch.nn.Module):
def __init__(self):
super(BaseModel, self).__init__()


class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
data that is shared between `Doc` objects.
"""

def __init__(self):
"""Create the vocabulary.
RETURNS (Vocab): The newly constructed object.
"""
self.data_frame = None


class Document(object):
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `Token` objects. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves. -- spacy
"""

def __init__(self, vocab, words=None, spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings, to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
self.spaces = spaces
self.words = words
if spaces is None:
self.spaces = [True] * len(self.words)
elif len(spaces) != len(self.words):
raise ValueError("dismatch spaces and words")

def get_chunker(self, vocab):
return None

def push_back(self, vocab):
pass


class Token(object):
"""An individual token – i.e. a word, punctuation symbol, whitespace,
etc.
"""

def __init__(self, vocab, doc, offset):
"""Construct a `Token` object.
vocab (Vocabulary): A storage container for lexical types.
doc (Document): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab
self.doc = doc
self.token = doc[offset]
self.i = offset

def fit(self, train_data, dev_data=None, **train_args):
trainer = Trainer(**train_args)
trainer.train(self, train_data, dev_data)

+364 -0 fastNLP/models/biaffine_parser.py

@@ -0,0 +1,364 @@
import sys, os
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
import copy
import numpy as np
import torch
from collections import defaultdict
from torch import nn
from torch.nn import functional as F
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.dropout import TimestepDropout

def mst(scores):
"""
with some modification to support parser output for MST decoding
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692
"""
length = scores.shape[0]
min_score = -np.inf
mask = np.zeros((length, length))
np.fill_diagonal(mask, -np.inf)
scores = scores + mask
heads = np.argmax(scores, axis=1)
heads[0] = 0
tokens = np.arange(1, length)
roots = np.where(heads[tokens] == 0)[0] + 1
if len(roots) < 1:
root_scores = scores[tokens, 0]
head_scores = scores[tokens, heads[tokens]]
new_root = tokens[np.argmax(root_scores / head_scores)]
heads[new_root] = 0
elif len(roots) > 1:
root_scores = scores[roots, 0]
scores[roots, 0] = 0
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1
new_root = roots[np.argmin(
scores[roots, new_heads] / root_scores)]
heads[roots] = new_heads
heads[new_root] = 0

edges = defaultdict(set)
vertices = set((0,))
for dep, head in enumerate(heads[tokens]):
vertices.add(dep + 1)
edges[head].add(dep + 1)
for cycle in _find_cycle(vertices, edges):
dependents = set()
to_visit = set(cycle)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(edges[node])
cycle = np.array(list(cycle))
old_heads = heads[cycle]
old_scores = scores[cycle, old_heads]
non_heads = np.array(list(dependents))
scores[np.repeat(cycle, len(non_heads)),
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1
new_scores = scores[cycle, new_heads] / old_scores
change = np.argmax(new_scores)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
heads[changed_cycle] = new_head
edges[new_head].add(changed_cycle)
edges[old_head].remove(changed_cycle)

return heads


def _find_cycle(vertices, edges):
"""
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py
"""
_index = 0
_stack = []
_indices = {}
_lowlinks = {}
_onstack = defaultdict(lambda: False)
_SCCs = []

def _strongconnect(v):
nonlocal _index
_indices[v] = _index
_lowlinks[v] = _index
_index += 1
_stack.append(v)
_onstack[v] = True

for w in edges[v]:
if w not in _indices:
_strongconnect(w)
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
elif _onstack[w]:
_lowlinks[v] = min(_lowlinks[v], _indices[w])

if _lowlinks[v] == _indices[v]:
SCC = set()
while True:
w = _stack.pop()
_onstack[w] = False
SCC.add(w)
if w == v:
break
_SCCs.append(SCC)

for v in vertices:
if v not in _indices:
_strongconnect(v)

return [SCC for SCC in _SCCs if len(SCC) > 1]

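For reference, a minimal sketch (not part of the diff, assuming this PR's fastNLP is importable) of how `mst` is meant to be called: the input is a square score matrix where entry `[dep, head]` scores `head` as the parent of `dep`, row/column 0 standing for the artificial root, and the return value is the decoded head index of every position.

import numpy as np
from fastNLP.models.biaffine_parser import mst  # module added in this PR

# scores[dep, head]; position 0 is the artificial <root>
scores = np.array([
    [0.0, 0.0, 0.0, 0.0],   # row 0 (<root>) is ignored by the decoder
    [9.0, 0.0, 1.0, 1.0],   # token 1 -> <root>
    [1.0, 8.0, 0.0, 1.0],   # token 2 -> token 1
    [1.0, 7.0, 1.0, 0.0],   # token 3 -> token 1
])
print(mst(scores))          # expected: [0 0 1 1]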

class GraphParser(nn.Module):
"""Graph based Parser helper class, support greedy decoding and MST(Maximum Spanning Tree) decoding
"""
def __init__(self):
super(GraphParser, self).__init__()

def forward(self, x):
raise NotImplementedError

def _greedy_decoder(self, arc_matrix, seq_mask=None):
_, seq_len, _ = arc_matrix.shape
matrix = arc_matrix + torch.diag(arc_matrix.new(seq_len).fill_(-np.inf))
_, heads = torch.max(matrix, dim=2)
if seq_mask is not None:
heads *= seq_mask.long()
return heads

def _mst_decoder(self, arc_matrix, seq_mask=None):
batch_size, seq_len, _ = arc_matrix.shape
matrix = torch.zeros_like(arc_matrix).copy_(arc_matrix)
ans = matrix.new_zeros(batch_size, seq_len).long()
for i, graph in enumerate(matrix):
ans[i] = torch.as_tensor(mst(graph.cpu().numpy()), device=ans.device)
if seq_mask is not None:
ans *= seq_mask.long()
return ans


class ArcBiaffine(nn.Module):
"""helper module for Biaffine Dependency Parser predicting arc
"""
def __init__(self, hidden_size, bias=True):
super(ArcBiaffine, self).__init__()
self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True)
self.has_bias = bias
if self.has_bias:
self.bias = nn.Parameter(torch.Tensor(hidden_size), requires_grad=True)
else:
self.register_parameter("bias", None)
initial_parameter(self)

def forward(self, head, dep):
"""
:param head: arc-head tensor, shape [batch, length, emb_dim]
:param dep: arc-dependent tensor, shape [batch, length, emb_dim]

:return: output tensor, shape [batch, length, length]
"""
output = dep.matmul(self.U)
output = output.bmm(head.transpose(-1, -2))
if self.has_bias:
output += head.matmul(self.bias).unsqueeze(1)
return output

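A plain-torch sketch (illustrative only; the tensor sizes below are made up) of the score that ArcBiaffine.forward computes: s[b, i, j] = dep[b, i] · U · head[b, j] + head[b, j] · bias, giving one [length, length] arc score matrix per batch element.

import torch

B, L, H = 2, 5, 4                     # hypothetical batch, length, hidden sizes
U = torch.randn(H, H)
bias = torch.randn(H)
head = torch.randn(B, L, H)
dep = torch.randn(B, L, H)
# same computation as ArcBiaffine.forward above
scores = dep.matmul(U).bmm(head.transpose(-1, -2)) + head.matmul(bias).unsqueeze(1)
print(scores.shape)                   # torch.Size([2, 5, 5])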

class LabelBilinear(nn.Module):
"""helper module for Biaffine Dependency Parser predicting label
"""
def __init__(self, in1_features, in2_features, num_label, bias=True):
super(LabelBilinear, self).__init__()
self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias)
self.lin1 = nn.Linear(in1_features, num_label, bias=False)
self.lin2 = nn.Linear(in2_features, num_label, bias=False)

def forward(self, x1, x2):
output = self.bilinear(x1, x2)
output += self.lin1(x1) + self.lin2(x2)
return output


class BiaffineParser(GraphParser):
"""Biaffine Dependency Parser implemantation.
refer to ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
<https://arxiv.org/abs/1611.01734>`_ .
"""
def __init__(self,
word_vocab_size,
word_emb_dim,
pos_vocab_size,
pos_emb_dim,
rnn_layers,
rnn_hidden_size,
arc_mlp_size,
label_mlp_size,
num_label,
dropout,
use_var_lstm=False,
use_greedy_infer=False):

super(BiaffineParser, self).__init__()
self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim)
self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim)
if use_var_lstm:
self.lstm = VarLSTM(input_size=word_emb_dim + pos_emb_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
else:
self.lstm = nn.LSTM(input_size=word_emb_dim + pos_emb_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)

rnn_out_size = 2 * rnn_hidden_size
self.arc_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size),
nn.ELU())
self.arc_dep_mlp = copy.deepcopy(self.arc_head_mlp)
self.label_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, label_mlp_size),
nn.ELU())
self.label_dep_mlp = copy.deepcopy(self.label_head_mlp)
self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True)
self.normal_dropout = nn.Dropout(p=dropout)
self.timestep_dropout = TimestepDropout(p=dropout)
self.use_greedy_infer = use_greedy_infer
initial_parameter(self)

def forward(self, word_seq, pos_seq, seq_mask, gold_heads=None, **_):
"""
:param word_seq: [batch_size, seq_len] sequence of word indices
:param pos_seq: [batch_size, seq_len] sequence of POS tag indices
:param seq_mask: [batch_size, seq_len] sequence of length masks
:param gold_heads: [batch_size, seq_len] sequence of golden heads
:return dict: parsing results
arc_pred: [batch_size, seq_len, seq_len]
label_pred: [batch_size, seq_len, seq_len]
seq_mask: [batch_size, seq_len]
head_pred: [batch_size, seq_len], the predicted heads (only returned when gold_heads is not provided)
"""
# prepare embeddings
batch_size, seq_len = word_seq.shape
# print('forward {} {}'.format(batch_size, seq_len))
batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=word_seq.device).unsqueeze(1)

# get sequence mask
seq_mask = seq_mask.long()

word = self.normal_dropout(self.word_embedding(word_seq)) # [N,L] -> [N,L,C_0]
pos = self.normal_dropout(self.pos_embedding(pos_seq)) # [N,L] -> [N,L,C_1]
x = torch.cat([word, pos], dim=2) # -> [N,L,C]

# lstm, extract features
feat, _ = self.lstm(x) # -> [N,L,C]

# for arc biaffine
# mlp, reduce dim
arc_dep = self.timestep_dropout(self.arc_dep_mlp(feat))
arc_head = self.timestep_dropout(self.arc_head_mlp(feat))
label_dep = self.timestep_dropout(self.label_dep_mlp(feat))
label_head = self.timestep_dropout(self.label_head_mlp(feat))

# biaffine arc classifier
arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L]
flip_mask = (seq_mask == 0)
arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf)

# use gold or predicted arc to predict label
if gold_heads is None:
# use greedy decoding in training
if self.training or self.use_greedy_infer:
heads = self._greedy_decoder(arc_pred, seq_mask)
else:
heads = self._mst_decoder(arc_pred, seq_mask)
head_pred = heads
else:
head_pred = None
heads = gold_heads

label_head = label_head[batch_range, heads].contiguous()
label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label]
res_dict = {'arc_pred': arc_pred, 'label_pred': label_pred, 'seq_mask': seq_mask}
if head_pred is not None:
res_dict['head_pred'] = head_pred
return res_dict

def loss(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **_):
"""
Compute loss.

:param arc_pred: [batch_size, seq_len, seq_len]
:param label_pred: [batch_size, seq_len, seq_len]
:param head_indices: [batch_size, seq_len]
:param head_labels: [batch_size, seq_len]
:param seq_mask: [batch_size, seq_len]
:return: loss value, the arc and label negative log-likelihood averaged over all non-root, non-padding tokens
"""

batch_size, seq_len, _ = arc_pred.shape
arc_logits = F.log_softmax(arc_pred, dim=2)
label_logits = F.log_softmax(label_pred, dim=2)
batch_index = torch.arange(start=0, end=batch_size, device=arc_logits.device).long().unsqueeze(1)
child_index = torch.arange(start=0, end=seq_len, device=arc_logits.device).long().unsqueeze(0)
arc_loss = arc_logits[batch_index, child_index, head_indices]
label_loss = label_logits[batch_index, child_index, head_labels]

arc_loss = arc_loss[:, 1:]
label_loss = label_loss[:, 1:]

float_mask = seq_mask[:, 1:].float()
length = (seq_mask.sum() - batch_size).float()
arc_nll = -(arc_loss*float_mask).sum() / length
label_nll = -(label_loss*float_mask).sum() / length
return arc_nll + label_nll

def evaluate(self, arc_pred, label_pred, head_indices, head_labels, seq_mask, **kwargs):
"""
Evaluate the performance of prediction.

:return dict: performance results.
head_pred_correct: number of correctly predicted heads.
label_pred_correct: number of correctly predicted labels.
total_tokens: number of predicted tokens.
"""
if 'head_pred' in kwargs:
head_pred = kwargs['head_pred']
elif self.use_greedy_infer:
head_pred = self._greedy_decoder(arc_pred, seq_mask)
else:
head_pred = self._mst_decoder(arc_pred, seq_mask)

head_pred_correct = (head_pred == head_indices).long() * seq_mask
_, label_preds = torch.max(label_pred, dim=2)
label_pred_correct = (label_preds == head_labels).long() * head_pred_correct
return {"head_pred_correct": head_pred_correct.sum(dim=1),
"label_pred_correct": label_pred_correct.sum(dim=1),
"total_tokens": seq_mask.sum(dim=1)}

def metrics(self, head_pred_correct, label_pred_correct, total_tokens, **_):
"""
Compute the metrics of the model.

:param head_pred_correct: number of correctly predicted heads.
:param label_pred_correct: number of correctly predicted labels.
:param total_tokens: number of predicted tokens.
:return dict: the metric results
    UAS: head prediction accuracy (unlabeled attachment score)
    LAS: label prediction accuracy (labeled attachment score)
"""
return {"UAS": head_pred_correct.sum().float() / total_tokens.sum().float() * 100,
"LAS": label_pred_correct.sum().float() / total_tokens.sum().float() * 100}

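To make the new model's interface concrete, here is a hedged end-to-end sketch (the sizes below are made up; in the reproduction script they come from cfg.cfg and the built vocabularies):

import torch
from fastNLP.models.biaffine_parser import BiaffineParser

model = BiaffineParser(word_vocab_size=100, word_emb_dim=100,
                       pos_vocab_size=20, pos_emb_dim=100,
                       rnn_layers=1, rnn_hidden_size=200,
                       arc_mlp_size=500, label_mlp_size=100,
                       num_label=40, dropout=0.33,
                       use_var_lstm=False, use_greedy_infer=True)

word_seq = torch.randint(0, 100, (2, 7))        # [batch_size, seq_len] word indices
pos_seq = torch.randint(0, 20, (2, 7))          # [batch_size, seq_len] POS indices
seq_mask = torch.ones(2, 7, dtype=torch.long)   # 1 = real token, 0 = padding
out = model(word_seq, pos_seq, seq_mask)        # no gold_heads -> 'head_pred' is returned too
print(out['arc_pred'].shape, out['label_pred'].shape, out['head_pred'].shape)
# torch.Size([2, 7, 7]) torch.Size([2, 7, 40]) torch.Size([2, 7])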

+ 1
- 1
fastNLP/models/char_language_model.py View File

@@ -103,7 +103,7 @@ class CharLM(nn.Module):
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
# [num_seq, seq_len, total_num_filters]

x, hidden = self.lstm(x)
x = self.lstm(x)
# [seq_len, num_seq, hidden_size]

x = self.dropout(x)


+ 161
- 0
fastNLP/models/snli.py View File

@@ -0,0 +1,161 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

from fastNLP.models.base_model import BaseModel
from fastNLP.modules import decoder as Decoder, encoder as Encoder


my_inf = 10e12


class SNLI(BaseModel):
"""
PyTorch Network for SNLI.
"""

def __init__(self, args, init_embedding=None):
super(SNLI, self).__init__()
self.vocab_size = args["vocab_size"]
self.embed_dim = args["embed_dim"]
self.hidden_size = args["hidden_size"]
self.batch_first = args["batch_first"]
self.dropout = args["dropout"]
self.n_labels = args["num_classes"]
self.gpu = args["gpu"] and torch.cuda.is_available()

self.embedding = Encoder.embedding.Embedding(self.vocab_size, self.embed_dim, init_emb=init_embedding,
dropout=self.dropout)

self.embedding_layer = Encoder.Linear(self.embed_dim, self.hidden_size)

self.encoder = Encoder.LSTM(
input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True,
batch_first=self.batch_first, bidirectional=True
)

self.inference_layer = Encoder.Linear(self.hidden_size * 4, self.hidden_size)

self.decoder = Encoder.LSTM(
input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True,
batch_first=self.batch_first, bidirectional=True
)

self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh')

def forward(self, premise, hypothesis, premise_len, hypothesis_len):
""" Forward function

:param premise: A Tensor of word indices of the premise: [batch size(B), premise seq len(PL)].
:param hypothesis: A Tensor of word indices of the hypothesis: [B, hypothesis seq len(HL)].
:param premise_len: A Tensor marking which positions in the premise are real words and which are padding: [B, PL].
:param hypothesis_len: A Tensor marking which positions in the hypothesis are real words and which are padding: [B, HL].
:return: prediction: A Tensor with the classification result: [B, n_labels(N)].
"""

premise0 = self.embedding_layer(self.embedding(premise))
hypothesis0 = self.embedding_layer(self.embedding(hypothesis))

_BP, _PSL, _HP = premise0.size()
_BH, _HSL, _HH = hypothesis0.size()
_BPL, _PLL = premise_len.size()
_HPL, _HLL = hypothesis_len.size()

assert _BP == _BH and _BPL == _HPL and _BP == _BPL
assert _HP == _HH
assert _PSL == _PLL and _HSL == _HLL

B, PL, H = premise0.size()
B, HL, H = hypothesis0.size()

# a0, (ah0, ac0) = self.encoder(premise) # a0: [B, PL, H * 2], ah0: [2, B, H]
# b0, (bh0, bc0) = self.encoder(hypothesis) # b0: [B, HL, H * 2]

a0 = self.encoder(premise0) # a0: [B, PL, H * 2]
b0 = self.encoder(hypothesis0) # b0: [B, HL, H * 2]

a = torch.mean(a0.view(B, PL, -1, H), dim=2) # a: [B, PL, H]
b = torch.mean(b0.view(B, HL, -1, H), dim=2) # b: [B, HL, H]

ai, bi = self.calc_bi_attention(a, b, premise_len, hypothesis_len)

ma = torch.cat((a, ai, a - ai, a * ai), dim=2) # ma: [B, PL, 4 * H]
mb = torch.cat((b, bi, b - bi, b * bi), dim=2) # mb: [B, HL, 4 * H]

f_ma = self.inference_layer(ma)
f_mb = self.inference_layer(mb)

vat = self.decoder(f_ma)
vbt = self.decoder(f_mb)

va = torch.mean(vat.view(B, PL, -1, H), dim=2) # va: [B, PL, H]
vb = torch.mean(vbt.view(B, HL, -1, H), dim=2) # vb: [B, HL, H]

# va_ave = torch.mean(va, dim=1) # va_ave: [B, H]
# va_max, va_arg_max = torch.max(va, dim=1) # va_max: [B, H]
# vb_ave = torch.mean(vb, dim=1) # vb_ave: [B, H]
# vb_max, vb_arg_max = torch.max(vb, dim=1) # vb_max: [B, H]

va_ave = self.mean_pooling(va, premise_len, dim=1) # va_ave: [B, H]
va_max, va_arg_max = self.max_pooling(va, premise_len, dim=1) # va_max: [B, H]
vb_ave = self.mean_pooling(vb, hypothesis_len, dim=1) # vb_ave: [B, H]
vb_max, vb_arg_max = self.max_pooling(vb, hypothesis_len, dim=1) # vb_max: [B, H]

v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1) # v: [B, 4 * H]

# v_mlp = F.tanh(self.mlp_layer1(v)) # v_mlp: [B, H]
# prediction = self.mlp_layer2(v_mlp) # prediction: [B, N]

prediction = F.tanh(self.output(v)) # prediction: [B, N]

return prediction

@staticmethod
def calc_bi_attention(in_x1, in_x2, x1_len, x2_len):

# in_x1: [batch_size, x1_seq_len, hidden_size]
# in_x2: [batch_size, x2_seq_len, hidden_size]
# x1_len: [batch_size, x1_seq_len]
# x2_len: [batch_size, x2_seq_len]

assert in_x1.size()[0] == in_x2.size()[0]
assert in_x1.size()[2] == in_x2.size()[2]
# The batch size and hidden size must be equal.
assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1]
# The seq len in in_x and x_len must be equal.
assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0]

batch_size = in_x1.size()[0]
x1_max_len = in_x1.size()[1]
x2_max_len = in_x2.size()[1]

in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len]

attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len]

a_mask = x1_len.le(0.5).float() * -my_inf # [batch_size, x1_seq_len]
a_mask = a_mask.view(batch_size, x1_max_len, -1)
a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len]
b_mask = x2_len.le(0.5).float() * -my_inf
b_mask = b_mask.view(batch_size, -1, x2_max_len)
b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len]

attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len]
attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len]

out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size]
attention_b_t = torch.transpose(attention_b, 1, 2)
out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size]

return out_x1, out_x2

@staticmethod
def mean_pooling(tensor, mask, dim=0):
masks = mask.view(mask.size(0), mask.size(1), -1).float()
return torch.sum(tensor * masks, dim=dim) / torch.sum(masks, dim=1)

@staticmethod
def max_pooling(tensor, mask, dim=0):
masks = mask.view(mask.size(0), mask.size(1), -1)
masks = masks.expand(-1, -1, tensor.size(2)).float()
return torch.max(tensor + masks.le(0.5).float() * -my_inf, dim=dim)

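calc_bi_attention is a static method, so it can be exercised on its own; a small shape check (tensor sizes are illustrative, and the length tensors are assumed to be 0/1 masks):

import torch
from fastNLP.models.snli import SNLI

a = torch.randn(2, 4, 8)          # premise states    [B, PL, H]
b = torch.randn(2, 6, 8)          # hypothesis states [B, HL, H]
a_len = torch.ones(2, 4)          # 1 = real token, 0 = padding
b_len = torch.ones(2, 6)
ai, bi = SNLI.calc_bi_attention(a, b, a_len, b_len)
print(ai.shape, bi.shape)         # torch.Size([2, 4, 8]) torch.Size([2, 6, 8])
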
+ 10
- 9
fastNLP/modules/decoder/MLP.py View File

@@ -1,12 +1,15 @@
import torch
import torch.nn as nn
from fastNLP.modules.utils import initial_parameter


class MLP(nn.Module):
def __init__(self, size_layer, activation='relu' , initial_method = None):
def __init__(self, size_layer, activation='relu', initial_method=None):
"""Multilayer Perceptrons as a decoder

:param size_layer: list of int, define the size of MLP layers
:param activation: str or function, the activation function for hidden layers
:param size_layer: list of int, define the size of MLP layers.
:param activation: str or function, the activation function for hidden layers.
:param initial_method: str, the name of init method.

.. note::
There is no activation function applied to the output layer.
@@ -23,7 +26,7 @@ class MLP(nn.Module):

actives = {
'relu': nn.ReLU(),
'tanh': nn.Tanh()
'tanh': nn.Tanh(),
}
if activation in actives:
self.hidden_active = actives[activation]
@@ -31,7 +34,7 @@ class MLP(nn.Module):
self.hidden_active = activation
else:
raise ValueError("should set activation correctly: {}".format(activation))
initial_parameter(self, initial_method )
initial_parameter(self, initial_method)

def forward(self, x):
for layer in self.hiddens:
@@ -40,13 +43,11 @@ class MLP(nn.Module):
return x



if __name__ == '__main__':
net1 = MLP([5,10,5])
net2 = MLP([5,10,5], 'tanh')
net1 = MLP([5, 10, 5])
net2 = MLP([5, 10, 5], 'tanh')
for net in [net1, net2]:
x = torch.randn(5, 5)
y = net(x)
print(x)
print(y)

+ 15
- 0
fastNLP/modules/dropout.py View File

@@ -0,0 +1,15 @@
import torch

class TimestepDropout(torch.nn.Dropout):
"""This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single
dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step.
"""
def forward(self, x):
dropout_mask = x.new_ones(x.shape[0], x.shape[-1])
torch.nn.functional.dropout(dropout_mask, self.p, self.training, inplace=True)
dropout_mask = dropout_mask.unsqueeze(1) # [batch_size, 1, embedding_dim]
if self.inplace:
x *= dropout_mask
return
else:
return x * dropout_mask

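A quick check (illustrative) that TimestepDropout shares one mask across time steps, unlike ordinary nn.Dropout:

import torch
from fastNLP.modules.dropout import TimestepDropout

drop = TimestepDropout(p=0.5)
drop.train()                               # dropout is only active in training mode
x = torch.ones(2, 4, 6)                    # [batch_size, num_timesteps, embedding_dim]
y = drop(x)
# every time step of an example shares the same mask over the feature dimension
print(torch.equal(y[:, 0], y[:, 1]))       # True
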
+ 7
- 21
fastNLP/modules/encoder/char_embedding.py View File

@@ -1,12 +1,14 @@
import torch
import torch.nn.functional as F
from torch import nn
# from torch.nn.init import xavier_uniform

from fastNLP.modules.utils import initial_parameter


# from torch.nn.init import xavier_uniform
class ConvCharEmbedding(nn.Module):

def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5),initial_method = None):
def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None):
"""
Character Level Word Embedding
:param char_emb_size: the size of character level embedding. Default: 50
@@ -21,7 +23,7 @@ class ConvCharEmbedding(nn.Module):
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
for i in range(len(kernels))])

initial_parameter(self,initial_method)
initial_parameter(self, initial_method)

def forward(self, x):
"""
@@ -56,7 +58,7 @@ class LSTMCharEmbedding(nn.Module):
:param hidden_size: int, the number of hidden units. Default: equal to char_emb_size.
"""

def __init__(self, char_emb_size=50, hidden_size=None , initial_method= None):
def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None):
super(LSTMCharEmbedding, self).__init__()
self.hidden_size = char_emb_size if hidden_size is None else hidden_size

@@ -66,6 +68,7 @@ class LSTMCharEmbedding(nn.Module):
bias=True,
batch_first=True)
initial_parameter(self, initial_method)

def forward(self, x):
"""
:param x:[ n_batch*n_word, word_length, char_emb_size]
@@ -79,20 +82,3 @@ class LSTMCharEmbedding(nn.Module):

_, hidden = self.lstm(x, (h0, c0))
return hidden[0].squeeze().unsqueeze(2)


if __name__ == "__main__":
batch_size = 128
char_emb = 100
word_length = 1
x = torch.Tensor(batch_size, char_emb, word_length)
x = x.transpose(1, 2)
cce = ConvCharEmbedding(char_emb)
y = cce(x)
print("CNN Char Emb input: ", x.shape)
print("CNN Char Emb output: ", y.shape) # [128, 100]

lce = LSTMCharEmbedding(char_emb)
o = lce(x)
print("LSTM Char Emb input: ", x.shape)
print("LSTM Char Emb size: ", o.shape)

+ 4
- 1
fastNLP/modules/encoder/linear.py View File

@@ -1,6 +1,8 @@
import torch.nn as nn

from fastNLP.modules.utils import initial_parameter


class Linear(nn.Module):
"""
Linear module
@@ -12,10 +14,11 @@ class Linear(nn.Module):
bidirectional : If True, becomes a bidirectional RNN
"""

def __init__(self, input_size, output_size, bias=True,initial_method = None ):
def __init__(self, input_size, output_size, bias=True, initial_method=None):
super(Linear, self).__init__()
self.linear = nn.Linear(input_size, output_size, bias)
initial_parameter(self, initial_method)

def forward(self, x):
x = self.linear(x)
return x

+ 13
- 6
fastNLP/modules/encoder/lstm.py View File

@@ -14,16 +14,23 @@ class LSTM(nn.Module):
bidirectional : If True, becomes a bidirectional RNN. Default: False.
"""

def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, bidirectional=False,
initial_method=None):
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
bidirectional=False, bias=True, initial_method=None, get_hidden=False):
super(LSTM, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
dropout=dropout, bidirectional=bidirectional)
self.get_hidden = get_hidden
initial_parameter(self, initial_method)

def forward(self, x):
x, _ = self.lstm(x)
return x
def forward(self, x, h0=None, c0=None):
if h0 is not None and c0 is not None:
x, (ht, ct) = self.lstm(x, (h0, c0))
else:
x, (ht, ct) = self.lstm(x)
if self.get_hidden:
return x, (ht, ct)
else:
return x


if __name__ == "__main__":


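With the new signature the encoder can optionally return the final hidden state; a hedged usage sketch (sizes are made up):

import torch
from fastNLP.modules.encoder.lstm import LSTM

enc = LSTM(input_size=10, hidden_size=16, bidirectional=True, get_hidden=True)
x = torch.randn(2, 5, 10)                  # [batch, seq_len, input_size]
out, (h, c) = enc(x)
print(out.shape, h.shape)                  # torch.Size([2, 5, 32]) torch.Size([2, 2, 16])
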
+ 123
- 354
fastNLP/modules/encoder/variational_rnn.py View File

@@ -2,384 +2,153 @@ import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
from torch.nn.parameter import Parameter
from torch.nn.utils.rnn import PackedSequence

from fastNLP.modules.utils import initial_parameter

def default_initializer(hidden_size):
stdv = 1.0 / math.sqrt(hidden_size)

def forward(tensor):
nn.init.uniform_(tensor, -stdv, stdv)

return forward


def VarMaskedRecurrent(reverse=False):
def forward(input, hidden, cell, mask):
output = []
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
for i in steps:
if mask is None or mask[i].data.min() > 0.5:
hidden = cell(input[i], hidden)
elif mask[i].data.max() > 0.5:
hidden_next = cell(input[i], hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
else:
hidden = hidden + (hidden_next - hidden) * mask[i]
# hack to handle LSTM
output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

if reverse:
output.reverse()
output = torch.cat(output, 0).view(input.size(0), *output[0].size())

return hidden, output

return forward


def StackedRNN(inners, num_layers, lstm=False):
num_directions = len(inners)
total_layers = num_layers * num_directions

def forward(input, hidden, cells, mask):
assert (len(cells) == total_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for i in range(num_layers):
all_output = []
for j, inner in enumerate(inners):
l = i * num_directions + j
hy, output = inner(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
all_output.append(output)

input = torch.cat(all_output, input.dim() - 1)

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
rec_factory = VarMaskedRecurrent

if bidirectional:
layer = (rec_factory(), rec_factory(reverse=True))
else:
layer = (rec_factory(),)

func = StackedRNN(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
if batch_first:
input = input.transpose(0, 1)
if mask is not None:
mask = mask.transpose(0, 1)

nexth, output = func(input, hidden, cells, mask)

if batch_first:
output = output.transpose(0, 1)

return output, nexth

return forward

try:
from torch import flip
except ImportError:
def flip(x, dims):
indices = [slice(None)] * x.dim()
for dim in dims:
indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device)
return x[tuple(indices)]

class VarRnnCellWrapper(nn.Module):
"""Wrapper for normal RNN Cells, make it support variational dropout
"""
def __init__(self, cell, hidden_size, input_p, hidden_p):
super(VarRnnCellWrapper, self).__init__()
self.cell = cell
self.hidden_size = hidden_size
self.input_p = input_p
self.hidden_p = hidden_p

def VarMaskedStep():
def forward(input, hidden, cell, mask):
if mask is None or mask.data.min() > 0.5:
hidden = cell(input, hidden)
elif mask.data.max() > 0.5:
hidden_next = cell(input, hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
def forward(self, input, hidden, mask_x=None, mask_h=None):
"""
:param input: [seq_len, batch_size, input_size]
:param hidden: for LSTM, tuple of (h_0, c_0), [batch_size, hidden_size]
for other RNN, h_0, [batch_size, hidden_size]
:param mask_x: [batch_size, input_size] dropout mask for input
:param mask_h: [batch_size, hidden_size] dropout mask for hidden
:return output: [seq_len, batch_size, hidden_size]
hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]
for other RNN, h_n, [batch_size, hidden_size]
"""
is_lstm = isinstance(hidden, tuple)
input = input * mask_x.unsqueeze(0) if mask_x is not None else input
output_list = []
for x in input:
if is_lstm:
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
hidden = (hx * mask_h, cx) if mask_h is not None else (hx, cx)
else:
hidden = hidden + (hidden_next - hidden) * mask
# hack to handle LSTM
output = hidden[0] if isinstance(hidden, tuple) else hidden

return hidden, output

return forward


def StackedStep(layer, num_layers, lstm=False):
def forward(input, hidden, cells, mask):
assert (len(cells) == num_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for l in range(num_layers):
hy, output = layer(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
input = output

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedStep(num_layers=1, lstm=False):
layer = VarMaskedStep()

func = StackedStep(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
nexth, output = func(input, hidden, cells, mask)
return output, nexth

return forward

hidden *= mask_h if mask_h is not None else hidden
hidden = self.cell(x, hidden)
output_list.append(hidden[0] if is_lstm else hidden)
output = torch.stack(output_list, dim=0)
return output, hidden

class VarMaskedRNNBase(nn.Module):
def __init__(self, Cell, input_size, hidden_size,
num_layers=1, bias=True, batch_first=False,
dropout=(0, 0), bidirectional=False, initializer=None,initial_method = None, **kwargs):

super(VarMaskedRNNBase, self).__init__()
self.Cell = Cell
class VarRNNBase(nn.Module):
"""Implementation of Variational Dropout RNN network.
refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
https://arxiv.org/abs/1512.05287`.
"""
def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
bias=True, batch_first=False,
input_dropout=0, hidden_dropout=0, bidirectional=False):
super(VarRNNBase, self).__init__()
self.mode = mode
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bias = bias
self.batch_first = batch_first
self.input_dropout = input_dropout
self.hidden_dropout = hidden_dropout
self.bidirectional = bidirectional
self.lstm = False
num_directions = 2 if bidirectional else 1

self.all_cells = []
for layer in range(num_layers):
for direction in range(num_directions):
layer_input_size = input_size if layer == 0 else hidden_size * num_directions

cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs)
self.all_cells.append(cell)
self.add_module('cell%d' % (layer * num_directions + direction), cell)
initial_parameter(self, initial_method)
def reset_parameters(self):
for cell in self.all_cells:
cell.reset_parameters()

def reset_noise(self, batch_size):
for cell in self.all_cells:
cell.reset_noise(batch_size)
self.num_directions = 2 if bidirectional else 1
self._all_cells = nn.ModuleList()
for layer in range(self.num_layers):
for direction in range(self.num_directions):
input_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions
cell = Cell(input_size, self.hidden_size, bias)
self._all_cells.append(VarRnnCellWrapper(cell, self.hidden_size, input_dropout, hidden_dropout))
initial_parameter(self)

def forward(self, input, hx=None):
is_packed = isinstance(input, PackedSequence)
is_lstm = (self.mode == "LSTM")
if is_packed:
input, batch_sizes = input
max_batch_size = int(batch_sizes[0])
else:
batch_sizes = None
max_batch_size = input.size(0) if self.batch_first else input.size(1)

def forward(self, input, mask=None, hx=None):
batch_size = input.size(0) if self.batch_first else input.size(1)
if hx is None:
num_directions = 2 if self.bidirectional else 1
hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(),
requires_grad=True)
if self.lstm:
hx = input.new_zeros(self.num_layers * self.num_directions,
max_batch_size, self.hidden_size,
requires_grad=False)
if is_lstm:
hx = (hx, hx)

func = AutogradVarMaskedRNN(num_layers=self.num_layers,
batch_first=self.batch_first,
bidirectional=self.bidirectional,
lstm=self.lstm)

self.reset_noise(batch_size)

output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,)))
return output, hidden

def step(self, input, hx=None, mask=None):
'''
execute one step forward (only for one-directional RNN).
Args:
input (batch, input_size): input tensor of this step.
hx (num_layers, batch, hidden_size): the hidden state of last step.
mask (batch): the mask tensor of this step.
Returns:
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN.
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
'''
assert not self.bidirectional, "step only cannot be applied to bidirectional RNN."
batch_size = input.size(0)
if hx is None:
hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True)
if self.lstm:
hx = (hx, hx)
if self.batch_first:
input = input.transpose(0, 1)
batch_size = input.shape[1]

mask_x = input.new_ones((batch_size, self.input_size))
mask_out = input.new_ones((batch_size, self.hidden_size * self.num_directions))
mask_h = input.new_ones((batch_size, self.hidden_size))
nn.functional.dropout(mask_x, p=self.input_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_out, p=self.hidden_dropout, training=self.training, inplace=True)
nn.functional.dropout(mask_h, p=self.hidden_dropout, training=self.training, inplace=True)

hidden_list = []
for layer in range(self.num_layers):
output_list = []
for direction in range(self.num_directions):
input_x = input if direction == 0 else flip(input, [0])
idx = self.num_directions * layer + direction
cell = self._all_cells[idx]
hi = (hx[0][idx], hx[1][idx]) if is_lstm else hx[idx]
mask_xi = mask_x if layer == 0 else mask_out
output_x, hidden_x = cell(input_x, hi, mask_xi, mask_h)
output_list.append(output_x if direction == 0 else flip(output_x, [0]))
hidden_list.append(hidden_x)
input = torch.cat(output_list, dim=-1)

output = input.transpose(0, 1) if self.batch_first else input
if is_lstm:
h_list, c_list = zip(*hidden_list)
hn = torch.stack(h_list, dim=0)
cn = torch.stack(c_list, dim=0)
hidden = (hn, cn)
else:
hidden = torch.stack(hidden_list, dim=0)

func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm)
if is_packed:
output = PackedSequence(output, batch_sizes)

output, hidden = func(input, self.all_cells, hx, mask)
return output, hidden


class VarMaskedFastLSTM(VarMaskedRNNBase):
class VarLSTM(VarRNNBase):
"""Variational Dropout LSTM.
"""
def __init__(self, *args, **kwargs):
super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs)
self.lstm = True


class VarRNNCellBase(nn.Module):
def __repr__(self):
s = '{name}({input_size}, {hidden_size}'
if 'bias' in self.__dict__ and self.bias is not True:
s += ', bias={bias}'
if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
s += ', nonlinearity={nonlinearity}'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)
super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs)

def reset_noise(self, batch_size):
"""
Should be overriden by all subclasses.
Args:
batch_size: (int) batch size of input.
"""
raise NotImplementedError


class VarFastLSTMCell(VarRNNCellBase):
"""
A long short-term memory (LSTM) cell with variational dropout.
.. math::
\begin{array}{ll}
i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\
o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
c' = f * c + i * g \\
h' = o * \tanh(c') \\
\end{array}
class VarRNN(VarRNNBase):
"""Variational Dropout RNN.
"""
def __init__(self, *args, **kwargs):
super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs)

def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None,initial_method =None):
super(VarFastLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
if bias:
self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
else:
self.register_parameter('bias_ih', None)
self.register_parameter('bias_hh', None)

self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer
self.reset_parameters()
p_in, p_hidden = p
if p_in < 0 or p_in > 1:
raise ValueError("input dropout probability has to be between 0 and 1, "
"but got {}".format(p_in))
if p_hidden < 0 or p_hidden > 1:
raise ValueError("hidden state dropout probability has to be between 0 and 1, "
"but got {}".format(p_hidden))
self.p_in = p_in
self.p_hidden = p_hidden
self.noise_in = None
self.noise_hidden = None
initial_parameter(self, initial_method)
def reset_parameters(self):
for weight in self.parameters():
if weight.dim() == 1:
weight.data.zero_()
else:
self.initializer(weight.data)

def reset_noise(self, batch_size):
if self.training:
if self.p_in:
noise = self.weight_ih.data.new(batch_size, self.input_size)
self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in))
else:
self.noise_in = None

if self.p_hidden:
noise = self.weight_hh.data.new(batch_size, self.hidden_size)
self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden))
else:
self.noise_hidden = None
else:
self.noise_in = None
self.noise_hidden = None

def forward(self, input, hx):
return self.__forward(
input, hx,
self.weight_ih, self.weight_hh,
self.bias_ih, self.bias_hh,
self.noise_in, self.noise_hidden,
)

@staticmethod
def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
if noise_in is not None:
if input.is_cuda:
input = input * noise_in.cuda(input.get_device())
else:
input = input * noise_in

if input.is_cuda:
w_ih = w_ih.cuda(input.get_device())
w_hh = w_hh.cuda(input.get_device())
hidden = [h.cuda(input.get_device()) for h in hidden]
b_ih = b_ih.cuda(input.get_device())
b_hh = b_hh.cuda(input.get_device())
igates = F.linear(input, w_ih.cuda(input.get_device()))
hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \
else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh)
state = fusedBackend.LSTMFused.apply
# print("use backend")
# use some magic function
return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)

hx, cx = hidden
if noise_hidden is not None:
hx = hx * noise_hidden
gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

ingate = F.sigmoid(ingate)
forgetgate = F.sigmoid(forgetgate)
cellgate = F.tanh(cellgate)
outgate = F.sigmoid(outgate)

cy = (forgetgate * cx) + (ingate * cellgate)
hy = outgate * F.tanh(cy)

return hy, cy
class VarGRU(VarRNNBase):
"""Variational Dropout GRU.
"""
def __init__(self, *args, **kwargs):
super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs)

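The rewritten VarLSTM keeps the nn.LSTM-style call convention (and is the variant BiaffineParser instantiates when use_var_lstm=true); a hedged shape check with made-up sizes:

import torch
from fastNLP.modules.encoder.variational_rnn import VarLSTM

rnn = VarLSTM(input_size=10, hidden_size=16, num_layers=2, batch_first=True,
              input_dropout=0.33, hidden_dropout=0.33, bidirectional=True)
x = torch.randn(2, 5, 10)                  # [batch, seq_len, input_size]
out, (h, c) = rnn(x)
print(out.shape, h.shape)                  # torch.Size([2, 5, 32]) torch.Size([4, 2, 16])
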
+ 37
- 0
reproduction/Biaffine_parser/cfg.cfg View File

@@ -0,0 +1,37 @@
[train]
epochs = 50
batch_size = 16
pickle_path = "./save/"
validate = true
save_best_dev = false
use_cuda = true
model_saved_path = "./save/"
task = "parse"


[test]
save_output = true
validate_in_training = true
save_dev_input = false
save_loss = true
batch_size = 16
pickle_path = "./save/"
use_cuda = true
task = "parse"

[model]
word_vocab_size = -1
word_emb_dim = 100
pos_vocab_size = -1
pos_emb_dim = 100
rnn_layers = 3
rnn_hidden_size = 400
arc_mlp_size = 500
label_mlp_size = 100
num_label = -1
dropout = 0.33
use_var_lstm=true
use_greedy_infer=false

[optim]
lr = 2e-3

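Note that word_vocab_size, pos_vocab_size and num_label are left as -1 placeholders here; run.py below overwrites them from the vocabularies it builds before constructing the model, roughly:

model_args['word_vocab_size'] = len(word_v)   # word vocabulary built from the training data
model_args['pos_vocab_size'] = len(pos_v)     # POS tag vocabulary
model_args['num_label'] = len(tag_v)          # dependency label vocabulary
model = BiaffineParser(**model_args.data)
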
+ 260
- 0
reproduction/Biaffine_parser/run.py View File

@@ -0,0 +1,260 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from collections import defaultdict
import math
import torch

from fastNLP.core.trainer import Trainer
from fastNLP.core.instance import Instance
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core.field import TextField, SeqLabelField
from fastNLP.core.preprocess import SeqLabelPreprocess, load_pickle
from fastNLP.core.tester import Tester
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.models.biaffine_parser import BiaffineParser
from fastNLP.saver.model_saver import ModelSaver

# change to this file's directory if launched from elsewhere
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))

class MyDataLoader(object):
def __init__(self, pickle_path):
self.pickle_path = pickle_path

def load(self, path, word_v=None, pos_v=None, headtag_v=None):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet(name='conll')
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if word_v is not None:
word_v.update(res[0])
pos_v.update(res[1])
headtag_v.update(res[3])
ds.append(Instance(word_seq=TextField(res[0], is_target=False),
pos_seq=TextField(res[1], is_target=False),
head_indices=SeqLabelField(res[2], is_target=True),
head_labels=TextField(res[3], is_target=True),
seq_mask=SeqLabelField([1 for _ in range(len(res[0]))], is_target=False)))

return ds

def get_one(self, sample):
text = ['<root>']
pos_tags = ['<root>']
heads = [0]
head_tags = ['root']
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
continue
text.append(t1)
pos_tags.append(t2)
heads.append(int(t3))
head_tags.append(t4)
return (text, pos_tags, heads, head_tags)

def index_data(self, dataset, word_v, pos_v, tag_v):
dataset.index_field('word_seq', word_v)
dataset.index_field('pos_seq', pos_v)
dataset.index_field('head_labels', tag_v)

# datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT"
datadir = "/home/yfshao/UD_English-EWT"
cfgfile = './cfg.cfg'
train_data_name = "en_ewt-ud-train.conllu"
dev_data_name = "en_ewt-ud-dev.conllu"
emb_file_name = '/home/yfshao/glove.6B.100d.txt'
processed_datadir = './save'

# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader.load_config(cfgfile, {"train": train_args, "test": test_args, "model": model_args, "optim": optim_args})

# Data Loader
def save_data(dirpath, **kwargs):
import _pickle
if not os.path.exists(dirpath):
os.mkdir(dirpath)
for name, data in kwargs.items():
with open(os.path.join(dirpath, name+'.pkl'), 'wb') as f:
_pickle.dump(data, f)


def load_data(dirpath):
import _pickle
datas = {}
for f_name in os.listdir(dirpath):
if not f_name.endswith('.pkl'):
continue
name = f_name[:-4]
with open(os.path.join(dirpath, f_name), 'rb') as f:
datas[name] = _pickle.load(f)
return datas

class MyTester(object):
def __init__(self, batch_size, use_cuda=False, **kwargs):
self.batch_size = batch_size
self.use_cuda = use_cuda

def test(self, model, dataset):
self.model = model.cuda() if self.use_cuda else model
self.model.eval()
batchiter = Batch(dataset, self.batch_size, SequentialSampler(), self.use_cuda)
eval_res = defaultdict(list)
i = 0
for batch_x, batch_y in batchiter:
with torch.no_grad():
pred_y = self.model(**batch_x)
eval_one = self.model.evaluate(**pred_y, **batch_y)
i += self.batch_size
for eval_name, tensor in eval_one.items():
eval_res[eval_name].append(tensor)
tmp = {}
for eval_name, tensorlist in eval_res.items():
tmp[eval_name] = torch.cat(tensorlist, dim=0)

self.res = self.model.metrics(**tmp)

def show_metrics(self):
s = ""
for name, val in self.res.items():
s += '{}: {:.2f}\t'.format(name, val)
return s


loader = MyDataLoader('')
try:
data_dict = load_data(processed_datadir)
word_v = data_dict['word_v']
pos_v = data_dict['pos_v']
tag_v = data_dict['tag_v']
train_data = data_dict['train_data']
dev_data = data_dict['dev_data']
print('use saved pickles')

except Exception as _:
print('load raw data and preprocess')
word_v = Vocabulary(need_default=True, min_freq=2)
pos_v = Vocabulary(need_default=True)
tag_v = Vocabulary(need_default=False)
train_data = loader.load(os.path.join(datadir, train_data_name), word_v, pos_v, tag_v)
dev_data = loader.load(os.path.join(datadir, dev_data_name))
save_data(processed_datadir, word_v=word_v, pos_v=pos_v, tag_v=tag_v, train_data=train_data, dev_data=dev_data)

loader.index_data(train_data, word_v, pos_v, tag_v)
loader.index_data(dev_data, word_v, pos_v, tag_v)
print(len(train_data))
print(len(dev_data))
ep = train_args['epochs']
train_args['epochs'] = math.ceil(50000.0 / len(train_data) * train_args['batch_size']) if ep <= 0 else ep
model_args['word_vocab_size'] = len(word_v)
model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)


def train():
# Trainer
trainer = Trainer(**train_args.data)

def _define_optim(obj):
obj._optimizer = torch.optim.Adam(obj._model.parameters(), **optim_args.data)
obj._scheduler = torch.optim.lr_scheduler.LambdaLR(obj._optimizer, lambda ep: .75 ** (ep / 5e4))

def _update(obj):
obj._scheduler.step()
obj._optimizer.step()

trainer.define_optimizer = lambda: _define_optim(trainer)
trainer.update = lambda: _update(trainer)
trainer.get_loss = lambda predict, truth: trainer._loss_func(**predict, **truth)
trainer._create_validator = lambda x: MyTester(**test_args.data)

# Model
model = BiaffineParser(**model_args.data)

# use pretrain embedding
embed, _ = EmbedLoader.load_embedding(model_args['word_emb_dim'], emb_file_name, 'glove', word_v, os.path.join(processed_datadir, 'word_emb.pkl'))
model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
model.word_embedding.padding_idx = word_v.padding_idx
model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
model.pos_embedding.padding_idx = pos_v.padding_idx
model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as _:
print("No saved model. Continue.")
pass

# Start training
trainer.train(model, train_data, dev_data)
print("Training finished!")

# Saver
saver = ModelSaver("./save/saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")


def test():
# Tester
tester = MyTester(**test_args.data)

# Model
model = BiaffineParser(**model_args.data)

try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as _:
print("No saved model. Abort test.")
raise

# Start testing
tester.test(model, dev_data)
print(tester.show_metrics())
print("Testing finished!")



if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Run the Biaffine dependency parser')
parser.add_argument('--mode', help='set the running mode', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
test()
elif args.mode == 'infer':
infer()
else:
print('no mode specified for model!')
parser.print_help()

+ 1
- 17
reproduction/Char-aware_NLM/main.py View File

@@ -1,24 +1,8 @@
from fastNLP.core.loss import Loss
from fastNLP.core.preprocess import Preprocessor
from fastNLP.core.trainer import Trainer
from fastNLP.loader.dataset_loader import LMDataSetLoader
from fastNLP.models.char_language_model import CharLM

PICKLE = "./save/"


def train():
loader = LMDataSetLoader()
train_data = loader.load()

pre = Preprocessor(label_is_seq=True, share_vocab=True)
train_set = pre.run(train_data, pickle_path=PICKLE)

model = CharLM(50, 50, pre.vocab_size, pre.char_vocab_size)

trainer = Trainer(task="language_model", loss=Loss("cross_entropy"))

trainer.train(model, train_set)
pass


if __name__ == "__main__":


+ 1
- 1
reproduction/chinese_word_segment/run.py View File

@@ -12,7 +12,7 @@ from fastNLP.loader.model_loader import ModelLoader
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.dataset import DataSet
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator



+ 1
- 12
test/core/test_batch.py View File

@@ -3,7 +3,7 @@ import unittest
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet, create_dataset_from_lists
from fastNLP.core.dataset import DataSet
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance

@@ -51,14 +51,3 @@ class TestCase1(unittest.TestCase):
self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
self.assertTrue(isinstance(batch_y, dict))
self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))


class TestCase2(unittest.TestCase):
def test(self):
data = DataSet()
for text in texts:
x = TextField(text, is_target=False)
ins = Instance(text=x)
data.append(ins)
data_set = create_dataset_from_lists(texts, vocab, has_target=False)
self.assertTrue(type(data) == type(data_set))

+ 6
- 195
test/core/test_dataset.py View File

@@ -1,7 +1,6 @@
import unittest

from fastNLP.core.dataset import SeqLabelDataSet, TextClassifyDataSet
from fastNLP.core.dataset import create_dataset_from_lists
from fastNLP.loader.dataset_loader import convert_seq2seq_dataset, convert_seq_dataset


class TestDataSet(unittest.TestCase):
@@ -19,8 +18,9 @@ class TestDataSet(unittest.TestCase):
label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4}

def test_case_1(self):
data_set = create_dataset_from_lists(self.labeled_data_list, self.word_vocab, has_target=True,
label_vocab=self.label_vocab)
data_set = convert_seq2seq_dataset(self.labeled_data_list)
data_set.index_field("word_seq", self.word_vocab)
data_set.index_field("label_seq", self.label_vocab)
self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
@@ -39,7 +39,8 @@ class TestDataSet(unittest.TestCase):
[self.label_vocab[c] for c in self.labeled_data_list[0][1]])

def test_case_2(self):
data_set = create_dataset_from_lists(self.unlabeled_data_list, self.word_vocab, has_target=False)
data_set = convert_seq_dataset(self.unlabeled_data_list)
data_set.index_field("word_seq", self.word_vocab)

self.assertEqual(len(data_set), len(self.unlabeled_data_list))
self.assertTrue(len(data_set) > 0)
@@ -51,193 +52,3 @@ class TestDataSet(unittest.TestCase):
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.unlabeled_data_list[0]])


class TestDataSetConvertion(unittest.TestCase):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3}
label_vocab = {"1": 1, "2": 2, "3": 3, "4": 4}

def test_case_1(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
return labeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path")

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])

self.assertTrue("truth" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["truth"], "text"))
self.assertTrue(hasattr(data_set[0].fields["truth"], "_index"))
self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)

def test_case_2(self):
def loader(path):
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
return unlabeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path", vocabs={"word_vocab": self.word_vocab}, infer=True)

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)

def test_case_3(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
[["a", "b", "e", "d"], ["1", "2", "3", "4"]],
]
return labeled_data_list

data_set = SeqLabelDataSet(load_func=loader)
data_set.load("any_path", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab})

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("truth" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["truth"], "text"))
self.assertTrue(hasattr(data_set[0].fields["truth"], "_index"))
self.assertEqual(data_set[0].fields["truth"].text, self.labeled_data_list[0][1])
self.assertEqual(data_set[0].fields["truth"]._index,
[self.label_vocab[c] for c in self.labeled_data_list[0][1]])

self.assertTrue("word_seq_origin_len" in data_set[0].fields)


class TestDataSetConvertionHHH(unittest.TestCase):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
word_vocab = {"a": 0, "b": 1, "e": 2, "d": 3}
label_vocab = {"A": 1, "B": 2, "C": 3}

def test_case_1(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
return labeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx")

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])

self.assertTrue("label" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["label"], "label"))
self.assertTrue(hasattr(data_set[0].fields["label"], "_index"))
self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1])

def test_case_2(self):
def loader(path):
labeled_data_list = [
[["a", "b", "e", "d"], "A"],
[["a", "b", "e", "d"], "C"],
[["a", "b", "e", "d"], "B"],
]
return labeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx", vocabs={"word_vocab": self.word_vocab, "label_vocab": self.label_vocab})

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

self.assertTrue("label" in data_set[0].fields)
self.assertTrue(hasattr(data_set[0].fields["label"], "label"))
self.assertTrue(hasattr(data_set[0].fields["label"], "_index"))
self.assertEqual(data_set[0].fields["label"].label, self.labeled_data_list[0][1])
self.assertEqual(data_set[0].fields["label"]._index, self.label_vocab[self.labeled_data_list[0][1]])

def test_case_3(self):
def loader(path):
unlabeled_data_list = [
["a", "b", "e", "d"],
["a", "b", "e", "d"],
["a", "b", "e", "d"]
]
return unlabeled_data_list

data_set = TextClassifyDataSet(load_func=loader)
data_set.load("xxx", vocabs={"word_vocab": self.word_vocab}, infer=True)

self.assertEqual(len(data_set), len(self.labeled_data_list))
self.assertTrue(len(data_set) > 0)
self.assertTrue(hasattr(data_set[0], "fields"))
self.assertTrue("word_seq" in data_set[0].fields)

self.assertTrue(hasattr(data_set[0].fields["word_seq"], "text"))
self.assertTrue(hasattr(data_set[0].fields["word_seq"], "_index"))
self.assertEqual(data_set[0].fields["word_seq"].text, self.labeled_data_list[0][0])
self.assertEqual(data_set[0].fields["word_seq"]._index,
[self.word_vocab[c] for c in self.labeled_data_list[0][0]])

+ 6
- 7
test/core/test_predictor.py View File

@@ -1,11 +1,12 @@
import os
import unittest

from fastNLP.core.dataset import TextClassifyDataSet, SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.predictor import Predictor
from fastNLP.core.preprocess import save_pickle
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.loader.dataset_loader import convert_seq_dataset
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.models.sequence_modeling import SeqLabeling

@@ -42,8 +43,8 @@ class TestPredictor(unittest.TestCase):
predictor = Predictor("./save/", pre.text_classify_post_processor)

# Load infer data
infer_data_set = TextClassifyDataSet(load_func=BaseLoader.load)
infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx})
infer_data_set = convert_seq_dataset(infer_data)
infer_data_set.index_field("word_seq", vocab)

results = predictor.predict(network=model, data=infer_data_set)

@@ -54,14 +55,12 @@ class TestPredictor(unittest.TestCase):
self.assertTrue(isinstance(res, str))
self.assertTrue(res in class_vocab.word2idx)

del model, predictor, infer_data_set
del model, predictor
infer_data_set.set_origin_len("word_seq")

model = SeqLabeling(model_args)
predictor = Predictor("./save/", pre.seq_label_post_processor)

infer_data_set = SeqLabelDataSet(load_func=BaseLoader.load)
infer_data_set.convert_for_infer(infer_data, vocabs={"word_vocab": vocab.word2idx})

results = predictor.predict(network=model, data=infer_data_set)
self.assertTrue(isinstance(results, list))
self.assertEqual(len(results), len(infer_data))
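
Note: the hunk above replaces the old TextClassifyDataSet / SeqLabelDataSet wrappers with a generic conversion plus explicit indexing. A minimal sketch of that inference path in isolation follows; the sample sentences and vocabulary are made up for illustration, and treating convert_seq_dataset's input as a list of token lists is an assumption based on the surrounding tests:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import convert_seq_dataset

# Hypothetical raw inference data: a list of tokenised sentences.
infer_data = [["a", "b", "c"], ["d", "e"]]

vocab = Vocabulary()
vocab.update(infer_data)

infer_data_set = convert_seq_dataset(infer_data)   # wrap raw sequences as a DataSet
infer_data_set.index_field("word_seq", vocab)      # map tokens to vocabulary indices
infer_data_set.set_origin_len("word_seq")          # record original lengths, as the SeqLabeling branch above does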


+ 0
- 72
test/core/test_preprocess.py View File

@@ -1,72 +0,0 @@
import os
import unittest

from fastNLP.core.dataset import DataSet
from fastNLP.core.preprocess import SeqLabelPreprocess

data = [
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
[['Tom', 'and', 'Jerry', '.'], ['n', '&', 'n', '.']],
[['Hello', 'world', '!'], ['a', 'n', '.']],
]


class TestCase1(unittest.TestCase):
def test(self):
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(train_dev_data=data, train_dev_split=0.4,
pickle_path="./save")
self.assertEqual(len(result), 2)
self.assertEqual(type(result[0]), DataSet)
self.assertEqual(type(result[1]), DataSet)

os.system("rm -rf save")
print("pickle path deleted")


class TestCase2(unittest.TestCase):
def test(self):
if os.path.exists("./save"):
for root, dirs, files in os.walk("./save", topdown=False):
for name in files:
os.remove(os.path.join(root, name))
for name in dirs:
os.rmdir(os.path.join(root, name))
result = SeqLabelPreprocess().run(test_data=data, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=False)
self.assertEqual(len(result), 3)
self.assertEqual(type(result[0]), DataSet)
self.assertEqual(type(result[1]), DataSet)
self.assertEqual(type(result[2]), DataSet)

os.system("rm -rf save")
print("pickle path deleted")


class TestCase3(unittest.TestCase):
def test(self):
num_folds = 2
result = SeqLabelPreprocess().run(test_data=None, train_dev_data=data,
pickle_path="./save", train_dev_split=0.4,
cross_val=True, n_fold=num_folds)
self.assertEqual(len(result), 2)
self.assertEqual(len(result[0]), num_folds)
self.assertEqual(len(result[1]), num_folds)
for data_set in result[0] + result[1]:
self.assertEqual(type(data_set), DataSet)

os.system("rm -rf save")
print("pickle path deleted")

+ 2
- 2
test/core/test_tester.py View File

@@ -1,7 +1,7 @@
import os
import unittest

from fastNLP.core.dataset import SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
@@ -35,7 +35,7 @@ class TestTester(unittest.TestCase):
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

data_set = SeqLabelDataSet()
data_set = DataSet()
for example in train_data:
text, label = example[0], example[1]
x = TextField(text, False)


+ 2
- 2
test/core/test_trainer.py View File

@@ -1,7 +1,7 @@
import os
import unittest

from fastNLP.core.dataset import SeqLabelDataSet
from fastNLP.core.dataset import DataSet
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.field import TextField, LabelField
from fastNLP.core.instance import Instance
@@ -36,7 +36,7 @@ class TestTrainer(unittest.TestCase):
vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

data_set = SeqLabelDataSet()
data_set = DataSet()
for example in train_data:
text, label = example[0], example[1]
x = TextField(text, False)


+ 25
- 0
test/data_for_tests/config View File

@@ -45,3 +45,28 @@ use_cuda = true
learn_rate = 1e-3
momentum = 0.9
model_name = "class_model.pkl"

[snli_trainer]
epochs = 5
batch_size = 32
validate = true
save_best_dev = true
use_cuda = true
learn_rate = 1e-4
loss = "cross_entropy"
print_every_step = 1000

[snli_tester]
batch_size = 512
use_cuda = true

[snli_model]
model_name = "snli_model.pkl"
embed_dim = 300
hidden_size = 300
batch_first = true
dropout = 0.5
gpu = true
embed_file = "./../data_for_tests/glove.840B.300d.txt"
embed_pkl = "./snli/embed.pkl"
examples = 0
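
Note: the new [snli_*] sections can be read with the same ConfigLoader / ConfigSection pattern used by the model tests later in this diff. A minimal sketch; the fixture path and section names come from the file above, and constructing ConfigSection with no arguments is assumed from its usage elsewhere in the test suite:

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection

# Empty sections to be filled from the fixture file shown above.
trainer_args = ConfigSection()
model_args = ConfigSection()
ConfigLoader().load_config("./test/data_for_tests/config",
                           {"snli_trainer": trainer_args, "snli_model": model_args})

# Loaded values are read with dict-style indexing, e.g.:
batch_size = trainer_args["batch_size"]   # 32, per the [snli_trainer] section
embed_dim = model_args["embed_dim"]       # 300, per the [snli_model] section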

+ 12
- 0
test/data_for_tests/glove.6B.50d_test.txt View File

@@ -0,0 +1,12 @@
the 0.418 0.24968 -0.41242 0.1217 0.34527 -0.044457 -0.49688 -0.17862 -0.00066023 -0.6566 0.27843 -0.14767 -0.55677 0.14658 -0.0095095 0.011658 0.10204 -0.12792 -0.8443 -0.12181 -0.016801 -0.33279 -0.1552 -0.23131 -0.19181 -1.8823 -0.76746 0.099051 -0.42125 -0.19526 4.0071 -0.18594 -0.52287 -0.31681 0.00059213 0.0074449 0.17778 -0.15897 0.012041 -0.054223 -0.29871 -0.15749 -0.34758 -0.045637 -0.44251 0.18785 0.0027849 -0.18411 -0.11514 -0.78581
, 0.013441 0.23682 -0.16899 0.40951 0.63812 0.47709 -0.42852 -0.55641 -0.364 -0.23938 0.13001 -0.063734 -0.39575 -0.48162 0.23291 0.090201 -0.13324 0.078639 -0.41634 -0.15428 0.10068 0.48891 0.31226 -0.1252 -0.037512 -1.5179 0.12612 -0.02442 -0.042961 -0.28351 3.5416 -0.11956 -0.014533 -0.1499 0.21864 -0.33412 -0.13872 0.31806 0.70358 0.44858 -0.080262 0.63003 0.32111 -0.46765 0.22786 0.36034 -0.37818 -0.56657 0.044691 0.30392
. 0.15164 0.30177 -0.16763 0.17684 0.31719 0.33973 -0.43478 -0.31086 -0.44999 -0.29486 0.16608 0.11963 -0.41328 -0.42353 0.59868 0.28825 -0.11547 -0.041848 -0.67989 -0.25063 0.18472 0.086876 0.46582 0.015035 0.043474 -1.4671 -0.30384 -0.023441 0.30589 -0.21785 3.746 0.0042284 -0.18436 -0.46209 0.098329 -0.11907 0.23919 0.1161 0.41705 0.056763 -6.3681e-05 0.068987 0.087939 -0.10285 -0.13931 0.22314 -0.080803 -0.35652 0.016413 0.10216
of 0.70853 0.57088 -0.4716 0.18048 0.54449 0.72603 0.18157 -0.52393 0.10381 -0.17566 0.078852 -0.36216 -0.11829 -0.83336 0.11917 -0.16605 0.061555 -0.012719 -0.56623 0.013616 0.22851 -0.14396 -0.067549 -0.38157 -0.23698 -1.7037 -0.86692 -0.26704 -0.2589 0.1767 3.8676 -0.1613 -0.13273 -0.68881 0.18444 0.0052464 -0.33874 -0.078956 0.24185 0.36576 -0.34727 0.28483 0.075693 -0.062178 -0.38988 0.22902 -0.21617 -0.22562 -0.093918 -0.80375
to 0.68047 -0.039263 0.30186 -0.17792 0.42962 0.032246 -0.41376 0.13228 -0.29847 -0.085253 0.17118 0.22419 -0.10046 -0.43653 0.33418 0.67846 0.057204 -0.34448 -0.42785 -0.43275 0.55963 0.10032 0.18677 -0.26854 0.037334 -2.0932 0.22171 -0.39868 0.20912 -0.55725 3.8826 0.47466 -0.95658 -0.37788 0.20869 -0.32752 0.12751 0.088359 0.16351 -0.21634 -0.094375 0.018324 0.21048 -0.03088 -0.19722 0.082279 -0.09434 -0.073297 -0.064699 -0.26044
and 0.26818 0.14346 -0.27877 0.016257 0.11384 0.69923 -0.51332 -0.47368 -0.33075 -0.13834 0.2702 0.30938 -0.45012 -0.4127 -0.09932 0.038085 0.029749 0.10076 -0.25058 -0.51818 0.34558 0.44922 0.48791 -0.080866 -0.10121 -1.3777 -0.10866 -0.23201 0.012839 -0.46508 3.8463 0.31362 0.13643 -0.52244 0.3302 0.33707 -0.35601 0.32431 0.12041 0.3512 -0.069043 0.36885 0.25168 -0.24517 0.25381 0.1367 -0.31178 -0.6321 -0.25028 -0.38097
in 0.33042 0.24995 -0.60874 0.10923 0.036372 0.151 -0.55083 -0.074239 -0.092307 -0.32821 0.09598 -0.82269 -0.36717 -0.67009 0.42909 0.016496 -0.23573 0.12864 -1.0953 0.43334 0.57067 -0.1036 0.20422 0.078308 -0.42795 -1.7984 -0.27865 0.11954 -0.12689 0.031744 3.8631 -0.17786 -0.082434 -0.62698 0.26497 -0.057185 -0.073521 0.46103 0.30862 0.12498 -0.48609 -0.0080272 0.031184 -0.36576 -0.42699 0.42164 -0.11666 -0.50703 -0.027273 -0.53285
a 0.21705 0.46515 -0.46757 0.10082 1.0135 0.74845 -0.53104 -0.26256 0.16812 0.13182 -0.24909 -0.44185 -0.21739 0.51004 0.13448 -0.43141 -0.03123 0.20674 -0.78138 -0.20148 -0.097401 0.16088 -0.61836 -0.18504 -0.12461 -2.2526 -0.22321 0.5043 0.32257 0.15313 3.9636 -0.71365 -0.67012 0.28388 0.21738 0.14433 0.25926 0.23434 0.4274 -0.44451 0.13813 0.36973 -0.64289 0.024142 -0.039315 -0.26037 0.12017 -0.043782 0.41013 0.1796
" 0.25769 0.45629 -0.76974 -0.37679 0.59272 -0.063527 0.20545 -0.57385 -0.29009 -0.13662 0.32728 1.4719 -0.73681 -0.12036 0.71354 -0.46098 0.65248 0.48887 -0.51558 0.039951 -0.34307 -0.014087 0.86488 0.3546 0.7999 -1.4995 -1.8153 0.41128 0.23921 -0.43139 3.6623 -0.79834 -0.54538 0.16943 -0.82017 -0.3461 0.69495 -1.2256 -0.17992 -0.057474 0.030498 -0.39543 -0.38515 -1.0002 0.087599 -0.31009 -0.34677 -0.31438 0.75004 0.97065
's 0.23727 0.40478 -0.20547 0.58805 0.65533 0.32867 -0.81964 -0.23236 0.27428 0.24265 0.054992 0.16296 -1.2555 -0.086437 0.44536 0.096561 -0.16519 0.058378 -0.38598 0.086977 0.0033869 0.55095 -0.77697 -0.62096 0.092948 -2.5685 -0.67739 0.10151 -0.48643 -0.057805 3.1859 -0.017554 -0.16138 0.055486 -0.25885 -0.33938 -0.19928 0.26049 0.10478 -0.55934 -0.12342 0.65961 -0.51802 -0.82995 -0.082739 0.28155 -0.423 -0.27378 -0.007901 -0.030231



+ 12
- 2
test/loader/test_dataset_loader.py View File

@@ -3,7 +3,7 @@ import unittest

from fastNLP.loader.dataset_loader import POSDataSetLoader, LMDataSetLoader, TokenizeDataSetLoader, \
PeopleDailyCorpusLoader, ConllLoader
from fastNLP.core.dataset import DataSet

class TestDatasetLoader(unittest.TestCase):
def test_case_1(self):
@@ -15,13 +15,23 @@ class TestDatasetLoader(unittest.TestCase):

def test_case_TokenizeDatasetLoader(self):
loader = TokenizeDataSetLoader()
data = loader.load("./test/data_for_tests/cws_pku_utf_8", max_seq_len=32)
filepath = "./test/data_for_tests/cws_pku_utf_8"
data = loader.load(filepath, max_seq_len=32)
assert len(data) > 0

data1 = DataSet()
data1.read_tokenize(filepath, max_seq_len=32)
assert len(data1) > 0
print("pass TokenizeDataSetLoader test!")

def test_case_POSDatasetLoader(self):
loader = POSDataSetLoader()
filepath = "./test/data_for_tests/people.txt"
data = loader.load("./test/data_for_tests/people.txt")
datas = loader.load_lines("./test/data_for_tests/people.txt")

data1 = DataSet().read_pos(filepath)
assert len(data1) > 0
print("pass POSDataSetLoader test!")

def test_case_LMDatasetLoader(self):


+ 33
- 0
test/loader/test_embed_loader.py View File

@@ -0,0 +1,33 @@
import unittest
import os

import torch

from fastNLP.loader.embed_loader import EmbedLoader
from fastNLP.core.vocabulary import Vocabulary


class TestEmbedLoader(unittest.TestCase):
glove_path = './test/data_for_tests/glove.6B.50d_test.txt'
pkl_path = './save'
raw_texts = ["i am a cat",
"this is a test of new batch",
"ha ha",
"I am a good boy .",
"This is the most beautiful girl ."
]
texts = [text.strip().split() for text in raw_texts]
vocab = Vocabulary()
vocab.update(texts)
def test1(self):
emb, _ = EmbedLoader.load_embedding(50, self.glove_path, 'glove', self.vocab, self.pkl_path)
self.assertTrue(emb.shape[0] == (len(self.vocab)))
self.assertTrue(emb.shape[1] == 50)
os.remove(self.pkl_path)
def test2(self):
try:
_ = EmbedLoader.load_embedding(100, self.glove_path, 'glove', self.vocab, self.pkl_path)
self.fail(msg="load dismatch embedding")
except ValueError:
pass
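
Note: outside the unittest harness, the same EmbedLoader call looks as follows. The argument order (embedding dim, pretrained file, embedding type, vocabulary, pickle path) and the two-element return value are taken directly from the test above; the sentence used to build the vocabulary is made up:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.embed_loader import EmbedLoader

vocab = Vocabulary()
vocab.update([["i", "am", "a", "cat"]])   # same list-of-token-lists pattern as the test

# Load pretrained vectors for the vocabulary; a pickle is also written to './save',
# which the test removes again afterwards.
embedding, _ = EmbedLoader.load_embedding(
    50, './test/data_for_tests/glove.6B.50d_test.txt', 'glove', vocab, './save')

# Per the assertions above: one 50-dimensional row per vocabulary entry.
assert embedding.shape == (len(vocab), 50)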

+ 15
- 3
test/model/seq_labeling.py View File

@@ -1,9 +1,9 @@
import os
import sys

sys.path.append("..")
import argparse
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.dataset_loader import BaseLoader
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
@@ -82,6 +82,7 @@ def train_and_test():
save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl")

"""
trainer = SeqLabelTrainer(
epochs=trainer_args["epochs"],
batch_size=trainer_args["batch_size"],
@@ -92,12 +93,23 @@ def train_and_test():
model_name=model_name,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
)
"""

# Model
model = SeqLabeling(model_args)

model.fit(train_set, dev_set,
epochs=trainer_args["epochs"],
batch_size=trainer_args["batch_size"],
validate=False,
use_cuda=trainer_args["use_cuda"],
pickle_path=pickle_path,
save_best_dev=trainer_args["save_best_dev"],
model_name=model_name,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9))

# Start training
trainer.train(model, train_set, dev_set)
# trainer.train(model, train_set, dev_set)
print("Training finished!")

# Saver
@@ -105,7 +117,7 @@ def train_and_test():
saver.save_pytorch(model)
print("Model saved!")

del model, trainer
del model

change_field_is_target(dev_set, "truth", True)



+ 25
- 0
test/model/test_char_language_model.py View File

@@ -0,0 +1,25 @@
import unittest

import numpy as np
import torch

from fastNLP.models.char_language_model import CharLM


class TestCharLM(unittest.TestCase):
def test_case_1(self):
char_emb_dim = 50
word_emb_dim = 50
vocab_size = 1000
num_char = 24
max_word_len = 21
num_seq = 64
seq_len = 32

model = CharLM(char_emb_dim, word_emb_dim, vocab_size, num_char)

x = torch.from_numpy(np.random.randint(0, num_char, size=(num_seq, seq_len, max_word_len + 2)))

self.assertEqual(tuple(x.shape), (num_seq, seq_len, max_word_len + 2))
y = model(x)
self.assertEqual(tuple(y.shape), (num_seq * seq_len, vocab_size))

+ 19
- 13
test/model/test_cws.py View File

@@ -1,13 +1,14 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.preprocess import save_pickle, load_pickle
from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader, BaseLoader, RawDataSetLoader
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.saver.model_saver import ModelSaver
@@ -37,9 +38,9 @@ def infer():
print("model loaded!")

# Load infer data
infer_data = SeqLabelDataSet(load_func=BaseLoader.load)
infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
infer_data = RawDataSetLoader().load(data_infer_path)
infer_data.index_field("word_seq", word2index)
infer_data.set_origin_len("word_seq")
# inference
infer = SeqLabelInfer(pickle_path)
results = infer.predict(model, infer_data)
@@ -52,13 +53,18 @@ def train_test():
ConfigLoader().load_config(config_path, {"POS_infer": train_args})

# define dataset
data_train = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data_train.load(cws_data_path)
train_args["vocab_size"] = len(data_train.word_vocab)
train_args["num_classes"] = len(data_train.label_vocab)

save_pickle(data_train.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_train.label_vocab, pickle_path, "label2id.pkl")
data_train = TokenizeDataSetLoader().load(cws_data_path)
word_vocab = Vocabulary()
label_vocab = Vocabulary()
data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
data_train.set_origin_len("word_seq")
data_train.rename_field("label_seq", "truth").set_target(truth=False)
train_args["vocab_size"] = len(word_vocab)
train_args["num_classes"] = len(label_vocab)

save_pickle(word_vocab, pickle_path, "word2id.pkl")
save_pickle(label_vocab, pickle_path, "label2id.pkl")

# Trainer
trainer = SeqLabelTrainer(**train_args.data)
@@ -90,7 +96,7 @@ def train_test():
tester = SeqLabelTester(**test_args.data)

# Start testing
change_field_is_target(data_train, "truth", True)
data_train.set_target(truth=True)
tester.test(model, data_train)
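
Note: the hunk above is the clearest picture of the new preprocessing chain that replaces SeqLabelDataSet. Pulled out of the test, the sequence of calls is as below; every call mirrors the diff, and the corpus path is the fixture already used in test_dataset_loader.py:

from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader

cws_data_path = "./test/data_for_tests/cws_pku_utf_8"

# 1. Load raw examples into a DataSet with "word_seq" / "label_seq" fields.
data_train = TokenizeDataSetLoader().load(cws_data_path)

# 2. Build vocabularies from the fields, then index the text fields with them.
word_vocab, label_vocab = Vocabulary(), Vocabulary()
data_train.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_train.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)

# 3. Record sequence lengths and mark the gold field; set_target(truth=True) is
#    called later to switch the "truth" field on for evaluation, as above.
data_train.set_origin_len("word_seq")
data_train.rename_field("label_seq", "truth").set_target(truth=False)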




+ 14
- 8
test/model/test_seq_label.py View File

@@ -1,6 +1,7 @@
import os

from fastNLP.core.dataset import SeqLabelDataSet, change_field_is_target
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.preprocess import save_pickle
@@ -25,14 +26,19 @@ def test_training():
ConfigLoader().load_config(config_dir, {
"test_seq_label_trainer": trainer_args, "test_seq_label_model": model_args})

data_set = SeqLabelDataSet()
data_set.load(data_path)
data_set = TokenizeDataSetLoader().load(data_path)
word_vocab = Vocabulary()
label_vocab = Vocabulary()
data_set.update_vocab(word_seq=word_vocab, label_seq=label_vocab)
data_set.index_field("word_seq", word_vocab).index_field("label_seq", label_vocab)
data_set.set_origin_len("word_seq")
data_set.rename_field("label_seq", "truth").set_target(truth=False)
data_train, data_dev = data_set.split(0.3, shuffle=True)
model_args["vocab_size"] = len(data_set.word_vocab)
model_args["num_classes"] = len(data_set.label_vocab)
model_args["vocab_size"] = len(word_vocab)
model_args["num_classes"] = len(label_vocab)

save_pickle(data_set.word_vocab, pickle_path, "word2id.pkl")
save_pickle(data_set.label_vocab, pickle_path, "label2id.pkl")
save_pickle(word_vocab, pickle_path, "word2id.pkl")
save_pickle(label_vocab, pickle_path, "label2id.pkl")

trainer = SeqLabelTrainer(
epochs=trainer_args["epochs"],
@@ -76,5 +82,5 @@ def test_training():
)

# Start testing with validation data
change_field_is_target(data_dev, "truth", True)
data_dev.set_target(truth=True)
tester.test(model, data_dev)

+ 28
- 0
test/modules/test_char_embedding.py View File

@@ -0,0 +1,28 @@
import unittest

import torch

from fastNLP.modules.encoder.char_embedding import ConvCharEmbedding, LSTMCharEmbedding


class TestCharEmbed(unittest.TestCase):
def test_case_1(self):
batch_size = 128
char_emb = 100
word_length = 1
x = torch.Tensor(batch_size, char_emb, word_length)
x = x.transpose(1, 2)

cce = ConvCharEmbedding(char_emb)
y = cce(x)
self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb))
print("CNN Char Emb input: ", x.shape)
self.assertEqual(tuple(y.shape), (batch_size, char_emb, 1))
print("CNN Char Emb output: ", y.shape) # [128, 100]

lce = LSTMCharEmbedding(char_emb)
o = lce(x)
self.assertEqual(tuple(x.shape), (batch_size, word_length, char_emb))
print("LSTM Char Emb input: ", x.shape)
self.assertEqual(tuple(o.shape), (batch_size, char_emb, 1))
print("LSTM Char Emb size: ", o.shape)

+ 14
- 17
test/modules/test_variational_rnn.py View File

@@ -1,28 +1,25 @@
import unittest

import numpy as np
import torch
import unittest

from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM
from fastNLP.modules.encoder.variational_rnn import VarLSTM


class TestMaskedRnn(unittest.TestCase):
def test_case_1(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
masked_rnn = VarLSTM(input_size=1, hidden_size=1, bidirectional=True, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
mask = torch.tensor([[[1], [0]]])
y = masked_rnn(x, mask=mask)


def test_case_2(self):
masked_rnn = VarMaskedFastLSTM(input_size=1, hidden_size=1, bidirectional=False, batch_first=True)
x = torch.tensor([[[1.0], [2.0]]])
print(x.size())
y = masked_rnn(x)
mask = torch.tensor([[[1], [1]]])
y = masked_rnn(x, mask=mask)
xx = torch.tensor([[[1.0]]])
#y, hidden = masked_rnn.step(xx)
#step() still has a bug
#y, hidden = masked_rnn.step(xx, mask=mask)
input_size = 12
batch = 16
hidden = 10
masked_rnn = VarLSTM(input_size=input_size, hidden_size=hidden, bidirectional=False, batch_first=True)

xx = torch.randn((batch, 32, input_size))
y, _ = masked_rnn(xx)
self.assertEqual(tuple(y.shape), (batch, 32, hidden))
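
Note: for completeness, a bidirectional variant of the check above. This is a sketch only; the (batch, seq_len, 2 * hidden) output shape assumes VarLSTM follows the usual torch.nn.LSTM convention for bidirectional outputs:

import torch
from fastNLP.modules.encoder.variational_rnn import VarLSTM

batch, seq_len, input_size, hidden = 16, 32, 12, 10
bi_rnn = VarLSTM(input_size=input_size, hidden_size=hidden,
                 bidirectional=True, batch_first=True)
x = torch.randn((batch, seq_len, input_size))
y, _ = bi_rnn(x)
print(y.shape)   # expected torch.Size([16, 32, 20]) under the assumption above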
