
Merge pull request #2 from fastnlp/master

Yige XU (GitHub), 6 years ago
Tag: tags/v0.1.0
1 parent, commit 7ff29877cd
40 changed files with 527 additions and 147 deletions
  1. +124 -57 fastNLP/action/tester.py
  2. +109 -31 fastNLP/action/trainer.py
  3. +1 -2 fastNLP/loader/dataset_loader.py
  4. +39 -27 fastNLP/loader/preprocess.py
  5. +2 -21 fastNLP/models/base_model.py
  6. +97 -0 fastNLP/models/sequence_modeling.py
  7. +2 -2 fastNLP/modules/CRF.py
  8. +43 -5 fastNLP/modules/prototype/example.py
  9. +6 -0 fastNLP/modules/utils.py
  10. +0 -0 fastNLP/reproduction/__init__.py
  11. +0 -0 reproduction/CNN-sentence_classification/.gitignore
  12. +0 -0 reproduction/CNN-sentence_classification/README.md
  13. +0 -0 reproduction/CNN-sentence_classification/__init__.py
  14. +0 -0 reproduction/CNN-sentence_classification/dataset.py
  15. +0 -0 reproduction/CNN-sentence_classification/model.py
  16. +0 -0 reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
  17. +0 -0 reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
  18. +0 -0 reproduction/CNN-sentence_classification/train.py
  19. +0 -0 reproduction/Char-aware_NLM/LICENSE
  20. +0 -0 reproduction/Char-aware_NLM/README.md
  21. +0 -0 reproduction/Char-aware_NLM/__init__.py
  22. +0 -0 reproduction/Char-aware_NLM/model.py
  23. +0 -0 reproduction/Char-aware_NLM/test.py
  24. +0 -0 reproduction/Char-aware_NLM/test.txt
  25. +0 -0 reproduction/Char-aware_NLM/train.py
  26. +0 -0 reproduction/Char-aware_NLM/train.txt
  27. +0 -0 reproduction/Char-aware_NLM/utilities.py
  28. +0 -0 reproduction/Char-aware_NLM/valid.txt
  29. +0 -0 reproduction/HAN-document_classification/README.md
  30. +0 -0 reproduction/HAN-document_classification/__init__.py
  31. +0 -0 reproduction/HAN-document_classification/data/test_samples.pkl
  32. +0 -0 reproduction/HAN-document_classification/data/train_samples.pkl
  33. +0 -0 reproduction/HAN-document_classification/data/yelp.word2vec
  34. +0 -0 reproduction/HAN-document_classification/evaluate.py
  35. +0 -0 reproduction/HAN-document_classification/model.py
  36. +0 -0 reproduction/HAN-document_classification/preprocess.py
  37. +0 -0 reproduction/HAN-document_classification/train.py
  38. +2 -2 requirements.txt
  39. +67 -0 test/data_for_tests/people.txt
  40. +35 -0 test/test_POS_pipeline.py

fastNLP/action/tester.py (+124, -57)

@@ -1,87 +1,154 @@
from collections import namedtuple
import _pickle

import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.modules.utils import seq_mask


class Tester(Action):
class BaseTester(Action):
"""docstring for Tester"""

TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
"save_loss", "batch_size"])

def __init__(self, test_args):
"""
:param test_args: named tuple
"""
super(Tester, self).__init__()
self.validate_in_training = test_args.validate_in_training
self.save_dev_input = test_args.save_dev_input
self.valid_x = None
self.valid_y = None
self.save_output = test_args.save_output
super(BaseTester, self).__init__()
self.validate_in_training = test_args["validate_in_training"]
self.save_dev_data = None
self.save_output = test_args["save_output"]
self.output = None
self.save_loss = test_args.save_loss
self.save_loss = test_args["save_loss"]
self.mean_loss = None
self.batch_size = test_args.batch_size

def test(self, network, data):
print("testing")
network.mode(test=True) # turn on the testing mode
if self.save_dev_input:
if self.valid_x is None:
valid_x, valid_y = network.prepare_input(data)
self.valid_x = valid_x
self.valid_y = valid_y
else:
valid_x = self.valid_x
valid_y = self.valid_y
else:
valid_x, valid_y = network.prepare_input(data)
self.batch_size = test_args["batch_size"]
self.pickle_path = test_args["pickle_path"]
self.iterator = None

# split into batches by self.batch_size
iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)
self.model = None
self.eval_history = []

batch_output = list()
loss_history = list()
# turn on the testing mode of the network
network.mode(test=True)
def test(self, network):
# print("--------------testing----------------")
self.model = network

# turn on the testing mode; clean up the history
self.mode(network, test=True)

for step in range(iterations):
batch_x, batch_y = test_batch_generator.__next__()
dev_data = self.prepare_input(self.pickle_path)

self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))

batch_output = list()
num_iter = len(dev_data) // self.batch_size

# forward pass from test input to predicted output
prediction = network.data_forward(batch_x)
for step in range(num_iter):
batch_x, batch_y = self.batchify(dev_data)

loss = network.get_loss(prediction, batch_y)
prediction = self.data_forward(network, batch_x)
eval_results = self.evaluate(prediction, batch_y)

if self.save_output:
batch_output.append(prediction.data)
batch_output.append(prediction)
if self.save_loss:
loss_history.append(loss)
self.log(self.make_log(step, loss))
self.eval_history.append(eval_results)

if self.save_loss:
self.mean_loss = np.mean(np.array(loss_history))
if self.save_output:
self.output = self.make_output(batch_output)

@property
def loss(self):
return self.mean_loss
def prepare_input(self, data_path):
"""
Save the dev data once it is loaded. Can return directly next time.
:param data_path: str, the path to the pickle data for dev
:return save_dev_data: list. Each entry is a sample, which is also a list of features and label(s).
"""
if self.save_dev_data is None:
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
self.save_dev_data = data_dev
return self.save_dev_data

@property
def result(self):
return self.output
def batchify(self, data):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
:param data: list. Each entry is a sample, which is also a list of features and label(s).
E.g.
[
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]
:return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
"""
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
batch_x = [sample[0] for sample in batch]
batch_y = [sample[1] for sample in batch]
batch_x = self.pad(batch_x)
return batch_x, batch_y

@staticmethod
def make_output(batch_outputs):
# construct full prediction with batch outputs
return np.concatenate(batch_outputs, axis=0)
def pad(batch, fill=0):
"""
Pad a batch of samples to maximum length.
:param batch: list of list
:param fill: word index to pad, default 0.
:return: a padded batch
"""
max_length = max([len(x) for x in batch])
for idx, sample in enumerate(batch):
if len(sample) < max_length:
batch[idx] = sample + [fill * (max_length - len(sample))]
return batch

def load_config(self, args):
def data_forward(self, network, data):
raise NotImplementedError

def load_dataset(self, args):
def evaluate(self, predict, truth):
raise NotImplementedError

@property
def matrices(self):
raise NotImplementedError

def mode(self, model, test=True):
"""To do: combine this function with Trainer ?? """
if test:
model.eval()
else:
model.train()
self.eval_history.clear()


class POSTester(BaseTester):
"""
Tester for sequence labeling.
"""

def __init__(self, test_args):
super(POSTester, self).__init__(test_args)
self.max_len = None
self.mask = None
self.batch_result = None

def data_forward(self, network, x):
"""To Do: combine with Trainer

:param network: the PyTorch model
:param x: list of list, [batch_size, max_len]
:return y: [batch_size, num_classes]
"""
seq_len = [len(seq) for seq in x]
x = torch.Tensor(x).long()
self.batch_size = x.size(0)
self.max_len = x.size(1)
self.mask = seq_mask(seq_len, self.max_len)
y = network(x)
return y

def evaluate(self, predict, truth):
truth = torch.Tensor(truth)
loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len)
return loss.data

def matrices(self):
return np.mean(self.eval_history)
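A minimal sketch of the padding step described in the `pad` docstring above, with dummy data (the helper name is illustrative; note the committed code builds its filler as `[fill * (max_length - len(sample))]`, a one-element list, whereas the usual intent is to repeat `fill` once per missing position, as below):

def pad_batch(batch_x, fill=0):
    # Pad every sample to the length of the longest sample in the batch.
    max_length = max(len(x) for x in batch_x)
    return [x + [fill] * (max_length - len(x)) for x in batch_x]

batch_x = [[4, 9, 2], [7, 5], [3]]   # word indices of three sentences
print(pad_batch(batch_x))
# [[4, 9, 2], [7, 5, 0], [3, 0, 0]]  -> shape [batch_size, max_len]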

fastNLP/action/trainer.py (+109, -31)

@@ -1,12 +1,12 @@
import _pickle
from collections import namedtuple

import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.action.tester import Tester
from fastNLP.action.tester import POSTester
from fastNLP.modules.utils import seq_mask


class BaseTrainer(Action):
@@ -21,23 +21,29 @@ class BaseTrainer(Action):
- grad_backward
- get_loss
"""
TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])

def __init__(self, train_args):
"""
training parameters
:param train_args: dict of (key, value)

The base trainer requires the following keys:
- epochs: int, the number of epochs in training
- validate: bool, whether or not to validate on dev set
- batch_size: int
- pickle_path: str, the path to pickle files for pre-processing
"""
super(BaseTrainer, self).__init__()
self.n_epochs = train_args.epochs
self.validate = train_args.validate
self.batch_size = train_args.batch_size
self.pickle_path = train_args.pickle_path
self.n_epochs = train_args["epochs"]
self.validate = train_args["validate"]
self.batch_size = train_args["batch_size"]
self.pickle_path = train_args["pickle_path"]
self.model = None
self.iterator = None
self.loss_func = None
self.optimizer = None

def train(self, network):
"""General training loop.
"""General Training Steps
:param network: a model

The method is framework independent.
@@ -51,22 +57,27 @@ class BaseTrainer(Action):
- update
Subclasses must implement these methods with a specific framework.
"""
# prepare model and data
self.model = network
data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)

test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=self.batch_size)
evaluator = Tester(test_args)
# define tester over dev data
valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
"save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path}
validator = POSTester(valid_args)

best_loss = 1e10
# main training epochs
iterations = len(data_train) // self.batch_size

for epoch in range(self.n_epochs):
self.mode(test=False)

# turn on network training mode; define optimizer; prepare batch iterator
self.mode(test=False)
self.define_optimizer()
self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True))

# training iterations in one epoch
for step in range(iterations):
batch_x, batch_y = self.batchify(self.batch_size, data_train)
batch_x, batch_y = self.batchify(data_train)

prediction = self.data_forward(network, batch_x)

@@ -77,9 +88,8 @@ class BaseTrainer(Action):
if self.validate:
if data_dev is None:
raise RuntimeError("No validation data provided.")
evaluator.test(network, data_dev)
if evaluator.loss < best_loss:
best_loss = evaluator.loss
validator.test(network)
print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices()))

# finish training

@@ -155,23 +165,20 @@ class BaseTrainer(Action):
"""
raise NotImplementedError

def batchify(self, batch_size, data):
def batchify(self, data):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
:param batch_size: int, the size of a batch
:param data: list. Each entry is a sample, which is also a list of features and label(s).
E.g.
[
[[feature_1, feature_2, feature_3], [label_1. label_2]], # sample 1
[[feature_1, feature_2, feature_3], [label_1. label_2]], # sample 2
[[word_11, word_12, word_13], [label_11. label_12]], # sample 1
[[word_21, word_22, word_23], [label_21. label_22]], # sample 2
...
]
:return batch_x: list. Each entry is a list of features of a sample.
batch_y: list. Each entry is a list of labels of a sample.
:return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len]
batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels]
"""
if self.iterator is None:
self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
batch_x = [sample[0] for sample in batch]
@@ -195,7 +202,9 @@ class BaseTrainer(Action):


class ToyTrainer(BaseTrainer):
"""A simple trainer for a PyTorch model."""
"""
deprecated
"""

def __init__(self, train_args):
super(ToyTrainer, self).__init__(train_args)
@@ -230,7 +239,7 @@ class ToyTrainer(BaseTrainer):

class WordSegTrainer(BaseTrainer):
"""
reserve for changes
deprecated
"""

def __init__(self, train_args):
@@ -301,6 +310,7 @@ class WordSegTrainer(BaseTrainer):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, predict, truth):
truth = torch.Tensor(truth)
self._loss = torch.nn.CrossEntropyLoss(predict, truth)
return self._loss

@@ -313,8 +323,76 @@ class WordSegTrainer(BaseTrainer):
self.optimizer.step()


class POSTrainer(BaseTrainer):
"""
Trainer for Sequence Modeling

"""
def __init__(self, train_args):
super(POSTrainer, self).__init__(train_args)
self.vocab_size = train_args["vocab_size"]
self.num_classes = train_args["num_classes"]
self.max_len = None
self.mask = None

def prepare_input(self, data_path):
"""
To do: Load pkl files of train/dev/test and embedding
"""
data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb"))
return data_train, data_dev, 0, 1

def data_forward(self, network, x):
"""
:param network: the PyTorch model
:param x: list of list, [batch_size, max_len]
:return y: [batch_size, num_classes]
"""
seq_len = [len(seq) for seq in x]
x = torch.Tensor(x).long()
self.batch_size = x.size(0)
self.max_len = x.size(1)
self.mask = seq_mask(seq_len, self.max_len)
y = network(x)
return y

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def define_optimizer(self):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)

def grad_backward(self, loss):
self.model.zero_grad()
loss.backward()

def update(self):
self.optimizer.step()

def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
:param predict: prediction label vector, [batch_size, num_classes]
:param truth: ground truth label vector, [batch_size, max_len]
:return: a scalar
"""
truth = torch.Tensor(truth)
if self.loss_func is None:
if hasattr(self.model, "loss"):
self.loss_func = self.model.loss
else:
self.define_loss()
loss, prediction = self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len)
# print("loss={:.2f}".format(loss.data))
return loss


if __name__ == "__name__":
train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"}
trainer = BaseTrainer(train_args)
data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10]
trainer.batchify(batch_size=3, data=data_train)
trainer.batchify(data=data_train)
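The main interface change in this file is that trainers are now configured with a plain dict instead of the old `TrainConfig` namedtuple, so subclasses such as POSTrainer can require extra keys ("vocab_size", "num_classes") without touching the base class. A usage sketch mirroring test/test_POS_pipeline.py further down (the sizes here are placeholders; in practice they come from POSPreprocess). One small observation: the `if __name__ == "__name__":` guard above compares against the literal string "__name__" and therefore never runs; the usual spelling is "__main__".

from fastNLP.action.trainer import POSTrainer
from fastNLP.models.sequence_modeling import SeqLabeling

train_args = {"epochs": 20, "batch_size": 1, "validate": True,
              "pickle_path": "data_for_tests",
              "vocab_size": 100, "num_classes": 10}

trainer = POSTrainer(train_args)
model = SeqLabeling(100, 1, train_args["num_classes"], train_args["vocab_size"],
                    bi_direction=True)
trainer.train(model)  # per step: batchify -> data_forward -> get_loss -> grad_backward -> update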

fastNLP/loader/dataset_loader.py (+1, -2)

@@ -15,7 +15,6 @@ class POSDatasetLoader(DatasetLoader):

def __init__(self, data_name, data_path):
super(POSDatasetLoader, self).__init__(data_name, data_path)
#self.data_set = self.load()

def load(self):
assert os.path.exists(self.data_path)
@@ -24,7 +23,7 @@ class POSDatasetLoader(DatasetLoader):
return line

def load_lines(self):
assert os.path.exists(self.data_path)
assert (os.path.exists(self.data_path))
with open(self.data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return lines


fastNLP/loader/preprocess.py (+39, -27)

@@ -46,19 +46,17 @@ class BasePreprocess(object):
class POSPreprocess(BasePreprocess):
"""
This class are used to preprocess the pos datasets.
In these datasets, each line is divided by '\t'
The first Col is the vocabulary.
The second Col is the labels.
In these datasets, each line are divided by '\t'
while the first Col is the vocabulary and the second
Col is the label.
Different sentence are divided by an empty line.
e.g:
Tom label1
and label2
Jerry label1
. label3
Hello label4
world label5
! label3
@@ -71,11 +69,13 @@ class POSPreprocess(BasePreprocess):
super(POSPreprocess, self).__init__(data, pickle_path)
self.word_dict = None
self.label_dict = None
self.data = data
self.pickle_path = pickle_path
self.build_dict()
self.word2id()
self.id2word()
self.vocab_size = self.id2word()
self.class2id()
self.id2class()
self.num_classes = self.id2class()
self.embedding()
self.data_train()
self.data_dev()
@@ -87,7 +87,8 @@ class POSPreprocess(BasePreprocess):
DEFAULT_RESERVED_LABEL[2]: 4}
self.label_dict = {}
for w in self.data:
if len(w) == 0:
w = w.strip()
if len(w) <= 1:
continue
word = w.split('\t')
@@ -95,10 +96,11 @@ class POSPreprocess(BasePreprocess):
index = len(self.word_dict)
self.word_dict[word[0]] = index
for label in word[1: ]:
if label not in self.label_dict:
index = len(self.label_dict)
self.label_dict[label] = index
# for label in word[1: ]:
label = word[1]
if label not in self.label_dict:
index = len(self.label_dict)
self.label_dict[label] = index
def pickle_exist(self, pickle_name):
"""
@@ -107,7 +109,7 @@ class POSPreprocess(BasePreprocess):
"""
if not os.path.exists(self.pickle_path):
os.makedirs(self.pickle_path)
file_name = self.pickle_path + pickle_name
file_name = os.path.join(self.pickle_path, pickle_name)
if os.path.exists(file_name):
return True
else:
@@ -118,42 +120,48 @@ class POSPreprocess(BasePreprocess):
return
# nothing will be done if word2id.pkl exists
file_name = self.pickle_path + "word2id.pkl"
with open(file_name, "wb", encoding='utf-8') as f:
file_name = os.path.join(self.pickle_path, "word2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.word_dict, f)
def id2word(self):
if self.pickle_exist("id2word.pkl"):
return
file_name = os.path.join(self.pickle_path, "id2word.pkl")
id2word_dict = _pickle.load(open(file_name, "rb"))
return len(id2word_dict)
# nothing will be done if id2word.pkl exists
id2word_dict = {}
for word in self.word_dict:
id2word_dict[self.word_dict[word]] = word
file_name = self.pickle_path + "id2word.pkl"
with open(file_name, "wb", encoding='utf-8') as f:
file_name = os.path.join(self.pickle_path, "id2word.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2word_dict, f)
return len(id2word_dict)
def class2id(self):
if self.pickle_exist("class2id.pkl"):
return
# nothing will be done if class2id.pkl exists
file_name = self.pickle_path + "class2id.pkl"
with open(file_name, "wb", encoding='utf-8') as f:
file_name = os.path.join(self.pickle_path, "class2id.pkl")
with open(file_name, "wb") as f:
_pickle.dump(self.label_dict, f)
def id2class(self):
if self.pickle_exist("id2class.pkl"):
return
file_name = os.path.join(self.pickle_path, "id2class.pkl")
id2class_dict = _pickle.load(open(file_name, "rb"))
return len(id2class_dict)
# nothing will be done if id2class.pkl exists
id2class_dict = {}
for label in self.label_dict:
id2class_dict[self.label_dict[label]] = label
file_name = self.pickle_path + "id2class.pkl"
with open(file_name, "wb", encoding='utf-8') as f:
file_name = os.path.join(self.pickle_path, "id2class.pkl")
with open(file_name, "wb") as f:
_pickle.dump(id2class_dict, f)
return len(id2class_dict)
def embedding(self):
if self.pickle_exist("embedding.pkl"):
@@ -168,22 +176,26 @@ class POSPreprocess(BasePreprocess):
data_train = []
sentence = []
for w in self.data:
if len(w) == 0:
w = w.strip()
if len(w) <= 1:
wid = []
lid = []
for i in range(len(sentence)):
# if sentence[i][0]=="":
# print("")
wid.append(self.word_dict[sentence[i][0]])
lid.append(self.label_dict[sentence[i][1]])
data_train.append((wid, lid))
sentence = []
continue
sentence.append(w.split('\t'))
file_name = self.pickle_path + "data_train.pkl"
with open(file_name, "wb", encoding='utf-8') as f:
file_name = os.path.join(self.pickle_path, "data_train.pkl")
with open(file_name, "wb") as f:
_pickle.dump(data_train, f)
def data_dev(self):
pass
def data_test(self):
pass
pass
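POSPreprocess consumes the format added in test/data_for_tests/people.txt below: one token and its tag per line, tab-separated, with a blank line between sentences. A standalone sketch of reading that format (illustrative only; the class above additionally builds the word/label dictionaries and pickles them):

def read_tab_separated(lines):
    """Yield (words, labels) pairs from tab-separated token/tag lines,
    where sentences are separated by blank lines."""
    words, labels = [], []
    for line in lines:
        line = line.strip()
        if not line:                      # sentence boundary
            if words:
                yield words, labels
            words, labels = [], []
            continue
        token, tag = line.split("\t")[:2]
        words.append(token)
        labels.append(tag)
    if words:
        yield words, labels

# e.g. list(read_tab_separated(open("people.txt", encoding="utf-8")))
# -> [(["迈", "向", ...], ["B-v", "E-v", ...]), ...]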

fastNLP/models/base_model.py (+2, -21)

@@ -3,32 +3,12 @@ import torch

class BaseModel(torch.nn.Module):
"""Base PyTorch model for all models.
Three network modules presented:
- embedding module
- aggregation module
- output module
Subclasses must implement these three modules with "components".
To do: add some useful common features
"""

def __init__(self):
super(BaseModel, self).__init__()

def forward(self, *inputs):
x = self.encode(*inputs)
x = self.aggregation(x)
x = self.output(x)
return x

def encode(self, x):
raise NotImplementedError

def aggregation(self, x):
raise NotImplementedError

def output(self, x):
raise NotImplementedError



class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
@@ -93,3 +73,4 @@ class Token(object):
self.doc = doc
self.token = doc[offset]
self.i = offset


fastNLP/models/sequence_modeling.py (+97, -0)

@@ -0,0 +1,97 @@
import torch
import torch.nn as nn
from torch.nn import functional as F

from fastNLP.models.base_model import BaseModel
from fastNLP.modules.CRF import ContionalRandomField


class SeqLabeling(BaseModel):
"""
PyTorch Network for sequence labeling
"""

def __init__(self, hidden_dim,
rnn_num_layer,
num_classes,
vocab_size,
word_emb_dim=100,
init_emb=None,
rnn_mode="gru",
bi_direction=False,
dropout=0.5,
use_crf=True):
super(SeqLabeling, self).__init__()

self.Emb = nn.Embedding(vocab_size, word_emb_dim)
if init_emb:
self.Emb.weight = nn.Parameter(init_emb)

self.num_classes = num_classes
self.input_dim = word_emb_dim
self.layers = rnn_num_layer
self.hidden_dim = hidden_dim
self.bi_direction = bi_direction
self.dropout = dropout
self.mode = rnn_mode

if self.mode == "lstm":
self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
bidirectional=self.bi_direction, dropout=self.dropout)
elif self.mode == "gru":
self.rnn = nn.GRU(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
bidirectional=self.bi_direction, dropout=self.dropout)
elif self.mode == "rnn":
self.rnn = nn.RNN(self.input_dim, self.hidden_dim, self.layers, batch_first=True,
bidirectional=self.bi_direction, dropout=self.dropout)
else:
raise Exception
if bi_direction:
self.linear = nn.Linear(self.hidden_dim * 2, self.num_classes)
else:
self.linear = nn.Linear(self.hidden_dim, self.num_classes)
self.use_crf = use_crf
if self.use_crf:
self.crf = ContionalRandomField(num_classes)

def forward(self, x):
"""
:param x: LongTensor, [batch_size, mex_len]
:return y: [batch_size, tag_size, tag_size]
"""
x = self.Emb(x)
# [batch_size, max_len, word_emb_dim]
x, hidden = self.rnn(x)
# [batch_size, max_len, hidden_size * direction]
y = self.linear(x)
# [batch_size, max_len, num_classes]
return y

def loss(self, x, y, mask, batch_size, max_len):
"""
Negative log likelihood loss.
:param x: FloatTensor, [batch_size, tag_size, tag_size]
:param y: LongTensor, [batch_size, max_len]
:param mask: ByteTensor, [batch_size, max_len]
:param batch_size: int
:param max_len: int
:return loss:
prediction:
"""
x = x.float()
y = y.long()
mask = mask.byte()
# print(x.shape, y.shape, mask.shape)

if self.use_crf:
total_loss = self.crf(x, y, mask)
tag_seq = self.crf.viterbi_decode(x, mask)
else:
# error
loss_function = nn.NLLLoss(ignore_index=0, size_average=False)
x = x.view(batch_size * max_len, -1)
score = F.log_softmax(x)
total_loss = loss_function(score, y.view(batch_size * max_len))
_, tag_seq = torch.max(score)
tag_seq = tag_seq.view(batch_size, max_len)
return torch.mean(total_loss), tag_seq
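A short usage sketch of the new model with dummy tensors; shapes follow the inline comments above, and the CRF branch is the default (use_crf=True):

import torch
from fastNLP.models.sequence_modeling import SeqLabeling
from fastNLP.modules.utils import seq_mask

model = SeqLabeling(hidden_dim=100, rnn_num_layer=1, num_classes=10,
                    vocab_size=100, bi_direction=True)

x = torch.LongTensor([[4, 9, 2], [7, 5, 0]])   # [batch_size, max_len], 0 used as padding index
y = torch.Tensor([[1, 2, 1], [3, 1, 0]])       # gold tags, [batch_size, max_len]
mask = seq_mask([3, 2], 3)                     # [batch_size, max_len]

scores = model(x)                              # [batch_size, max_len, num_classes]
loss, tag_seq = model.loss(scores, y, mask, batch_size=2, max_len=3)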

fastNLP/modules/CRF.py (+2, -2)

@@ -82,7 +82,7 @@ class ContionalRandomField(nn.Module):
def _glod_score(self, feats, tags, masks):
"""
Compute the score for the gold path.
:param feats: FloatTensor, batch_size x tag_size x tag_size
:param feats: FloatTensor, batch_size x max_len x tag_size
:param tags: LongTensor, batch_size x max_len
:param masks: ByteTensor, batch_size x max_len
:return:FloatTensor, batch_size
@@ -118,7 +118,7 @@ class ContionalRandomField(nn.Module):
def forward(self, feats, tags, masks):
"""
Calculate the neg log likelihood
:param feats:FloatTensor, batch_size x tag_size x tag_size
:param feats:FloatTensor, batch_size x max_len x tag_size
:param tags:LongTensor, batch_size x max_len
:param masks:ByteTensor batch_size x max_len
:return:FloatTensor, batch_size
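The two docstring fixes above bring the documented `feats` shape in line with how SeqLabeling actually calls the CRF. For reference, a hedged usage sketch with dummy tensors (class name as committed):

import torch
from fastNLP.modules.CRF import ContionalRandomField

num_classes = 5
crf = ContionalRandomField(num_classes)

feats = torch.randn(2, 3, num_classes)              # [batch_size, max_len, tag_size]
tags = torch.LongTensor([[1, 2, 1], [3, 1, 0]])      # [batch_size, max_len]
masks = torch.ByteTensor([[1, 1, 1], [1, 1, 0]])     # [batch_size, max_len]

neg_log_likelihood = crf(feats, tags, masks)         # [batch_size]
best_paths = crf.viterbi_decode(feats, masks)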


fastNLP/modules/prototype/example.py (+43, -5)

@@ -1,12 +1,13 @@
import torch
import torch.nn as nn
import encoder
import time

import aggregation
import dataloader
import embedding
import encoder
import predict
import torch
import torch.nn as nn
import torch.optim as optim
import time
import dataloader

WORD_NUM = 357361
WORD_SIZE = 100
@@ -16,6 +17,30 @@ R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5

from fastNLP.models.base_model import BaseModel
from fastNLP.action.trainer import BaseTrainer


class MyNet(BaseModel):
def __init__(self):
super(MyNet, self).__init__()
self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
self.penalty = None

def encode(self, x):
return self.encode(self.embedding(x))

def aggregate(self, x):
x, self.penalty = self.aggregate(x)
return x

def decode(self, x):
return [self.predict(x), self.penalty]


class Net(nn.Module):
"""
A model for sentiment analysis using lstm and self-attention
@@ -34,6 +59,19 @@ class Net(nn.Module):
x = self.predict(x)
return x, penalty


class MyTrainer(BaseTrainer):
def __init__(self, args):
super(MyTrainer, self).__init__(args)
self.optimizer = None

def define_optimizer(self):
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)

def define_loss(self):
self.loss_func = nn.CrossEntropyLoss()


def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
"""


fastNLP/modules/utils.py (+6, -0)

@@ -7,3 +7,9 @@ def mask_softmax(matrix, mask):
else:
raise NotImplementedError
return result


def seq_mask(seq_len, max_len):
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
mask = torch.stack(mask, 1)
return mask
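A worked example of the new helper, to pin down its semantics: position i of sequence b is 1 exactly when seq_len[b] >= i + 1 (output is a ByteTensor under the pinned torch 0.4):

import torch
from fastNLP.modules.utils import seq_mask

print(seq_mask([3, 2, 1], 3))
# tensor([[1, 1, 1],
#         [1, 1, 0],
#         [1, 0, 0]], dtype=torch.uint8)   # [batch_size, max_len]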

fastNLP/reproduction/__init__.py (+0, -0)


Renamed (moved) without content changes:

fastNLP/reproduction/CNN-sentence_classification/.gitignore → reproduction/CNN-sentence_classification/.gitignore
fastNLP/reproduction/CNN-sentence_classification/README.md → reproduction/CNN-sentence_classification/README.md
fastNLP/reproduction/CNN-sentence_classification/__init__.py → reproduction/CNN-sentence_classification/__init__.py
fastNLP/reproduction/CNN-sentence_classification/dataset.py → reproduction/CNN-sentence_classification/dataset.py
fastNLP/reproduction/CNN-sentence_classification/model.py → reproduction/CNN-sentence_classification/model.py
fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg → reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos → reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
fastNLP/reproduction/CNN-sentence_classification/train.py → reproduction/CNN-sentence_classification/train.py
fastNLP/reproduction/Char-aware_NLM/LICENSE → reproduction/Char-aware_NLM/LICENSE
fastNLP/reproduction/Char-aware_NLM/README.md → reproduction/Char-aware_NLM/README.md
fastNLP/reproduction/Char-aware_NLM/__init__.py → reproduction/Char-aware_NLM/__init__.py
fastNLP/reproduction/Char-aware_NLM/model.py → reproduction/Char-aware_NLM/model.py
fastNLP/reproduction/Char-aware_NLM/test.py → reproduction/Char-aware_NLM/test.py
fastNLP/reproduction/Char-aware_NLM/test.txt → reproduction/Char-aware_NLM/test.txt
fastNLP/reproduction/Char-aware_NLM/train.py → reproduction/Char-aware_NLM/train.py
fastNLP/reproduction/Char-aware_NLM/train.txt → reproduction/Char-aware_NLM/train.txt
fastNLP/reproduction/Char-aware_NLM/utilities.py → reproduction/Char-aware_NLM/utilities.py
fastNLP/reproduction/Char-aware_NLM/valid.txt → reproduction/Char-aware_NLM/valid.txt
fastNLP/reproduction/HAN-document_classification/README.md → reproduction/HAN-document_classification/README.md
fastNLP/reproduction/HAN-document_classification/__init__.py → reproduction/HAN-document_classification/__init__.py
fastNLP/reproduction/HAN-document_classification/data/test_samples.pkl → reproduction/HAN-document_classification/data/test_samples.pkl
fastNLP/reproduction/HAN-document_classification/data/train_samples.pkl → reproduction/HAN-document_classification/data/train_samples.pkl
fastNLP/reproduction/HAN-document_classification/data/yelp.word2vec → reproduction/HAN-document_classification/data/yelp.word2vec
fastNLP/reproduction/HAN-document_classification/evaluate.py → reproduction/HAN-document_classification/evaluate.py
fastNLP/reproduction/HAN-document_classification/model.py → reproduction/HAN-document_classification/model.py
fastNLP/reproduction/HAN-document_classification/preprocess.py → reproduction/HAN-document_classification/preprocess.py
fastNLP/reproduction/HAN-document_classification/train.py → reproduction/HAN-document_classification/train.py


requirements.txt (+2, -2)

@@ -1,3 +1,3 @@
numpy==1.14.2
numpy>=1.14.2
torch==0.4.0
torchvision==0.1.8
torchvision>=0.1.8

test/data_for_tests/people.txt (+67, -0)

@@ -0,0 +1,67 @@
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w

中 B-nt
共 M-nt
中 M-nt
央 E-nt
总 B-n
书 M-n
记 E-n
、 S-w
国 B-n
家 E-n
主 B-n
席 E-n
江 B-nr
泽 M-nr
民 E-nr

( S-w
一 B-t
九 M-t
九 M-t
七 M-t
年 E-t
十 B-t
二 M-t
月 E-t
三 B-t
十 M-t
一 M-t
日 E-t
) S-w

1 B-t
2 M-t
月 E-t
3 B-t
1 M-t
日 E-t
, S-w
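The tags in this file combine a BMES segmentation position (Begin / Middle / End / Single) with what looks like a part-of-speech label after the hyphen. A small sketch (not part of the PR) that reassembles the characters into tagged words:

def bmes_to_words(chars, tags):
    """Group characters back into words using the B/M/E/S prefixes; keep the POS suffix."""
    words, buf, pos = [], "", None
    for ch, tag in zip(chars, tags):
        prefix, _, label = tag.partition("-")
        if prefix in ("B", "S"):
            if buf:
                words.append((buf, pos))
            buf, pos = ch, label
        else:                      # "M" or "E": continue the current word
            buf += ch
    if buf:
        words.append((buf, pos))
    return words

chars = ["迈", "向", "充", "满", "希", "望", "的", "新"]
tags  = ["B-v", "E-v", "B-v", "E-v", "B-n", "E-n", "S-u", "S-a"]
print(bmes_to_words(chars, tags))
# [('迈向', 'v'), ('充满', 'v'), ('希望', 'n'), ('的', 'u'), ('新', 'a')]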

test/test_POS_pipeline.py (+35, -0)

@@ -0,0 +1,35 @@
import sys

sys.path.append("..")

from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "people.txt"
data_path = "data_for_tests/people.txt"
pickle_path = "data_for_tests"

if __name__ == "__main__":
# Data Loader
pos = POSDatasetLoader(data_name, data_path)
train_data = pos.load_lines()

# Preprocessor
p = POSPreprocess(train_data, pickle_path)
vocab_size = p.vocab_size
num_classes = p.num_classes

# Trainer
train_args = {"epochs": 20, "batch_size": 1, "num_classes": num_classes,
"vocab_size": vocab_size, "pickle_path": pickle_path, "validate": True}
trainer = POSTrainer(train_args)

# Model
model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)

# Start training
trainer.train(model)

print("Training finished!")
