
Merge pull request #1 from fastnlp/master

tags/v0.1.0
Yige XU GitHub 6 years ago
parent
commit
b998593bc5
27 changed files with 1019 additions and 353 deletions
  1. +58 -33    fastNLP/action/action.py
  2. +1 -1      fastNLP/action/tester.py
  3. +279 -52   fastNLP/action/trainer.py
  4. +0 -35     fastNLP/loader/base_preprocess.py
  5. +5 -6      fastNLP/loader/config_loader.py
  6. +5 -4      fastNLP/loader/dataset_loader.py
  7. +49 -24    fastNLP/loader/preprocess.py
  8. +0 -95     fastNLP/models/base_model.py
  9. +6 -1      fastNLP/models/char_language_model.py
  10. +1 -90    fastNLP/models/word_seg_model.py
  11. +24 -0    fastNLP/modules/convolution/avg_pool.py
  12. +28 -0    fastNLP/modules/convolution/conv.py
  13. +20 -0    fastNLP/modules/convolution/kmax_pool.py
  14. +26 -0    fastNLP/modules/convolution/max_pool.py
  15. +41 -0    fastNLP/modules/prototype/README.md
  16. +63 -0    fastNLP/modules/prototype/Word2Idx.py
  17. +40 -0    fastNLP/modules/prototype/aggregation.py
  18. +81 -0    fastNLP/modules/prototype/dataloader.py
  19. +23 -0    fastNLP/modules/prototype/embedding.py
  20. +22 -0    fastNLP/modules/prototype/encoder.py
  21. +129 -0   fastNLP/modules/prototype/example.py
  22. +25 -0    fastNLP/modules/prototype/predict.py
  23. +50 -0    fastNLP/modules/prototype/prepare.py
  24. +14 -0    fastNLP/saver/base_saver.py
  25. +12 -0    fastNLP/saver/logger.py
  26. +8 -0     fastNLP/saver/model_saver.py
  27. +9 -12    test/test_word_seg.py

+ 58
- 33
fastNLP/action/action.py View File

@@ -1,4 +1,4 @@
from saver.logger import Logger
import numpy as np


class Action(object):
@@ -8,39 +8,64 @@ class Action(object):

def __init__(self):
super(Action, self).__init__()
self.logger = Logger("logger_output.txt")

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
class BaseSampler(object):
"""
Base class for all samplers.
"""

def __init__(self, data_set):
self.data_set_length = len(data_set)

def __len__(self):
return self.data_set_length

def __iter__(self):
raise NotImplementedError

def log(self, string):
self.logger.log(string)

def batchify(self, batch_size, X, Y=None):
"""
:param batch_size: int
:param X: feature matrix of size [n_sample, m_feature]
:param Y: label vector of size [n_sample, 1] (optional)
:return iteration:int, the number of step in each epoch
generator:generator, to generate batch inputs
"""
n_samples = X.shape[0]
num_iter = n_samples // batch_size
if Y is None:
generator = self._batch_generate(batch_size, num_iter, X)
else:
generator = self._batch_generate(batch_size, num_iter, X, Y)
return num_iter, generator

@staticmethod
def _batch_generate(batch_size, num_iter, *data):
for step in range(num_iter):
start = batch_size * step
end = batch_size * (step + 1)
yield tuple([x[start:end] for x in data])

def make_log(self, *args):
return "log"

class SequentialSampler(BaseSampler):
"""
Sample data in the original order.
"""

def __init__(self, data_set):
super(SequentialSampler, self).__init__(data_set)

def __iter__(self):
return iter(range(self.data_set_length))


class RandomSampler(BaseSampler):
"""
Sample data in random permutation order.
"""

def __init__(self, data_set):
super(RandomSampler, self).__init__(data_set)

def __iter__(self):
return iter(np.random.permutation(self.data_set_length))


class Batchifier(object):
"""
Wrap random or sequential sampler to generate a mini-batch.
"""

def __init__(self, sampler, batch_size, drop_last=True):
super(Batchifier, self).__init__()
self.sampler = sampler
self.batch_size = batch_size
self.drop_last = drop_last

def __iter__(self):
batch = []
for idx in self.sampler:
batch.append(idx)
if len(batch) == self.batch_size:
yield batch
batch = []
if len(batch) > 0 and not self.drop_last:
yield batch
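
A minimal usage sketch, assuming the sampler and batchifier classes defined above; the toy data set is hypothetical:

# Sketch only: iterate random index batches over a toy data set of 10 items.
data_set = list(range(10))                                   # only len() is used by the sampler
batches = Batchifier(RandomSampler(data_set), batch_size=4, drop_last=True)
for indices in batches:
    print(indices)                                           # e.g. [7, 2, 9, 0]; the last 2 indices are dropped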

+ 1
- 1
fastNLP/action/tester.py View File

@@ -2,7 +2,7 @@ from collections import namedtuple

import numpy as np

from fastNLP.action import Action
from fastNLP.action.action import Action


class Tester(Action):


+ 279
- 52
fastNLP/action/trainer.py View File

@@ -1,93 +1,320 @@
import _pickle
from collections import namedtuple

from .action import Action
from .tester import Tester
import numpy as np
import torch

from fastNLP.action.action import Action
from fastNLP.action.action import RandomSampler, Batchifier
from fastNLP.action.tester import Tester

class Trainer(Action):
"""
Trainer is a common training pipeline shared among all models.

class BaseTrainer(Action):
"""Base trainer for all trainers.
Trainer receives a model and data, and then performs training.

Subclasses must implement the following abstract methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- grad_backward
- get_loss
"""
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
"log_per_step", "log_validation", "batch_size"])
TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"])

def __init__(self, train_args):
"""
:param train_args: namedtuple
training parameters
"""
super(Trainer, self).__init__()
super(BaseTrainer, self).__init__()
self.n_epochs = train_args.epochs
self.validate = train_args.validate
self.save_when_better = train_args.save_when_better
self.log_per_step = train_args.log_per_step
self.log_validation = train_args.log_validation
self.batch_size = train_args.batch_size
self.pickle_path = train_args.pickle_path
self.model = None
self.iterator = None
self.loss_func = None

def train(self, network, train_data, dev_data=None):
"""
:param network: the models controller
:param train_data: raw data for training
:param dev_data: raw data for validation
This method will call all the base methods of network (implemented in models.base_model).
"""
train_x, train_y = network.prepare_input(train_data)
def train(self, network):
"""General training loop.
:param network: a model

iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
The method is framework independent.
Work by calling the following methods:
- prepare_input
- mode
- define_optimizer
- data_forward
- get_loss
- grad_backward
- update
Subclasses must implement these methods with a specific framework.
"""
self.model = network
data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path)

test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=self.batch_size)
evaluator = Tester(test_args)

best_loss = 1e10
loss_history = list()
iterations = len(data_train) // self.batch_size

for epoch in range(self.n_epochs):
network.mode(test=False) # turn on the train mode
self.mode(test=False)

network.define_optimizer()
self.define_optimizer()
for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__()

prediction = network.data_forward(batch_x)
batch_x, batch_y = self.batchify(self.batch_size, data_train)

loss = network.get_loss(prediction, batch_y)
network.grad_backward()
prediction = self.data_forward(network, batch_x)

if step % self.log_per_step == 0:
print("step ", step)
loss_history.append(loss)
self.log(self.make_log(epoch, step, loss))
loss = self.get_loss(prediction, batch_y)
self.grad_backward(loss)
self.update()

#################### evaluate over dev set ###################
if self.validate:
if dev_data is None:
if data_dev is None:
raise RuntimeError("No validation data provided.")
# give all controls to tester
evaluator.test(network, dev_data)

if self.log_validation:
self.log(self.make_valid_log(epoch, evaluator.loss))
evaluator.test(network, data_dev)
if evaluator.loss < best_loss:
best_loss = evaluator.loss
if self.save_when_better:
self.save_model(network)

# finish training

def make_log(self, *args):
return "make a log"
def prepare_input(self, data_path):
"""
To do: Load pkl files of train/dev/test and embedding
"""
data_train = _pickle.load(open(data_path + "data_train.pkl", "rb"))
data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb"))
data_test = _pickle.load(open(data_path + "data_test.pkl", "rb"))
embedding = _pickle.load(open(data_path + "embedding.pkl", "rb"))
return data_train, data_dev, data_test, embedding

def make_valid_log(self, *args):
return "make a valid log"
def mode(self, test=False):
"""
Tell the network to be trained or not.
:param test: bool
"""
raise NotImplementedError

def save_model(self, model):
model.save()
def define_optimizer(self):
"""
Define framework-specific optimizer specified by the models.
"""
raise NotImplementedError

def load_data(self, data_name):
print("load data")
def update(self):
"""
Perform weight update on a model.

def load_config(self, args):
For PyTorch, just call optimizer to update.
"""
raise NotImplementedError

def load_dataset(self, args):
def data_forward(self, network, x):
"""
Forward pass of the data.
:param network: a model
:param x: input feature matrix and label vector
:return: output by the models

For PyTorch, just do "network(*x)"
"""
raise NotImplementedError

def grad_backward(self, loss):
"""
Compute gradients with the chain rule.
:param loss: a scalar where back-prop starts

For PyTorch, just do "loss.backward()"
"""
raise NotImplementedError

def get_loss(self, predict, truth):
"""
Compute loss given prediction and ground truth.
:param predict: prediction label vector
:param truth: ground truth label vector
:return: a scalar
"""
if self.loss_func is None:
if hasattr(self.model, "loss"):
self.loss_func = self.model.loss
else:
self.define_loss()
return self.loss_func(predict, truth)

def define_loss(self):
"""
Assign an instance of loss function to self.loss_func
E.g. self.loss_func = nn.CrossEntropyLoss()
"""
raise NotImplementedError

def batchify(self, batch_size, data):
"""
1. Perform batching from data and produce a batch of training data.
2. Add padding.
:param batch_size: int, the size of a batch
:param data: list. Each entry is a sample, which is also a list of features and label(s).
E.g.
[
[[feature_1, feature_2, feature_3], [label_1, label_2]], # sample 1
[[feature_1, feature_2, feature_3], [label_1, label_2]], # sample 2
...
]
:return batch_x: list. Each entry is a list of features of a sample.
batch_y: list. Each entry is a list of labels of a sample.
"""
if self.iterator is None:
self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True))
indices = next(self.iterator)
batch = [data[idx] for idx in indices]
batch_x = [sample[0] for sample in batch]
batch_y = [sample[1] for sample in batch]
batch_x = self.pad(batch_x)
return batch_x, batch_y

@staticmethod
def pad(batch, fill=0):
"""
Pad a batch of samples to maximum length.
:param batch: list of list
:param fill: word index to pad, default 0.
:return: a padded batch
"""
max_length = max([len(x) for x in batch])
for idx, sample in enumerate(batch):
if len(sample) < max_length:
batch[idx] = sample + [fill] * (max_length - len(sample))
return batch


class ToyTrainer(BaseTrainer):
"""A simple trainer for a PyTorch model."""

def __init__(self, train_args):
super(ToyTrainer, self).__init__(train_args)
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0
self._optimizer = None

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.model.mode(test)

def data_forward(self, network, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self, loss):
loss.backward()

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss

def define_optimizer(self):
self._optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)

def update(self):
self._optimizer.step()


class WordSegTrainer(BaseTrainer):
"""
Reserved for future changes.
"""

def __init__(self, train_args):
super(WordSegTrainer, self).__init__(train_args)
self.id2word = None
self.word2id = None
self.id2tag = None
self.tag2id = None

self.lstm_batch_size = 8
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len
self.hidden_dim = 100
self.lstm_num_layers = 2
self.vocab_size = 100
self.word_emb_dim = 100

self.hidden = (self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

self.optimizer = None
self._loss = None

self.USE_GPU = False

def to_var(self, x):
if torch.cuda.is_available() and self.USE_GPU:
x = x.cuda()
return torch.autograd.Variable(x)

def prepare_input(self, data):
"""
perform word indices lookup to convert strings into indices
:param data: list of string, each string contains word + space + [B, M, E, S]
:return
"""
word_list = []
tag_list = []
for line in data:
if len(line) > 2:
tokens = line.split("#")
word_list.append(tokens[0])
tag_list.append(tokens[2][0])
self.id2word = list(set(word_list))
self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
self.id2tag = list(set(tag_list))
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
return words, tags

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, network, x):
"""
:param network: a PyTorch model
:param x: sequence of length [batch_size], word indices
:return:
"""
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
output, self.hidden = network(x, self.hidden)
return output

def define_optimizer(self):
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, predict, truth):
self._loss = torch.nn.CrossEntropyLoss()(predict, truth)
return self._loss

def grad_backward(self, network):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)

def update(self):
self.optimizer.step()


if __name__ == "__main__":
train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./")
trainer = BaseTrainer(train_args)
data_train = [[[1, 2, 3, 4], [0]]] * 10 + [[[1, 3, 5, 2], [1]]] * 10
trainer.batchify(batch_size=3, data=data_train)
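
A quick check of the padding helper above; a sketch, assuming the corrected pad that extends each sample with fill values:

# Sketch only: pad a batch of variable-length index lists to the batch maximum.
batch_x = [[4, 7, 1], [2, 5], [9]]
print(BaseTrainer.pad(batch_x, fill=0))
# -> [[4, 7, 1], [2, 5, 0], [9, 0, 0]]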

+ 0
- 35
fastNLP/loader/base_preprocess.py View File

@@ -1,35 +0,0 @@
class BasePreprocess(object):
def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'
def word2id(self):
raise NotImplementedError
def id2word(self):
raise NotImplementedError
def class2id(self):
raise NotImplementedError
def id2class(self):
raise NotImplementedError
def embedding(self):
raise NotImplementedError
def data_train(self):
raise NotImplementedError
def data_dev(self):
raise NotImplementedError
def data_test(self):
raise NotImplementedError

+ 5
- 6
fastNLP/loader/config_loader.py View File

@@ -1,9 +1,8 @@
from fastNLP.loader.base_loader import BaseLoader

import configparser
import traceback
import json

from fastNLP.loader.base_loader import BaseLoader


class ConfigLoader(BaseLoader):
"""loader for configuration files"""
@@ -17,14 +16,14 @@ class ConfigLoader(BaseLoader):
raise NotImplementedError

@staticmethod
def loadConfig(filePath, sections):
def load_config(file_path, sections):
"""
:param filePath: the path of config file
:param file_path: the path of config file
:param sections: the dict of sections
:return:
"""
cfg = configparser.ConfigParser()
cfg.read(filePath)
cfg.read(file_path)
for s in sections:
attr_list = [i for i in type(sections[s]).__dict__.keys() if
not callable(getattr(sections[s], i)) and not i.startswith("__")]


+ 5
- 4
fastNLP/loader/dataset_loader.py View File

@@ -1,6 +1,7 @@
from fastNLP.loader.base_loader import BaseLoader
import os

from fastNLP.loader.base_loader import BaseLoader


class DatasetLoader(BaseLoader):
""""loader for data sets"""
@@ -16,7 +17,6 @@ class POSDatasetLoader(DatasetLoader):
super(POSDatasetLoader, self).__init__(data_name, data_path)
#self.data_set = self.load()


def load(self):
assert os.path.exists(self.data_path)
with open(self.data_path, "r", encoding="utf-8") as f:
@@ -30,11 +30,11 @@ class POSDatasetLoader(DatasetLoader):
return lines


class ClassficationDatasetLoader(DatasetLoader):
class ClassificationDatasetLoader(DatasetLoader):
"""loader for classfication data sets"""

def __init__(self, data_name, data_path):
super(ClassficationDatasetLoader, data_name)
super(ClassificationDatasetLoader, self).__init__(data_name, data_path)

def load(self):
assert os.path.exists(self.data_path)
@@ -58,6 +58,7 @@ class ClassficationDatasetLoader(DatasetLoader):
dataset.append(sentence)
return dataset


class ConllLoader(DatasetLoader):
"""loader for conll format files"""



+ 49
- 24
fastNLP/loader/preprocess.py View File

@@ -1,25 +1,57 @@
import pickle
import _pickle
import os
from fastNLP.loader.base_preprocess import BasePreprocess
DEFAULT_PADDING_LABEL = '<pad>' #dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' #dict index = 1
DEFAULT_PADDING_LABEL = '<pad>' # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>' # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
'<reserved-3>',
'<reserved-4>'] #dict index = 2~4
#the first vocab in dict with the index = 5
'<reserved-4>'] # dict index = 2~4
# the first vocab in dict with the index = 5
class BasePreprocess(object):
def __init__(self, data, pickle_path):
super(BasePreprocess, self).__init__()
self.data = data
self.pickle_path = pickle_path
if not self.pickle_path.endswith('/'):
self.pickle_path = self.pickle_path + '/'
def word2id(self):
raise NotImplementedError
def id2word(self):
raise NotImplementedError
def class2id(self):
raise NotImplementedError
def id2class(self):
raise NotImplementedError
def embedding(self):
raise NotImplementedError
def data_train(self):
raise NotImplementedError
def data_dev(self):
raise NotImplementedError
def data_test(self):
raise NotImplementedError
class POSPreprocess(BasePreprocess):
"""
This class is used to preprocess POS datasets.
In these datasets, each line are divided by '\t'
while the first Col is the vocabulary and the second
Col is the label.
In these datasets, each line is divided by '\t'
The first Col is the vocabulary.
The second Col is the labels.
Different sentences are divided by an empty line.
e.g:
Tom label1
@@ -36,7 +68,9 @@ class POSPreprocess(BasePreprocess):
"""
def __init__(self, data, pickle_path):
super(POSPreprocess, self).__init(data, pickle_path)
super(POSPreprocess, self).__init__(data, pickle_path)
self.word_dict = None
self.label_dict = None
self.build_dict()
self.word2id()
self.id2word()
@@ -46,8 +80,6 @@ class POSPreprocess(BasePreprocess):
self.data_train()
self.data_dev()
self.data_test()
#...
def build_dict(self):
self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
@@ -68,7 +100,6 @@ class POSPreprocess(BasePreprocess):
index = len(self.label_dict)
self.label_dict[label] = index
def pickle_exist(self, pickle_name):
"""
:param pickle_name: the filename of target pickle file
@@ -82,7 +113,6 @@ class POSPreprocess(BasePreprocess):
else:
return False
def word2id(self):
if self.pickle_exist("word2id.pkl"):
return
@@ -92,11 +122,10 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(self.word_dict, f)
def id2word(self):
if self.pickle_exist("id2word.pkl"):
return
#nothing will be done if id2word.pkl exists
# nothing will be done if id2word.pkl exists
id2word_dict = {}
for word in self.word_dict:
@@ -105,7 +134,6 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(id2word_dict, f)
def class2id(self):
if self.pickle_exist("class2id.pkl"):
return
@@ -115,11 +143,10 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(self.label_dict, f)
def id2class(self):
if self.pickle_exist("id2class.pkl"):
return
#nothing will be done if id2class.pkl exists
# nothing will be done if id2class.pkl exists
id2class_dict = {}
for label in self.label_dict:
@@ -128,17 +155,15 @@ class POSPreprocess(BasePreprocess):
with open(file_name, "wb", encoding='utf-8') as f:
_pickle.dump(id2class_dict, f)
def embedding(self):
if self.pickle_exist("embedding.pkl"):
return
#nothing will be done if embedding.pkl exists
# nothing will be done if embedding.pkl exists
def data_train(self):
if self.pickle_exist("data_train.pkl"):
return
#nothing will be done if data_train.pkl exists
# nothing will be done if data_train.pkl exists
data_train = []
sentence = []


+ 0
- 95
fastNLP/models/base_model.py View File

@@ -1,4 +1,3 @@
import numpy as np
import torch


@@ -30,100 +29,6 @@ class BaseModel(torch.nn.Module):
raise NotImplementedError


class BaseController(object):
"""Base Controller for all controllers.
This class and its subclasses are actually "controllers" of the PyTorch models.
They act as an interface between Trainer and the PyTorch models.
This controller provides the following methods to be called by Trainer.
- prepare_input
- mode
- define_optimizer
- data_forward
- grad_backward
- get_loss
"""

def __init__(self):
"""
Define PyTorch model parameters here.
"""
pass

def prepare_input(self, data):
"""
Perform data transformation from raw input to vector/matrix inputs.
:param data: raw inputs
:return (X, Y): tuple, input features and labels
"""
raise NotImplementedError

def mode(self, test=False):
"""
Tell the network to be trained or not, required by PyTorch.
:param test: bool
"""
raise NotImplementedError

def define_optimizer(self):
"""
Define PyTorch optimizer specified by the models.
"""
raise NotImplementedError

def data_forward(self, *x):
"""
Forward pass of the data.
:param x: input feature matrix and label vector
:return: output by the models
"""
# required by PyTorch nn
raise NotImplementedError

def grad_backward(self):
"""
Perform gradient descent to update the models parameters.
"""
raise NotImplementedError

def get_loss(self, pred, truth):
"""
Compute loss given models prediction and ground truth. Loss function specified by the models.
:param pred: prediction label vector
:param truth: ground truth label vector
:return: a scalar
"""
raise NotImplementedError


class ToyController(BaseController):
"""This is for code testing."""

def __init__(self):
super(ToyController, self).__init__()
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.test_mode = test

def data_forward(self, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self):
print("loss gradient backward")

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss

def define_optimizer(self):
pass


class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`


+ 6
- 1
fastNLP/models/char_language_model.py View File

@@ -6,11 +6,16 @@ import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from model.base_model import BaseModel
from torch.autograd import Variable

from fastNLP.models.base_model import BaseModel

USE_GPU = True

"""
To be deprecated.
"""


class CharLM(BaseModel):
"""


+ 1
- 90
fastNLP/models/word_seg_model.py View File

@@ -1,95 +1,6 @@
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from fastNLP.models.base_model import BaseModel, BaseController

USE_GPU = True


def to_var(x):
if torch.cuda.is_available() and USE_GPU:
x = x.cuda()
return Variable(x)


class WordSegModel(BaseController):
"""
Model controller for WordSeg
"""

def __init__(self):
super(WordSegModel, self).__init__()
self.id2word = None
self.word2id = None
self.id2tag = None
self.tag2id = None

self.lstm_batch_size = 8
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len
self.hidden_dim = 100
self.lstm_num_layers = 2
self.vocab_size = 100
self.word_emb_dim = 100

self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

self.optimizer = None
self._loss = None

def prepare_input(self, data):
"""
perform word indices lookup to convert strings into indices
:param data: list of string, each string contains word + space + [B, M, E, S]
:return
"""
word_list = []
tag_list = []
for line in data:
if len(line) > 2:
tokens = line.split("#")
word_list.append(tokens[0])
tag_list.append(tokens[2][0])
self.id2word = list(set(word_list))
self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
self.id2tag = list(set(tag_list))
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
return words, tags

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, x):
"""
:param x: sequence of length [batch_size], word indices
:return:
"""
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
output, self.hidden = self.model(x, self.hidden)
return output

def define_optimizer(self):
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, pred, truth):

self._loss = nn.CrossEntropyLoss(pred, truth)
return self._loss

def grad_backward(self):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
self.optimizer.step()
from fastNLP.models.base_model import BaseModel


class WordSeg(BaseModel):


+ 24
- 0
fastNLP/modules/convolution/avg_pool.py View File

@@ -0,0 +1,24 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
import torch.nn.functional as F


class AvgPool(nn.Module):
"""1-d average pooling module."""

def __init__(self, stride=None, padding=0):
super(AvgPool, self).__init__()
self.stride = stride
self.padding = padding

def forward(self, x):
# [N,C,L] -> [N,C]
kernel_size = x.size(2)
x = F.avg_pool1d(
input=x,
kernel_size=kernel_size,
stride=self.stride,
padding=self.padding)
return x.squeeze(dim=-1)

+ 28
- 0
fastNLP/modules/convolution/conv.py View File

@@ -0,0 +1,28 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
# import torch.nn.functional as F


class Conv(nn.Module):
"""
Basic 1-d convolution module.
"""

def __init__(self, in_channels, out_channels, kernel_size,
stride=1, padding=0, dilation=1,
groups=1, bias=True):
super(Conv, self).__init__()
self.conv = nn.Conv1d(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias=bias)

def forward(self, x):
return self.conv(x) # [N,C,L]
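
A shape sketch for the Conv wrapper above, assuming a batch of 2 sequences with 50 input channels and length 30:

# Sketch only: 1-d convolution keeps the [N, C, L] layout.
import torch
conv = Conv(in_channels=50, out_channels=100, kernel_size=3, padding=1)
x = torch.randn(2, 50, 30)        # [N, C_in, L]
print(conv(x).size())             # -> torch.Size([2, 100, 30]), i.e. [N, C_out, L]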

+ 20
- 0
fastNLP/modules/convolution/kmax_pool.py View File

@@ -0,0 +1,20 @@
# python: 3.6
# encoding: utf-8

import torch
import torch.nn as nn
# import torch.nn.functional as F


class KMaxPool(nn.Module):
"""K max-pooling module."""

def __init__(self, k):
super(KMaxPool, self).__init__()
self.k = k

def forward(self, x):
# [N,C,L] -> [N,C*k]
x, index = torch.topk(x, self.k, dim=-1, sorted=False)
x = torch.reshape(x, (x.size(0), -1))
return x
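
A shape sketch for KMaxPool: it keeps the k largest activations per channel and flattens them.

# Sketch only: k-max pooling over the length dimension.
import torch
pool = KMaxPool(k=2)
x = torch.randn(4, 16, 30)        # [N, C, L]
print(pool(x).size())             # -> torch.Size([4, 32]), i.e. [N, C*k]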

+ 26
- 0
fastNLP/modules/convolution/max_pool.py View File

@@ -0,0 +1,26 @@
# python: 3.6
# encoding: utf-8

import torch.nn as nn
import torch.nn.functional as F


class MaxPool(nn.Module):
"""1-d max-pooling module."""

def __init__(self, stride=None, padding=0, dilation=1):
super(MaxPool, self).__init__()
self.stride = stride
self.padding = padding
self.dilation = dilation

def forward(self, x):
# [N,C,L] -> [N,C]
kernel_size = x.size(2)
x = F.max_pool1d(
input=x,
kernel_size=kernel_size,
stride=self.stride,
padding=self.padding,
dilation=self.dilation)
return x.squeeze(dim=-1)

+ 41
- 0
fastNLP/modules/prototype/README.md View File

@@ -0,0 +1,41 @@
# Prototype

## Word2Idx.py
A mapping between words and indices

## embedding.py
Embedding modules

Contains a simple wrapper around torch.nn.Embedding

## encoder.py
Encoder modules

Contains a simple wrapper around torch.nn.LSTM

## aggregation.py
Aggregation modules

Contains a self-attention model, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130

## predict.py
Prediction modules

Contains a two-layer perceptron for classification

## example.py
An example showing how to use the above modules to build a model

Contains a model for sentiment analysis on the Yelp dataset, together with its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.

## prepare.py
An example of using Word2Idx to build the Yelp datasets

## dataloader.py
A dataloader for the Yelp dataset

It is an iterable object, returning a zero-padded batch every iteration.





+ 63
- 0
fastNLP/modules/prototype/Word2Idx.py View File

@@ -0,0 +1,63 @@
import collections
import pickle

class Word2Idx():
"""
Build a word index according to word frequency.

If "min_freq" is given, then only words with a frequncy not lesser than min_freq will be kept.
If "max_num" is given, then at most the most frequent $max_num words will be kept.
"words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
num is the size of the lookup table.
w2i is a lookup table assigning each word an index.
i2w is a vector which serves as an invert mapping of w2i.
Note that index 0 is token "<PAD>" for padding
index 1 is token "<UNK>" for unregistered words
e.g. i2w[w2i["word"]] == "word"
"""
def __init__(self):
self.__w2i = dict()
self.__i2w = []
self.num = 0

def build(self, words, min_freq=0, max_num=None):
"""build a model from words"""
counter = collections.Counter(words)
word_set = set(words)
if max_num is not None:
most_common = counter.most_common(min(len(word_set), max_num - 1))
else:
most_common = counter.most_common()
self.__w2i = dict((w[0], i + 2) for i, w in enumerate(most_common) if w[1] >= min_freq)
self.__w2i["<PAD>"] = 0
self.__w2i["<UNK>"] = 1
self.__i2w = ["<PAD>", "<UNK>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
self.num = len(self.__i2w)

def w2i(self, word):
"""word to index"""
if word in self.__w2i:
return self.__w2i[word]
return 1  # index of "<UNK>" for unregistered words

def i2w(self, idx):
"""index to word"""
if idx >= self.num:
raise Exception("out of range\n")
return self.__i2w[idx]

def save(self, addr):
"""save the model to a file with address "addr" """
f = open(addr,"wb")
pickle.dump([self.__i2w, self.__w2i, self.num], f)
f.close()

def load(self, addr):
"""load a model from a file with address "addr" """
f = open(addr,"rb")
paras = pickle.load(f)
self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]
f.close()
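
A minimal usage sketch, assuming the corrected index offsets above; the toy corpus is hypothetical:

# Sketch only: build a vocabulary from a toy corpus and map words both ways.
words = "the cat sat on the mat".split()
vocab = Word2Idx()
vocab.build(words)
idx = vocab.w2i("the")            # "the" is the most frequent word, so idx == 2
print(vocab.i2w(idx))             # -> "the"
print(vocab.w2i("dog"))           # unseen word falls back to 1, the "<UNK>" index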



+ 40
- 0
fastNLP/modules/prototype/aggregation.py View File

@@ -0,0 +1,40 @@
import torch
import torch.nn as nn
from torch.autograd import Variable

class Selfattention(nn.Module):
"""
Self Attention Module.

Args:
input_size : the size for the input vector
d_a : the width of weight matrix
r : the number of encoded vectors
"""
def __init__(self, input_size, d_a, r):
super(Selfattention, self).__init__()
self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
self.softmax = nn.Softmax(dim=2)
self.tanh = nn.Tanh()

def penalization(self, A):
"""
compute the penalization term for attention module
"""
if self.W_s1.is_cuda:
I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
else:
I = Variable(torch.eye(A.size(1)), requires_grad=False)
M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
M = M.view(M.size(0), -1)
return torch.sum(M ** 2, dim=1)
def forward(self, x):
inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
A = self.softmax(torch.matmul(self.W_s2, inter))
out = torch.matmul(A, x)
out = out.view(out.size(0), -1)
penalty = self.penalization(A)
return out, penalty
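
A shape sketch for the self-attention module, with sizes chosen to mimic a bidirectional LSTM output:

# Sketch only: r attention heads over [batch, length, 2*hidden] inputs.
import torch
attn = Selfattention(input_size=600, d_a=350, r=10)
x = torch.randn(8, 40, 600)
out, penalty = attn(x)
print(out.size())                 # -> torch.Size([8, 6000]), r * input_size flattened per sample
print(penalty.size())             # -> torch.Size([8]), one penalization term per sample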


+ 81
- 0
fastNLP/modules/prototype/dataloader.py View File

@@ -0,0 +1,81 @@
import random
import pickle
import torch
import numpy as np
from torch.autograd import Variable

def float_wrapper(x, requires_grad=True, using_cuda=True):
"""
transform float type list to pytorch variable
"""
if using_cuda==True:
return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
else:
return Variable(torch.FloatTensor(x), requires_grad=requires_grad)

def long_wrapper(x, requires_grad=True, using_cuda=True):
"""
transform long type list to pytorch variable
"""
if using_cuda==True:
return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
else:
return Variable(torch.LongTensor(x), requires_grad=requires_grad)
def pad(X, using_cuda):
"""
zero-pad sequences to the same length, then stack them together
"""
maxlen = max([x.size(0) for x in X])
Y = []
for x in X:
padlen = maxlen - x.size(0)
if padlen > 0:
if using_cuda:
paddings = Variable(torch.zeros(padlen).long()).cuda()
else:
paddings = Variable(torch.zeros(padlen).long())
x_ = torch.cat((x, paddings), 0)
Y.append(x_)
else:
Y.append(x)
return torch.stack(Y)

class DataLoader(object):
"""
load data with form {"feature", "class"}

Args:
fdir : data file address
batch_size : batch_size
shuffle : if True, shuffle dataset every epoch
using_cuda : if True, return tensors on GPU
"""
def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
with open(fdir, "rb") as f:
self.data = pickle.load(f)
self.batch_size = batch_size
self.num = len(self.data)
self.count = 0
self.iters = int(self.num / batch_size)
self.shuffle = shuffle
self.using_cuda = using_cuda
def __iter__(self):
return self

def __next__(self):
if self.count == self.iters:
self.count = 0
if self.shuffle:
random.shuffle(self.data)
raise StopIteration()
else:
batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
self.count += 1
X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
X = pad(X, self.using_cuda)
y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
return {"feature" : X, "class" : y}


+ 23
- 0
fastNLP/modules/prototype/embedding.py View File

@@ -0,0 +1,23 @@
import torch
import torch.nn as nn

class Lookuptable(nn.Module):
"""
A simple lookup table

Args:
nums : the size of the lookup table
dims : the size of each vector
padding_idx : pads the tensor with zeros whenever it encounters this index
sparse : If True, gradient matrix will be a sparse tensor. In this case,
only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
"""
def __init__(self, nums, dims, padding_idx=0, sparse=False):
super(Lookuptable, self).__init__()
self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
def forward(self, x):
return self.embed(x)

if __name__ == "__main__":
model = Lookuptable(10, 20)

+ 22
- 0
fastNLP/modules/prototype/encoder.py View File

@@ -0,0 +1,22 @@
import torch
import torch.nn as nn

class Lstm(nn.Module):
"""
LSTM module

Args:
input_size : input size
hidden_size : hidden size
num_layers : number of hidden layers
dropout : dropout rate
bidirectional : If True, becomes a bidirectional RNN
"""
def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
super(Lstm, self).__init__()
self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,\
dropout=dropout, bidirectional=bidirectional)
def forward(self, x):
x, _ = self.lstm(x)
return x
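
A shape sketch for the Lstm wrapper; note it is batch-first and returns only the output sequence:

# Sketch only: bidirectional output doubles the hidden size.
import torch
lstm = Lstm(input_size=100, hidden_size=300, num_layers=1, dropout=0.5, bidirectional=True)
x = torch.randn(8, 40, 100)       # [batch, length, input_size]
print(lstm(x).size())             # -> torch.Size([8, 40, 600])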

+ 129
- 0
fastNLP/modules/prototype/example.py View File

@@ -0,0 +1,129 @@
import torch
import torch.nn as nn
import encoder
import aggregation
import embedding
import predict
import torch.optim as optim
import time
import dataloader

WORD_NUM = 357361
WORD_SIZE = 100
HIDDEN_SIZE = 300
D_A = 350
R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5

class Net(nn.Module):
"""
A model for sentiment analysis using lstm and self-attention
"""
def __init__(self):
super(Net, self).__init__()
self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

def forward(self, x):
x = self.embedding(x)
x = self.encoder(x)
x, penalty = self.aggregation(x)
x = self.predict(x)
return x, penalty

def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
"""
training procedure

Args:
If model_dict is given (a file path), training continues from the given model.
Otherwise, a new model is trained from scratch.
If using_cuda is True, training is conducted on the GPU.
learning_rate and momentum are passed to the SGD optimizer.
coef is the coefficient weighting the penalization term against the cross-entropy loss.
interval is the reporting frequency (in iterations).

The resulting model is saved as "model_dict_<current time>.dict", which can be used for further training.
"""
if using_cuda:
net = Net().cuda()
else:
net = Net()
if model_dict != None:
net.load_state_dict(torch.load(model_dict))

optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
criterion = nn.CrossEntropyLoss()
dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)

#statistics
loss_count = 0
prepare_time = 0
run_time = 0
count = 0

for epoch in range(epochs):
print("epoch: %d"%(epoch))
for i, batch in enumerate(dataset):
t1 = time.time()
X = batch["feature"]
y = batch["class"]
t2 = time.time()
y_pred, y_penl = net(X)
loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm(net.parameters(), 0.5)
optimizer.step()
t3 = time.time()

loss_count += torch.sum(y_penl).data[0]
prepare_time += (t2 - t1)
run_time += (t3 - t2)
p, idx = torch.max(y_pred.data, dim=1)
count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))

if (i + 1) % interval == 0:
print("epoch : %d, iters: %d"%(epoch, i + 1))
print("loss count:" + str(loss_count / (interval * batch_size)))
print("acuracy:" + str(count / (interval * batch_size)))
print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
print("prepare time:" + str(prepare_time))
print("run time:" + str(run_time))
prepare_time = 0
run_time = 0
loss_count = 0
count = 0
string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
torch.save(net.state_dict(), "model_dict_%s.dict"%(string))

def test(model_dict, using_cuda=True):
if using_cuda:
net = Net().cuda()
else:
net = Net()
net.load_state_dict(torch.load(model_dict))
dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
count = 0
for i, batch in enumerate(dataset):
X = batch["feature"]
y = batch["class"]
y_pred, _ = net(X)
p, idx = torch.max(y_pred.data, dim=1)
count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
print("accuracy: %f"%(count / dataset.num))

if __name__ == "__main__":
train(using_cuda=torch.cuda.is_available())


+ 25
- 0
fastNLP/modules/prototype/predict.py View File

@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
"""
A two layers perceptron for classification.

Output : Unnormalized probability distribution (logits)
Args:
input_size : the size of input
hidden_size : the size of hidden layer
output_size : the size of output
"""
def __init__(self, input_size, hidden_size, output_size):
super(MLP,self).__init__()
self.L1 = nn.Linear(input_size, hidden_size)
self.L2 = nn.Linear(hidden_size, output_size)

def forward(self, x):
out = self.L2(F.relu(self.L1(x)))
return out

if __name__ == "__main__":
MLP(20, 30, 20)

+ 50
- 0
fastNLP/modules/prototype/prepare.py View File

@@ -0,0 +1,50 @@
import pickle
import Word2Idx

def get_sets(m, n):
"""
get a train set containing m samples and a test set containing n samples
"""
samples = pickle.load(open("tuples.pkl","rb"))
if m+n > len(samples):
print("asking for too many tuples\n")
return
train_samples = samples[ : m]
test_samples = samples[m: m+n]
return train_samples, test_samples

def build_wordidx():
"""
build wordidx using word2idx
"""
train, test = get_sets(500000, 2000)
words = []
for x in train:
words += x[0]
wordidx = Word2Idx.Word2Idx()
wordidx.build(words)
print(wordidx.num)
print(wordidx.i2w(0))
wordidx.save("wordidx.pkl")

def build_sets():
"""
build train set and test set, transform word to index
"""
train, test = get_sets(500000, 2000)
wordidx = Word2Idx.Word2Idx()
wordidx.load("wordidx.pkl")
train_set = []
for x in train:
sent = [wordidx.w2i(w) for w in x[0]]
train_set.append({"sent" : sent, "class" : x[1]})
test_set = []
for x in test:
sent = [wordidx.w2i(w) for w in x[0]]
test_set.append({"sent" : sent, "class" : x[1]})
pickle.dump(train_set, open("train_set.pkl", "wb"))
pickle.dump(test_set, open("test_set.pkl", "wb"))

if __name__ == "__main__":
build_wordidx()
build_sets()

+ 14
- 0
fastNLP/saver/base_saver.py View File

@@ -0,0 +1,14 @@
class BaseSaver(object):
"""base class for all savers"""

def __init__(self, save_path):
self.save_path = save_path

def save_bytes(self):
raise NotImplementedError

def save_str(self):
raise NotImplementedError

def compress(self):
raise NotImplementedError

+ 12
- 0
fastNLP/saver/logger.py View File

@@ -0,0 +1,12 @@
from fastNLP.saver.base_saver import BaseSaver


class Logger(BaseSaver):
"""Logging"""

def __init__(self, save_path):
super(Logger, self).__init__(save_path)

def log(self, string):
with open(self.save_path, "a") as f:
f.write(string)

+ 8
- 0
fastNLP/saver/model_saver.py View File

@@ -0,0 +1,8 @@
from fastNLP.saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
"""Save a models"""

def __init__(self, save_path):
super(ModelSaver, self).__init__(save_path)

+ 9
- 12
test/test_word_seg.py View File

@@ -1,23 +1,20 @@
from loader.base_loader import BaseLoader
from model.word_seg_model import WordSegModel
from fastNLP.action.tester import Tester
from fastNLP.action.trainer import WordSegTrainer
from fastNLP.loader.base_loader import BaseLoader
from fastNLP.models.word_seg_model import WordSeg

from fastNLP.action import Tester
from fastNLP.action.trainer import Trainer


def test_charlm():
train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
def test_wordseg():
train_config = WordSegTrainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
log_per_step=10, log_validation=False, batch_size=254)
trainer = Trainer(train_config)
trainer = WordSegTrainer(train_config)

model = WordSegModel()
model = WordSeg(100, 2, 1000)

train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines()

trainer.train(model, train_data)

trainer.save_model(model)

test_config = Tester.TestConfig(save_output=False, validate_in_training=False,
save_dev_input=False, save_loss=False, batch_size=254)
tester = Tester(test_config)
@@ -28,4 +25,4 @@ def test_charlm():


if __name__ == "__main__":
test_charlm()
test_wordseg()
