
Merge pull request #3 from fastnlp/dev

primary framework built
tags/v0.1.0
Coet GitHub 6 years ago
parent commit ea87f250d4
21 changed files with 4158 additions and 55 deletions
1. +1 -1 README.md
2. +8 -0 action/README.md
3. +34 -4 action/action.py
4. +80 -2 action/tester.py
5. +82 -5 action/trainer.py
6. +16 -0 loader/base_loader.py
7. +1 -2 loader/config_loader.py
8. +0 -0 model/__init__.py
9. +82 -0 model/base_model.py
10. +356 -0 model/char_language_model.py
11. +6 -12 reproduction/CNN-sentence_classification/train.py
12. +9 -9 reproduction/Char-aware_NLM/test.py
13. +27 -20 reproduction/Char-aware_NLM/train.py
14. +14 -0 saver/base_saver.py
15. +0 -0 saver/empty.txt
16. +12 -0 saver/logger.py
17. +8 -0 saver/model_saver.py
18. +3370 -0 tests/data_for_tests/charlm.txt
19. +31 -0 tests/test_charlm.py
20. +0 -0 tests/test_loader.py
21. +21 -0 tests/test_trainer.py

+1 -1 README.md

@@ -1,2 +1,2 @@
# FastNLP
FastNLP

+8 -0 action/README.md

@@ -0,0 +1,8 @@
SpaCy "Doc"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80

SpaCy "Vocab"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25

SpaCy "Token"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27

+34 -4 action/action.py

@@ -1,3 +1,6 @@
from saver.logger import Logger


class Action(object):
"""
base class for Trainer and Tester
@@ -5,12 +8,39 @@ class Action(object):


def __init__(self):
super(Action, self).__init__()
self.logger = Logger("logger_output.txt")


def load_config(self, args):
pass
raise NotImplementedError


def load_dataset(self, args):
pass
raise NotImplementedError

def log(self, string):
self.logger.log(string)

def batchify(self, batch_size, X, Y=None):
"""
:param batch_size: int
:param X: feature matrix of size [n_sample, m_feature]
:param Y: label vector of size [n_sample, 1] (optional)
:return iteration: int, the number of steps in each epoch
generator: generator, yielding batch inputs
"""
n_samples = X.size()[0]
num_iter = n_samples // batch_size
if Y is None:
generator = self._batch_generate(batch_size, num_iter, X)
else:
generator = self._batch_generate(batch_size, num_iter, X, Y)
return num_iter, generator

@staticmethod
def _batch_generate(batch_size, num_iter, *data):
for step in range(num_iter):
start = batch_size * step
end = batch_size * (step + 1)
yield tuple([x[start:end] for x in data])


def log(self, args):
pass
def make_log(self, *args):
return "log"

+80 -2 action/tester.py

@@ -1,9 +1,87 @@
from collections import namedtuple

import numpy as np

from action.action import Action




class Tester(Action):
"""docstring for Tester"""


def __init__(self, arg):
TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
"save_loss", "batch_size"])

def __init__(self, test_args):
"""
:param test_args: named tuple
"""
super(Tester, self).__init__()
self.arg = arg
self.validate_in_training = test_args.validate_in_training
self.save_dev_input = test_args.save_dev_input
self.valid_x = None
self.valid_y = None
self.save_output = test_args.save_output
self.output = None
self.save_loss = test_args.save_loss
self.mean_loss = None
self.batch_size = test_args.batch_size

def test(self, network, data):
print("testing")
network.mode(test=True) # turn on the testing mode
if self.save_dev_input:
if self.valid_x is None:
valid_x, valid_y = network.prepare_input(data)
self.valid_x = valid_x
self.valid_y = valid_y
else:
valid_x = self.valid_x
valid_y = self.valid_y
else:
valid_x, valid_y = network.prepare_input(data)

# split into batches by self.batch_size
iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)

batch_output = list()
loss_history = list()
# turn on the testing mode of the network
network.mode(test=True)

for step in range(iterations):
batch_x, batch_y = test_batch_generator.__next__()

# forward pass from test input to predicted output
prediction = network.data_forward(batch_x)

loss = network.get_loss(prediction, batch_y)

if self.save_output:
batch_output.append(prediction.data)
if self.save_loss:
loss_history.append(loss)
self.log(self.make_log(step, loss))

if self.save_loss:
self.mean_loss = np.mean(np.array(loss_history))
if self.save_output:
self.output = self.make_output(batch_output)

@property
def loss(self):
return self.mean_loss

@property
def result(self):
return self.output

@staticmethod
def make_output(batch_outputs):
# construct full prediction with batch outputs
return np.concatenate(batch_outputs, axis=0)

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
raise NotImplementedError
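
For reference, the aggregation at the end of Tester.test reduces to the following standalone numpy sketch (values are illustrative, not from the diff):

import numpy as np

# per-batch losses and predictions collected in the loop above
loss_history = [0.52, 0.48, 0.50]
batch_output = [np.zeros((16, 5)), np.ones((16, 5))]

mean_loss = np.mean(np.array(loss_history))    # scalar later exposed via Tester.loss
output = np.concatenate(batch_output, axis=0)  # shape (32, 5), exposed via Tester.result
print(mean_loss, output.shape)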

+82 -5 action/trainer.py

@@ -1,14 +1,91 @@
from action.action import Action
from collections import namedtuple

from .action import Action
from .tester import Tester




class Trainer(Action):
"""
Trainer for common training logic of all models
"""
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
"log_per_step", "log_validation", "batch_size"])


def __init__(self, arg):
def __init__(self, train_args):
"""
:param train_args: namedtuple
"""
super(Trainer, self).__init__()
self.arg = arg
self.n_epochs = train_args.epochs
self.validate = train_args.validate
self.save_when_better = train_args.save_when_better
self.log_per_step = train_args.log_per_step
self.log_validation = train_args.log_validation
self.batch_size = train_args.batch_size

def train(self, network, train_data, dev_data):
"""
:param network: the model controller
:param train_data: raw data for training
:param dev_data: raw data for validation
:return:
"""
train_x, train_y = network.prepare_input(train_data)

iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)

test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=self.batch_size)
evaluator = Tester(test_args)

best_loss = 1e10
loss_history = list()

for epoch in range(self.n_epochs):
network.mode(test=False) # turn on the train mode

network.define_optimizer()
for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__()

prediction = network.data_forward(batch_x)

loss = network.get_loss(prediction, batch_y)
network.grad_backward()

if step % self.log_per_step == 0:
print("step ", step)
loss_history.append(loss)
self.log(self.make_log(epoch, step, loss))

#################### evaluate over dev set ###################
if self.validate:
# give all controls to tester
evaluator.test(network, dev_data)

if self.log_validation:
self.log(self.make_valid_log(epoch, evaluator.loss))
if evaluator.loss < best_loss:
best_loss = evaluator.loss
if self.save_when_better:
self.save_model(network)

# finish training

def make_log(self, *args):
return "make a log"

def make_valid_log(self, *args):
return "make a valid log"

def save_model(self, model):
model.save()

def load_data(self, data_name):
print("load data")

def load_config(self, args):
raise NotImplementedError


def train(self, args):
pass
def load_dataset(self, args):
raise NotImplementedError
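
Trainer.train and Tester.test drive the model through a fixed set of calls; below is a skeleton of the interface a compatible network needs to provide. The method names are taken from the calls above; the class name and placeholder bodies are illustrative, not code from the diff:

class CompatibleNetwork(object):
    """Sketch of the model-controller interface assumed by Trainer/Tester."""

    def prepare_input(self, raw_data):
        # split raw data into (features, labels); both must support .size() and slicing
        raise NotImplementedError

    def mode(self, test=False):
        # switch between training and evaluation behaviour
        raise NotImplementedError

    def define_optimizer(self):
        # (re)create the optimizer; Trainer calls this once per epoch
        raise NotImplementedError

    def data_forward(self, batch_x):
        # forward pass, returns predictions for one batch
        raise NotImplementedError

    def get_loss(self, prediction, batch_y):
        # compute the loss; must be comparable (used for best-loss tracking)
        raise NotImplementedError

    def grad_backward(self):
        # backward pass plus optimizer step
        raise NotImplementedError

    def save(self):
        # persist the model; used when save_when_better is set
        raise NotImplementedError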

+16 -0 loader/base_loader.py

@@ -13,3 +13,19 @@ class BaseLoader(object):
with open(self.data_path, "r", encoding="utf-8") as f:
text = f.read()
return text


class ToyLoader0(BaseLoader):
"""
For charLM
"""

def __init__(self, name, path):
super(ToyLoader0, self).__init__(name, path)

def load(self):
with open(self.data_path, 'r') as f:
corpus = f.read().lower()
import re
corpus = re.sub(r"<unk>", "unk", corpus)
return corpus.split()
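
A quick ToyLoader0 usage sketch (the file path is illustrative and assumes the script runs from the repository root):

from loader.base_loader import ToyLoader0

loader = ToyLoader0("charlm_corpus", "tests/data_for_tests/charlm.txt")
words = loader.load()  # lower-cased tokens, with "<unk>" rewritten to "unk"
print(len(words), words[:5])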

+1 -2 loader/config_loader.py

@@ -10,5 +10,4 @@ class ConfigLoader(BaseLoader):


@staticmethod
def parse(string):
# To do
return string
raise NotImplementedError

model/empty.txt → model/__init__.py


+82 -0 model/base_model.py

@@ -0,0 +1,82 @@
import numpy as np


class BaseModel(object):
"""PyTorch base model for all models"""

def __init__(self):
pass

def prepare_input(self, data):
"""
:param data: str, raw input vector(?)
:return (X, Y): tuple, input features and labels
"""
raise NotImplementedError

def mode(self, test=False):
raise NotImplementedError

def data_forward(self, *x):
# required by PyTorch nn
raise NotImplementedError

def grad_backward(self):
raise NotImplementedError

def get_loss(self, pred, truth):
raise NotImplementedError


class ToyModel(BaseModel):
"""This is for code testing."""

def __init__(self):
super(ToyModel, self).__init__()
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.test_mode = test

def data_forward(self, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self):
print("loss gradient backward")

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss


class Vocabulary(object):
"""
A collection of lookup tables.
"""

def __init__(self):
self.word_set = None
self.word2idx = None
self.emb_matrix = None

def lookup(self, word):
if word in self.word_set:
return self.emb_matrix[self.word2idx[word]]
raise LookupError("The key " + word + " does not exist.")


class Document(object):
"""
contains a sequence of tokens
each token is a character with linguistic attributes
"""

def __init__(self):
# wrap pandas.dataframe
self.dataframe = None

+356 -0 model/char_language_model.py

@@ -0,0 +1,356 @@
import os
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel

USE_GPU = True


class CharLM(BaseModel):

"""
Controller of the Character-level Neural Language Model
To do:
- where the data goes, call data savers.
"""
DataTuple = namedtuple("DataTuple", ["feature", "label"])

def __init__(self, lstm_batch_size, lstm_seq_len):
super(CharLM, self).__init__()
"""
Settings: should come from config loader or pre-processing
"""
self.word_embed_dim = 300
self.char_embedding_dim = 15
self.cnn_batch_size = lstm_batch_size * lstm_seq_len
self.lstm_seq_len = lstm_seq_len
self.lstm_batch_size = lstm_batch_size
self.num_epoch = 10
self.old_PPL = 100000
self.best_PPL = 100000

"""
These parameters are set by pre-processing.
"""
self.max_word_len = None
self.num_char = None
self.vocab_size = None
self.preprocess("./data_for_tests/charlm.txt")

self.data = None # named tuple to store all data set
self.data_ready = False
self.criterion = nn.CrossEntropyLoss()
self._loss = None
self.use_gpu = USE_GPU

# word_emb_dim == hidden_size / num of hidden units
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

self.model = charLM(self.char_embedding_dim,
self.word_embed_dim,
self.vocab_size,
self.num_char,
use_gpu=self.use_gpu)
for param in self.model.parameters():
nn.init.uniform(param.data, -0.05, 0.05)

self.learning_rate = 0.1
self.optimizer = None

def prepare_input(self, raw_text):
"""
:param raw_text: raw input text consisting of words
:return: torch.Tensor, torch.Tensor
feature matrix, label vector
This function is called only once in Trainer.train, but may be called multiple times in Tester.test,
so Tester saves the test input for these repeated calls.
"""
if os.path.exists("cache/prep.pt") is False:
self.preprocess("./data_for_tests/charlm.txt") # To do: This is not good. Need to fix..
objects = torch.load("cache/prep.pt")
word_dict = objects["word_dict"]
char_dict = objects["char_dict"]
max_word_len = self.max_word_len
print("word/char dictionary built. Start making inputs.")

words = raw_text
input_vec = np.array(text2vec(words, char_dict, max_word_len))
# Labels are next-word index in word_dict with the same length as inputs
input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]])
feature_input = torch.from_numpy(input_vec)
label_input = torch.from_numpy(input_label)
return feature_input, label_input

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, x):
"""
:param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2]
:return: Tensor of size [num_words, ?]
"""
# additional processing of inputs after batching
num_seq = x.size()[0] // self.lstm_seq_len
x = x[:num_seq * self.lstm_seq_len, :]
x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2)

# detach hidden state of LSTM from last batch
hidden = [state.detach() for state in self.hidden]
output, self.hidden = self.model(to_var(x), hidden)
return output

def grad_backward(self):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
self.optimizer.step()

def get_loss(self, predict, truth):
self._loss = self.criterion(predict, to_var(truth))
return self._loss.data  # no PyTorch data structure exposed outside

def define_optimizer(self):
# redefine optimizer for every new epoch
self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

def save(self):
print("network saved")
# torch.save(self.model, "cache/model.pkl")

def preprocess(self, all_text_files):
word_dict, char_dict = create_word_char_dict(all_text_files)
num_char = len(char_dict)
self.vocab_size = len(word_dict)
char_dict["BOW"] = num_char + 1
char_dict["EOW"] = num_char + 2
char_dict["PAD"] = 0
self.num_char = num_char + 3
# char_dict maps each character (plus BOW/EOW/PAD) to an int index
reverse_word_dict = {value: key for key, value in word_dict.items()}
self.max_word_len = max([len(word) for word in word_dict])
objects = {
"word_dict": word_dict,
"char_dict": char_dict,
"reverse_word_dict": reverse_word_dict,
}
torch.save(objects, "cache/prep.pt")
print("Preprocess done.")


"""
Global Functions
"""


def batch_generator(x, batch_size):
# x: [num_words, in_channel, height, width]
# partitions x into batches
num_step = x.size()[0] // batch_size
for t in range(num_step):
yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
""" Return list of list of int """
word_vec = []
for word in words:
vec = [char_dict[ch] for ch in word]
if len(vec) < max_word_len:
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
word_vec.append(vec)
return word_vec


def read_data(file_name):
with open(file_name, 'r') as f:
corpus = f.read().lower()
import re
corpus = re.sub(r"<unk>", "unk", corpus)
return corpus.split()


def get_char_dict(vocabulary):
char_dict = dict()
count = 1
for word in vocabulary:
for ch in word:
if ch not in char_dict:
char_dict[ch] = count
count += 1
return char_dict


def create_word_char_dict(*file_name):
text = []
for file in file_name:
text += read_data(file)
word_dict = {word: ix for ix, word in enumerate(set(text))}
char_dict = get_char_dict(word_dict)
return word_dict, char_dict


def to_var(x):
if torch.cuda.is_available() and USE_GPU:
x = x.cuda()
return Variable(x)


"""
Neural Network
"""


class Highway(nn.Module):
"""Highway network"""

def __init__(self, input_size):
super(Highway, self).__init__()
self.fc1 = nn.Linear(input_size, input_size, bias=True)
self.fc2 = nn.Linear(input_size, input_size, bias=True)

def forward(self, x):
t = F.sigmoid(self.fc1(x))
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
"""Character-level Neural Language Model
CNN + highway network + LSTM
# Input:
4D tensor with shape [batch_size, in_channel, height, width]
# Output:
2D Tensor with shape [batch_size, vocab_size]
# Arguments:
char_emb_dim: the size of each character's embedding
word_emb_dim: the size of each word's embedding
vocab_size: num of unique words
num_char: num of characters
use_gpu: True or False
"""

def __init__(self, char_emb_dim, word_emb_dim,
vocab_size, num_char, use_gpu):
super(charLM, self).__init__()
self.char_emb_dim = char_emb_dim
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size

# char embedding layer
self.char_embed = nn.Embedding(num_char, char_emb_dim)

# convolutions of filters with different sizes
self.convolutions = []

# list of tuples: (number of filters, filter width)
# self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
self.filter_num_width = [(25, 1), (50, 2), (75, 3)]

for out_channel, filter_width in self.filter_num_width:
self.convolutions.append(
nn.Conv2d(
1, # in_channel
out_channel, # out_channel
kernel_size=(char_emb_dim, filter_width), # (height, width)
bias=True
)
)

self.highway_input_dim = sum([x for x, y in self.filter_num_width])

self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

# highway net
self.highway1 = Highway(self.highway_input_dim)
self.highway2 = Highway(self.highway_input_dim)

# LSTM
self.lstm_num_layers = 2

self.lstm = nn.LSTM(input_size=self.highway_input_dim,
hidden_size=self.word_emb_dim,
num_layers=self.lstm_num_layers,
bias=True,
dropout=0.5,
batch_first=True)

# output layer
self.dropout = nn.Dropout(p=0.5)
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

if use_gpu is True:
for x in range(len(self.convolutions)):
self.convolutions[x] = self.convolutions[x].cuda()
self.highway1 = self.highway1.cuda()
self.highway2 = self.highway2.cuda()
self.lstm = self.lstm.cuda()
self.dropout = self.dropout.cuda()
self.char_embed = self.char_embed.cuda()
self.linear = self.linear.cuda()
self.batch_norm = self.batch_norm.cuda()

def forward(self, x, hidden):
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
# Return: Variable of Tensor with shape [num_words, len(word_dict)]
lstm_batch_size = x.size()[0]
lstm_seq_len = x.size()[1]

x = x.contiguous().view(-1, x.size()[2])
# [num_seq*seq_len, max_word_len+2]

x = self.char_embed(x)
# [num_seq*seq_len, max_word_len+2, char_emb_dim]

x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
# [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

x = self.conv_layers(x)
# [num_seq*seq_len, total_num_filters]

x = self.batch_norm(x)
# [num_seq*seq_len, total_num_filters]

x = self.highway1(x)
x = self.highway2(x)
# [num_seq*seq_len, total_num_filters]

x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
# [num_seq, seq_len, total_num_filters]

x, hidden = self.lstm(x, hidden)
# [seq_len, num_seq, hidden_size]

x = self.dropout(x)
# [seq_len, num_seq, hidden_size]

x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
# [num_seq*seq_len, hidden_size]

x = self.linear(x)
# [num_seq*seq_len, vocab_size]
return x, hidden

def conv_layers(self, x):
chosen_list = list()
for conv in self.convolutions:
feature_map = F.tanh(conv(x))
# (batch_size, out_channel, 1, max_word_len-width+1)
chosen = torch.max(feature_map, 3)[0]
# (batch_size, out_channel, 1)
chosen = chosen.squeeze()
# (batch_size, out_channel)
chosen_list.append(chosen)

# (batch_size, total_num_filters)
return torch.cat(chosen_list, 1)
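
To make the character preprocessing concrete, here is what text2vec produces for a toy dictionary (the dictionary values are chosen purely for illustration; importing the module requires torch, since it is imported at module level):

from model.char_language_model import text2vec

char_dict = {"a": 1, "c": 2, "t": 3, "PAD": 0, "BOW": 4, "EOW": 5}
# each word is padded to max_word_len with PAD, then wrapped with BOW/EOW markers,
# so every vector has length max_word_len + 2 (the "+2" seen in data_forward)
print(text2vec(["cat", "at"], char_dict, max_word_len=4))
# [[4, 2, 1, 3, 0, 5], [4, 1, 3, 0, 0, 5]]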

+6 -12 reproduction/CNN-sentence_classification/train.py

@@ -1,17 +1,12 @@
import os
import torch

import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
import dataset as dst
from model import CNN_text
.dataset as dst
from .model import CNN_text
from torch.autograd import Variable


from sklearn import cross_validation
from sklearn import datasets



# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
@@ -51,8 +46,7 @@ if cuda:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)



#train and test
# train and tests
best_acc = None


for epoch in range(num_epochs):


+9 -9 reproduction/Char-aware_NLM/test.py

@@ -1,12 +1,12 @@
import os
from collections import namedtuple

import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from model import charLM
from torch.autograd import Variable
from utilities import *
from collections import namedtuple



def to_var(x):
if torch.cuda.is_available():
@@ -76,18 +76,18 @@ if __name__ == "__main__":




if os.path.exists("cache/data_sets.pt") is False: if os.path.exists("cache/data_sets.pt") is False:
test_text = read_data("./test.txt")
test_text = read_data("./tests.txt")
test_set = np.array(text2vec(test_text, char_dict, max_word_len))


# Labels are next-word index in word_dict with the same length as inputs # Labels are next-word index in word_dict with the same length as inputs
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])


category = {"test": test_set, "tlabel":test_label}
category = {"tests": test_set, "tlabel": test_label}
torch.save(category, "cache/data_sets.pt") torch.save(category, "cache/data_sets.pt")
else: else:
data_sets = torch.load("cache/data_sets.pt") data_sets = torch.load("cache/data_sets.pt")
test_set = data_sets["test"]
test_set = data_sets["tests"]
test_label = data_sets["tlabel"] test_label = data_sets["tlabel"]
train_set = data_sets["tdata"] train_set = data_sets["tdata"]
train_label = data_sets["trlabel"] train_label = data_sets["trlabel"]


+27 -20 reproduction/Char-aware_NLM/train.py

@@ -1,20 +1,16 @@

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from model import charLM
from utilities import *
from collections import namedtuple
from test import test

import numpy as np
import torch.optim as optim

from .model import charLM
from .test import test
from .utilities import *




def preprocess():
word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "tests.txt")
num_words = len(word_dict)
num_char = len(char_dict)
char_dict["BOW"] = num_char+1
@@ -43,7 +39,18 @@ def to_var(x):




def train(net, data, opt):
"""
:param net: the pytorch model
:param data: numpy array
:param opt: named tuple
1. random seed
2. define local input
3. training setting: learning rate, loss, etc.
4. main loop epoch
5. batchify
6. validation
7. save model
"""
torch.manual_seed(1024)


train_input = torch.from_numpy(data.train_input)
@@ -125,9 +132,9 @@ def train(net, data, opt):
##################################################
#################### Training ####################
net.train()
optimizer = optim.SGD(net.parameters(),
lr = learning_rate,
momentum=0.85)


# split the first dim
input_generator = batch_generator(train_input, opt.lstm_batch_size)
@@ -183,8 +190,8 @@ if __name__=="__main__":


if os.path.exists("cache/data_sets.pt") is False: if os.path.exists("cache/data_sets.pt") is False:
train_text = read_data("./train.txt") train_text = read_data("./train.txt")
valid_text = read_data("./valid.txt")
test_text = read_data("./test.txt")
valid_text = read_data("./charlm.txt")
test_text = read_data("./tests.txt")


train_set = np.array(text2vec(train_text, char_dict, max_word_len))
valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
@@ -195,14 +202,14 @@ if __name__=="__main__":
valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])


category = {"tdata":train_set, "vdata":valid_set, "test": test_set,
category = {"tdata": train_set, "vdata": valid_set, "tests": test_set,
"trlabel":train_label, "vlabel":valid_label, "tlabel":test_label} "trlabel":train_label, "vlabel":valid_label, "tlabel":test_label}
torch.save(category, "cache/data_sets.pt") torch.save(category, "cache/data_sets.pt")
else: else:
data_sets = torch.load("cache/data_sets.pt") data_sets = torch.load("cache/data_sets.pt")
train_set = data_sets["tdata"] train_set = data_sets["tdata"]
valid_set = data_sets["vdata"] valid_set = data_sets["vdata"]
test_set = data_sets["test"]
test_set = data_sets["tests"]
train_label = data_sets["trlabel"] train_label = data_sets["trlabel"]
valid_label = data_sets["vlabel"] valid_label = data_sets["vlabel"]
test_label = data_sets["tlabel"] test_label = data_sets["tlabel"]


+14 -0 saver/base_saver.py

@@ -0,0 +1,14 @@
class BaseSaver(object):
"""base class for all savers"""

def __init__(self, save_path):
self.save_path = save_path

def save_bytes(self):
raise NotImplementedError

def save_str(self):
raise NotImplementedError

def compress(self):
raise NotImplementedError

+0 -0 saver/empty.txt


+12 -0 saver/logger.py

@@ -0,0 +1,12 @@
from saver.base_saver import BaseSaver


class Logger(BaseSaver):
"""Logging"""

def __init__(self, save_path):
super(Logger, self).__init__(save_path)

def log(self, string):
with open(self.save_path, "a") as f:
f.write(string)

+8 -0 saver/model_saver.py

@@ -0,0 +1,8 @@
from saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
"""Save a model"""

def __init__(self, save_path):
super(ModelSaver, self).__init__(save_path)

+3370 -0 tests/data_for_tests/charlm.txt
File diff suppressed because it is too large


+31 -0 tests/test_charlm.py

@@ -0,0 +1,31 @@
from action.tester import Tester
from action.trainer import Trainer
from loader.base_loader import ToyLoader0
from model.char_language_model import CharLM


def test_charlm():
train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True,
log_per_step=10, log_validation=True, batch_size=160)
trainer = Trainer(train_config)

model = CharLM(lstm_batch_size=16, lstm_seq_len=10)

train_data = ToyLoader0("load_train", "./data_for_tests/charlm.txt").load()
valid_data = ToyLoader0("load_valid", "./data_for_tests/charlm.txt").load()

trainer.train(model, train_data, valid_data)

trainer.save_model(model)

test_config = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=160)
tester = Tester(test_config)

test_data = ToyLoader0("load_test", "./data_for_tests/charlm.txt").load()

tester.test(model, test_data)


if __name__ == "__main__":
test_charlm()

test/test_loader.py → tests/test_loader.py


+21 -0 tests/test_trainer.py

@@ -0,0 +1,21 @@
from collections import namedtuple

import numpy as np

from action.trainer import Trainer
from model.base_model import ToyModel


def test_trainer():
Config = namedtuple("config", ["epochs", "validate", "save_when_better"])
train_config = Config(epochs=5, validate=True, save_when_better=True)
trainer = Trainer(train_config)

net = ToyModel()
data = np.random.rand(20, 6)
dev_data = np.random.rand(20, 6)
trainer.train(net, data, dev_data)


if __name__ == "__main__":
test_trainer()
