
design intermediate controller between trainer and pytorch model

tags/v0.1.0
FengZiYjun · 6 years ago · commit 6b357bec40
7 changed files with 362 additions and 11 deletions:
  1. .idea/fastNLP.iml (+1, -1)
  2. .idea/misc.xml (+1, -1)
  3. action/trainer.py (+12, -4)
  4. model/__init__.py (+0, -0)
  5. model/base_model.py (+3, -2)
  6. model/char_language_model.py (+342, -0)
  7. reproduction/Char-aware_NLM/train.py (+3, -3)
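The commit slots a controller (a BaseModel subclass) between the generic Trainer and the raw PyTorch nn.Module: the trainer only ever calls prepare_input, mode, data_forward, loss, and grad_backward, while the controller hides tensors, autograd, and the optimizer behind those hooks. A minimal sketch of that contract; ToyController and every size in it are hypothetical illustrations, not code from this commit:

import torch
import torch.nn as nn
import torch.optim as optim


class ToyController:
    """Hypothetical stand-in for a BaseModel subclass: exposes the five
    hooks the Trainer drives (see action/trainer.py below)."""

    def __init__(self):
        self.model = nn.Linear(4, 2)            # the wrapped PyTorch model
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1)
        self._loss = None                       # cached between loss() and grad_backward()

    def prepare_input(self, features, labels):  # raw data -> tensors
        return torch.tensor(features, dtype=torch.float32), torch.tensor(labels)

    def mode(self, test=False):                 # train/eval switch
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):                  # forward pass only
        return self.model(x)

    def loss(self, truth, predict):             # compute and cache the loss
        self._loss = self.criterion(predict, truth)
        return self._loss

    def grad_backward(self):                    # backward + parameter update
        self.optimizer.zero_grad()
        self._loss.backward()
        self.optimizer.step()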

.idea/fastNLP.iml (+1, -1)

@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-   <orderEntry type="inheritedJdk" />
+   <orderEntry type="jdk" jdkName="Remote Python 3.6.5 (ssh://zyfeng@10.141.208.102:22/home/zyfeng/anaconda2/envs/conda_env3/bin/python)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">


.idea/misc.xml (+1, -1)

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (PCA_emb)" project-jdk-type="Python SDK" />
+ <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.6.5 (ssh://zyfeng@10.141.208.102:22/home/zyfeng/anaconda2/envs/conda_env3/bin/python)" project-jdk-type="Python SDK" />
</project>

action/trainer.py (+12, -4)

@@ -19,9 +19,10 @@ class Trainer(Action):
        self.save_when_better = self.train_args.save_when_better

    def train(self, network, data, dev_data):
-       X, Y = network.prepare_input(data)
+       train_x, train_y = network.prepare_input(data.train_set, data.train_label)
+       valid_x, valid_y = network.prepare_input(dev_data.valid_set, dev_data.valid_label)

-       iterations, train_batch_generator = self.batchify(X, Y)
+       iterations, train_batch_generator = self.batchify(train_x, train_y)
        loss_history = list()
        network.mode(test=False)


@@ -33,15 +34,18 @@ class Trainer(Action):


        for step in range(iterations):
            batch_x, batch_y = train_batch_generator.__next__()

            prediction = network.data_forward(batch_x)

            loss = network.loss(batch_y, prediction)
            network.grad_backward()
            loss_history.append(loss)
            self.log(self.make_log(epoch, step, loss))

-       # evaluate over dev set
+       #################### evaluate over dev set ###################
        if self.validate:
-           evaluator.test(network, dev_data)
+           evaluator.test(network, [valid_x, valid_y])
            self.log(self.make_valid_log(epoch, evaluator.loss))
            if evaluator.loss < best_loss:
                best_loss = evaluator.loss
@@ -50,6 +54,10 @@ class Trainer(Action):


        # finish training

+   @staticmethod
+   def prepare_training(network, data):
+       return network.prepare_training(data)
+
    def make_log(self, *args):
        print("logged")




model/__init__.py (+0, -0)


model/base_model.py (+3, -2)

@@ -2,7 +2,7 @@ import numpy as np




class BaseModel(object):
-   """base model for all models"""
+   """PyTorch base model for all models"""

    def __init__(self):
        pass
@@ -17,7 +17,8 @@ class BaseModel(object):
    def mode(self, test=False):
        raise NotImplementedError

-   def data_forward(self, x):
+   def data_forward(self, *x):
+       # required by PyTorch nn
        raise NotImplementedError

    def grad_backward(self):


model/char_language_model.py (+342, -0)

@@ -0,0 +1,342 @@
import os
import re
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel


class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model
    """

    def __init__(self):
        super(CharLM, self).__init__()
        # settings
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = 700
        self.lstm_seq_len = 35
        self.lstm_batch_size = 20
        self.vocab_size = 100
        self.num_char = 150
        self.learning_rate = 0.1  # assumption: the original referenced an undefined `learning_rate`

        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None  # cached by loss(), consumed by grad_backward(); named so it cannot shadow the loss() method
        self.use_gpu = False
        # word_embed_dim == hidden_size / number of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        # built after self.model so that the wrapped model's parameters exist
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def prepare_input(self, raw_text):
        """
        Do some preparation jobs. Transform raw data into input vectors.
        """
        if not self.data_ready:
            # To do: These need to be dropped out from here. (below)
            if not os.path.exists("cache/prep.pt"):
                self.preprocess()
            objects = torch.load("cache/prep.pt")
            word_dict = objects["word_dict"]
            char_dict = objects["char_dict"]
            max_word_len = objects["max_word_len"]
            self.data_ready = True
            print("word/char dictionary built. Start making inputs.")

            if not os.path.exists("cache/data_sets.pt"):
                train_text = read_data("./train.txt")
                valid_text = read_data("./valid.txt")
                test_text = read_data("./tests.txt")

                # To do: These need to be dropped out from here. (above)

                input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))

                # Labels are next-word indices in word_dict, with the same length as the inputs
                input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])

                category = {"features": input_vec, "label": input_label}
                torch.save(category, "cache/data_sets.pt")
            else:
                data_sets = torch.load("cache/data_sets.pt")
                input_vec = data_sets["features"]
                input_label = data_sets["label"]

            # built inside the if-block: on later calls self.data is already populated
            DataTuple = namedtuple("DataTuple", ["feature", "label"])
            self.data = DataTuple(feature=input_vec, label=input_label)

        return self.data.feature, self.data.label

    def mode(self, test=False):
        # assumption: toggle the wrapped model's train/eval state
        # (the trainer calls network.mode(test=False) before training)
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        # detach hidden state of LSTM from last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def loss(self, truth, predict):
        # argument order matches the trainer's call: network.loss(batch_y, prediction)
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss

    @staticmethod
    def preprocess():
        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
        num_char = len(char_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        # dict of (int, string)
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
            "max_word_len": max_word_len
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")


"""
Global Functions
"""


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches along the first dimension
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
    """Return a list of int lists, one per word, each of length max_word_len + 2 (BOW and EOW added)."""
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def read_data(file_name):
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict


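# to_var wraps a tensor in an autograd Variable (the pre-0.4 PyTorch API),
# moving it to the GPU first when one is available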
def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


class Highway(nn.Module):
    """Highway network"""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
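        # t = sigmoid(fc1(x)) is the transform gate: the output mixes the
        # transformed signal relu(fc2(x)) with the untouched input x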
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: number of unique words
        num_char: number of unique characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes;
        # nn.ModuleList registers them as sub-modules, so that parameters()
        # (and hence the optimizer) can see them, unlike a plain Python list
        self.convolutions = nn.ModuleList()

        # list of tuples: (number of filters, filter width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,  # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])
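        # 25 + 50 + 75 + 100 + 125 + 150 = 525 feature maps feed the highway layers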

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
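As a sanity check on the shape annotations in charLM.forward, here is a small standalone run with toy dimensions; every size below is an arbitrary test value, not something fixed by the commit:

import torch
from torch.autograd import Variable
from model.char_language_model import charLM

# toy setup: 3 sequences of 5 words, each word padded to 10 char slots (max_word_len + BOW/EOW)
net = charLM(char_emb_dim=15, word_emb_dim=64, vocab_size=100, num_char=60, use_gpu=False)
x = Variable(torch.LongTensor(3, 5, 10).random_(0, 60))
hidden = (Variable(torch.zeros(2, 3, 64)),   # (num_layers, batch, hidden_size)
          Variable(torch.zeros(2, 3, 64)))

out, hidden = net(x, hidden)
print(out.size())  # expected: [num_seq*seq_len, vocab_size] = (15, 100)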

reproduction/Char-aware_NLM/train.py (+3, -3)

@@ -135,9 +135,9 @@ def train(net, data, opt):
##################################################
#################### Training ####################
net.train()
-optimizer = optim.SGD(net.parameters(),
-                      lr = learning_rate,
-                      momentum=0.85)
+optimizer = optim.SGD(net.parameters(),
+                      lr = learning_rate,
+                      momentum=0.85)

# split the first dim
input_generator = batch_generator(train_input, opt.lstm_batch_size)

