|
|
@@ -0,0 +1,342 @@
import os
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel
|
|
|
|
|
|
|
|
|
|
|
class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model
    """

    def __init__(self):
        super(CharLM, self).__init__()
        """
        Settings
        """
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = 700
        self.lstm_seq_len = 35
        self.lstm_batch_size = 20
        self.vocab_size = 100
        self.num_char = 150
        self.learning_rate = 0.1  # assumed default learning rate for SGD

        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None  # latest loss value, kept private so it does not shadow loss()
        self.use_gpu = False

        # word_embed_dim == hidden_size (the number of LSTM hidden units)
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)

        # The optimizer is built after self.model so that the model's parameters are registered.
        self.optimizer = optim.SGD(self.parameters(),
                                   lr=self.learning_rate,
                                   momentum=0.85)
|
|
|
|
|
|
|
    def prepare_input(self, raw_text):
        """
        Do some preparation jobs. Transform raw data into input vectors.
        """
        if not self.data_ready:
            # To do: These need to be dropped out from here. (below)
            if os.path.exists("cache/prep.pt") is False:
                self.preprocess()
            objects = torch.load("cache/prep.pt")
            word_dict = objects["word_dict"]
            char_dict = objects["char_dict"]
            max_word_len = objects["max_word_len"]
            self.data_ready = True
        print("word/char dictionary built. Start making inputs.")

        if os.path.exists("cache/data_sets.pt") is False:
            train_text = read_data("./train.txt")
            valid_text = read_data("./valid.txt")
            test_text = read_data("./tests.txt")

            # To do: These need to be dropped out from here. (above)

            input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))

            # Labels are the next-word indices in word_dict, with the same length as the inputs
            input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])

            category = {"features": input_vec, "label": input_label}
            torch.save(category, "cache/data_sets.pt")
        else:
            data_sets = torch.load("cache/data_sets.pt")
            input_vec = data_sets["features"]
            input_label = data_sets["label"]

        DataTuple = namedtuple("DataTuple", ["feature", "label"])
        self.data = DataTuple(feature=input_vec, label=input_label)

        return self.data.feature, self.data.label
|
|
|
|
|
|
|
    def mode(self, test=False):
        raise NotImplementedError
|
|
|
|
|
|
|
    def data_forward(self, x):
        # Detach the LSTM hidden state from the last batch so that gradients do not
        # flow across batch boundaries (truncated backpropagation through time).
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output
|
|
|
|
|
|
|
    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()
|
|
|
|
|
|
|
    def loss(self, predict, truth):
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss
|
|
|
|
|
|
|
    @staticmethod
    def preprocess():
        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
        num_char = len(char_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        # dict of (int, string)
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
            "max_word_len": max_word_len
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")
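
    # Note on the index layout produced by preprocess(): get_char_dict assigns
    # characters the indices 1..num_char, "PAD" is 0, and "BOW"/"EOW" take
    # num_char + 1 and num_char + 2, so a character embedding table needs at
    # least num_char + 3 rows (the hard-coded self.num_char setting above is
    # assumed to leave enough headroom for this).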
|
|
|
|
|
|
|
"""
Global Functions
"""


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]
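
# Illustrative usage of batch_generator (a sketch, not part of the original
# pipeline). Note that the trailing num_words % batch_size items are dropped:
#
#     for batch in batch_generator(input_tensor, batch_size=700):
#         ...  # each batch has shape [700, in_channel, height, width]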
|
|
|
|
|
|
|
|
|
|
|
def text2vec(words, char_dict, max_word_len):
    """ Return a list of lists of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec
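
# Illustrative sketch of text2vec with a toy dictionary (hypothetical values,
# not produced by the real preprocessing): with
#     char_dict = {"PAD": 0, "a": 1, "b": 2, "BOW": 3, "EOW": 4}
# and max_word_len = 3,
#     text2vec(["ab"], char_dict, 3)  ->  [[3, 1, 2, 0, 4]]
# i.e. each word is padded to max_word_len and wrapped with BOW/EOW markers,
# giving vectors of length max_word_len + 2.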
|
|
|
|
|
|
|
|
|
|
|
def read_data(file_name):
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    import re
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()
|
|
|
|
|
|
|
|
|
|
|
def get_char_dict(vocabulary):
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict
|
|
|
|
|
|
|
|
|
|
|
def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
|
|
|
|
|
|
|
|
|
|
|
def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)
|
|
|
|
|
|
|
|
|
|
|
class Highway(nn.Module):
    """Highway network"""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
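
# For reference, Highway.forward computes
#     t = sigmoid(W1 x + b1)                       (transform gate)
#     y = t * relu(W2 x + b2) + (1 - t) * x        (gated mix of transform and identity)
# so each layer can interpolate between transforming its input and passing it
# through unchanged, which keeps gradients flowing through the stacked layers.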
|
|
|
|
|
|
|
|
|
|
|
class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: number of unique words
        num_char: number of characters
        use_gpu: True or False
    """
|
|
|
|
|
|
|
    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes, kept in an nn.ModuleList
        # so that their parameters are registered with the module
        self.convolutions = nn.ModuleList()

        # list of tuples: (number of filters, filter width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()
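
    # Shape walk-through of forward() under the controller's settings (assumed,
    # illustrative values): with lstm_batch_size = 20, lstm_seq_len = 35 and
    # max_word_len = 10, the input is a [20, 35, 12] tensor of character indices
    # and the returned logits have shape [700, vocab_size], together with the
    # updated LSTM hidden state.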
|
|
|
|
|
|
|
    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden
|
|
|
|
|
|
|
    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len+2-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
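

# A minimal end-to-end sketch of how the pieces above fit together. This is an
# illustration under assumptions, not the project's official training script:
# it assumes that BaseModel behaves like nn.Module, that a cache/ directory and
# train.txt/valid.txt/tests.txt files exist, and that CharLM's hard-coded
# vocab_size matches the vocabulary actually produced by preprocess().
if __name__ == "__main__":
    controller = CharLM()
    raw_text = read_data("./train.txt")

    # features: [num_words, max_word_len + 2] character indices
    # labels:   [num_words] next-word indices
    features, labels = controller.prepare_input(raw_text)
    features = torch.from_numpy(features).long()
    labels = torch.from_numpy(labels).long()

    words_per_batch = controller.lstm_batch_size * controller.lstm_seq_len
    for x, y in zip(batch_generator(features, words_per_batch),
                    batch_generator(labels, words_per_batch)):
        # reshape a flat run of words into [num_seq, seq_len, max_word_len + 2]
        x = x.view(controller.lstm_batch_size, controller.lstm_seq_len, -1)
        output = controller.data_forward(x)
        batch_loss = controller.loss(output, y)
        controller.grad_backward()
        print("batch loss: {:.4f}".format(batch_loss.data[0]))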