
Merge pull request #2 from FengZiYjun/master

character-aware neural language model
tags/v0.1.0
Coet (GitHub) · 6 years ago
parent
commit
cfc47392e8
9 changed files with 49885 additions and 0 deletions
  1. Char-aware_NLM/LICENSE (+21 -0)
  2. Char-aware_NLM/README.md (+40 -0)
  3. Char-aware_NLM/model.py (+148 -0)
  4. Char-aware_NLM/test.py (+123 -0)
  5. Char-aware_NLM/test.txt (+3761 -0)
  6. Char-aware_NLM/train.py (+268 -0)
  7. Char-aware_NLM/train.txt (+42068 -0)
  8. Char-aware_NLM/utilities.py (+86 -0)
  9. Char-aware_NLM/valid.txt (+3370 -0)

Char-aware_NLM/LICENSE (+21 -0)

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Char-aware_NLM/README.md (+40 -0)

@@ -0,0 +1,40 @@
# PyTorch-Character-Aware-Neural-Language-Model
This is a PyTorch implementation of the character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim et al.
## Requirements
The code was run and tested with **Python 3.5.2** and **PyTorch 0.3.1**.
## Hyperparameters
| Hyperparameter | Value |
| ------ | :-------|
| LSTM batch size | 20 |
| LSTM sequence length | 35 |
| LSTM hidden units | 300 |
| epochs | 35 |
| initial learning rate | 1.0 |
| character embedding dimension | 15 |
## Demo
Train the model with split train/valid/test data.
`python train.py`
The trained model will be saved in `cache/net.pkl`.
Test the model.
`python test.py`
Best result on test set:
PPL=127.2163
cross-entropy loss=4.8459
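
A minimal sketch of reloading the cached artifacts that `train.py` writes, assuming `cache/prep.pt` and `cache/net.pkl` already exist (see `test.py` for the full evaluation loop):

```python
import torch

# Assumes train.py has already written the cache/ files.
prep = torch.load("cache/prep.pt")   # word/char dictionaries and max_word_len
net = torch.load("cache/net.pkl")    # best charLM model saved during training

print("vocabulary size:", len(prep["word_dict"]))
net.eval()                           # disable dropout before measuring perplexity
```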
## Acknowledgement
This implementation borrowed ideas from
https://github.com/jarfo/kchar
https://github.com/cronos123/Character-Aware-Neural-Language-Models

Char-aware_NLM/model.py (+148 -0)

@@ -0,0 +1,148 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    """Highway network"""
    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """
    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes
        self.convolutions = []

        # list of tuples: (number of filters, filter width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
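
For a quick shape check, a minimal sketch of one forward pass through `charLM` with random character indices. The toy vocabulary sizes, `max_word_len`, and `use_gpu=False` are illustrative assumptions; only `char_emb_dim=15` and `word_emb_dim=300` follow the README:

```python
# Illustrative forward pass; the sizes below are made-up toy values.
import torch
from torch.autograd import Variable
from model import charLM

num_char, vocab_size = 60, 1000          # toy character/word vocabulary sizes
char_emb_dim, word_emb_dim = 15, 300     # matches the README hyperparameters
batch, seq_len, max_word_len = 20, 35, 19

net = charLM(char_emb_dim, word_emb_dim, vocab_size, num_char, use_gpu=False)

# character indices: [num_seq, seq_len, max_word_len + 2] (the +2 is BOW/EOW)
x = Variable(torch.LongTensor(batch, seq_len, max_word_len + 2).random_(0, num_char))
hidden = (Variable(torch.zeros(2, batch, word_emb_dim)),
          Variable(torch.zeros(2, batch, word_emb_dim)))

out, hidden = net(x, hidden)
print(out.size())   # expected: [batch * seq_len, vocab_size]
```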

Char-aware_NLM/test.py (+123 -0)

@@ -0,0 +1,123 @@
import os
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from model import charLM
from utilities import *
from collections import namedtuple


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def test(net, data, opt):
    net.eval()

    test_input = torch.from_numpy(data.test_input)
    test_label = torch.from_numpy(data.test_label)

    num_seq = test_input.size()[0] // opt.lstm_seq_len
    test_input = test_input[:num_seq * opt.lstm_seq_len, :]
    # [num_seq, seq_len, max_word_len+2]
    test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    criterion = nn.CrossEntropyLoss()

    loss_list = []
    num_hits = 0
    total = 0
    iterations = test_input.size()[0] // opt.lstm_batch_size
    test_generator = batch_generator(test_input, opt.lstm_batch_size)
    label_generator = batch_generator(test_label, opt.lstm_batch_size * opt.lstm_seq_len)

    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    add_loss = 0.0
    for t in range(iterations):
        batch_input = test_generator.__next__()
        batch_label = label_generator.__next__()

        net.zero_grad()
        hidden = [state.detach() for state in hidden]
        test_output, hidden = net(to_var(batch_input), hidden)

        test_loss = criterion(test_output, to_var(batch_label)).data
        loss_list.append(test_loss)
        add_loss += test_loss

    print("Test Loss={0:.4f}".format(float(add_loss) / iterations))
    print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations))))


#############################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        print("Cannot find prep.pt")

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:
        test_text = read_data("./test.txt")
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are the next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        test_set = data_sets["test"]
        test_label = data_sets["tlabel"]
        train_set = data_sets["tdata"]
        train_label = data_sets["trlabel"]

    DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label")
    data = DataTuple(test_input=test_set,
                     test_label=test_label, train_label=train_label, train_input=train_set)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20

    net = torch.load("cache/net.pkl")

    Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len",
                                     "max_word_len", "lstm_batch_size", "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start testing.")
    test(net, data, opt)
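
The perplexity printed by `test()` is the exponential of the average cross-entropy loss, which is exactly how the two numbers reported in the README relate:

```python
# Perplexity = exp(mean cross-entropy), as computed at the end of test().
import numpy as np

cross_entropy = 4.8459        # README: best cross-entropy loss on the test set
print(np.exp(cross_entropy))  # ~127.22, matching the reported test PPL of 127.2163
```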

Char-aware_NLM/test.txt (+3761 -0): file diff suppressed because it is too large


Char-aware_NLM/train.py (+268 -0)

@@ -0,0 +1,268 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from model import charLM
from utilities import *
from collections import namedtuple
from test import test


def preprocess():
    word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
    num_words = len(word_dict)
    num_char = len(char_dict)
    char_dict["BOW"] = num_char + 1
    char_dict["EOW"] = num_char + 2
    char_dict["PAD"] = 0

    # dict of (int, string)
    reverse_word_dict = {value: key for key, value in word_dict.items()}
    max_word_len = max([len(word) for word in word_dict])

    objects = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "reverse_word_dict": reverse_word_dict,
        "max_word_len": max_word_len
    }
    torch.save(objects, "cache/prep.pt")
    print("Preprocess done.")


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def train(net, data, opt):
    torch.manual_seed(1024)

    train_input = torch.from_numpy(data.train_input)
    train_label = torch.from_numpy(data.train_label)
    valid_input = torch.from_numpy(data.valid_input)
    valid_label = torch.from_numpy(data.valid_label)

    # [num_seq, seq_len, max_word_len+2]
    num_seq = train_input.size()[0] // opt.lstm_seq_len
    train_input = train_input[:num_seq * opt.lstm_seq_len, :]
    train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    num_seq = valid_input.size()[0] // opt.lstm_seq_len
    valid_input = valid_input[:num_seq * opt.lstm_seq_len, :]
    valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    num_epoch = opt.epochs
    num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size

    learning_rate = opt.init_lr
    old_PPL = 100000
    best_PPL = 100000

    # CrossEntropyLoss combines LogSoftmax and NLLLoss
    criterion = nn.CrossEntropyLoss()

    # word_emb_dim == hidden_size / num of hidden units
    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    for epoch in range(num_epoch):

        ################ Validation ####################
        net.eval()
        loss_batch = []
        PPL_batch = []
        iterations = valid_input.size()[0] // opt.lstm_batch_size

        valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
        vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size * opt.lstm_seq_len)

        for t in range(iterations):
            batch_input = valid_generator.__next__()
            batch_label = vlabel_generator.__next__()

            hidden = [state.detach() for state in hidden]
            valid_output, hidden = net(to_var(batch_input), hidden)

            length = valid_output.size()[0]

            # [num_sample-1, len(word_dict)] vs [num_sample-1]
            valid_loss = criterion(valid_output, to_var(batch_label))

            PPL = torch.exp(valid_loss.data)

            loss_batch.append(float(valid_loss))
            PPL_batch.append(float(PPL))

        PPL = np.mean(PPL_batch)
        print("[epoch {}] valid PPL={}".format(epoch, PPL))
        print("valid loss={}".format(np.mean(loss_batch)))
        print("PPL decrease={}".format(float(old_PPL - PPL)))

        # Preserve the best model
        if best_PPL > PPL:
            best_PPL = PPL
            torch.save(net.state_dict(), "cache/model.pt")
            torch.save(net, "cache/net.pkl")

        # Adjust the learning rate
        if float(old_PPL - PPL) <= 1.0:
            learning_rate /= 2
            print("halved lr:{}".format(learning_rate))

        old_PPL = PPL

        ##################################################
        #################### Training ####################
        net.train()
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              momentum=0.85)

        # split the first dim
        input_generator = batch_generator(train_input, opt.lstm_batch_size)
        label_generator = batch_generator(train_label, opt.lstm_batch_size * opt.lstm_seq_len)

        for t in range(num_iter_per_epoch):
            batch_input = input_generator.__next__()
            batch_label = label_generator.__next__()

            # detach hidden state of LSTM from last batch
            hidden = [state.detach() for state in hidden]

            output, hidden = net(to_var(batch_input), hidden)
            # [num_word, vocab_size]

            loss = criterion(output, to_var(batch_label))

            net.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2)
            optimizer.step()

            if (t + 1) % 100 == 0:
                print("[epoch {} step {}] train loss={}, Perplexity={}".format(
                    epoch + 1, t + 1, float(loss.data), float(np.exp(loss.data))))

    torch.save(net.state_dict(), "cache/model.pt")
    print("Training finished.")


################################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        preprocess()

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:

        train_text = read_data("./train.txt")
        valid_text = read_data("./valid.txt")
        test_text = read_data("./test.txt")

        train_set = np.array(text2vec(train_text, char_dict, max_word_len))
        valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are the next-word indices in word_dict, with the same length as the inputs
        train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
        valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"tdata": train_set, "vdata": valid_set, "test": test_set,
                    "trlabel": train_label, "vlabel": valid_label, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        train_set = data_sets["tdata"]
        valid_set = data_sets["vdata"]
        test_set = data_sets["test"]
        train_label = data_sets["trlabel"]
        valid_label = data_sets["vlabel"]
        test_label = data_sets["tlabel"]

    DataTuple = namedtuple("DataTuple",
                           "train_input train_label valid_input valid_label test_input test_label")
    data = DataTuple(train_input=train_set,
                     train_label=train_label,
                     valid_input=valid_set,
                     valid_label=valid_label,
                     test_input=test_set,
                     test_label=test_label)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20
    # cnn_batch_size == lstm_seq_len * lstm_batch_size

    net = charLM(char_embedding_dim,
                 word_embed_dim,
                 num_words,
                 len(char_dict),
                 use_gpu=USE_GPU)

    for param in net.parameters():
        nn.init.uniform(param.data, -0.05, 0.05)

    Options = namedtuple("Options", [
        "cnn_batch_size", "init_lr", "lstm_seq_len",
        "max_word_len", "lstm_batch_size", "epochs",
        "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size,
                  init_lr=1.0,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  epochs=35,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start training.")

    # You can stop training at any time with Ctrl+C
    try:
        train(net, data, opt)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(net, "cache/net.pkl")
    print("save net")

    test(net, data, opt)
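
The label arrays built above are simply the token stream shifted left by one position, with the final label repeated so the lengths match. A toy example of that convention (the sentence and indices are made up):

```python
# Toy illustration of the next-word label convention used in train.py / test.py.
import numpy as np

text = ["the", "cat", "sat", "."]                    # made-up token stream
word_dict = {"the": 0, "cat": 1, "sat": 2, ".": 3}   # made-up vocabulary

labels = np.array([word_dict[w] for w in text[1:]] + [word_dict[text[-1]]])
print(labels)   # [1 2 3 3]: each position predicts the next word,
                # and the last position just repeats the final token.
```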

Char-aware_NLM/train.txt (+42068 -0): file diff suppressed because it is too large


Char-aware_NLM/utilities.py (+86 -0)

@@ -0,0 +1,86 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
    """ Return list of list of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """ convert the input strings into character embeddings """
    # input_words == list of string
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []

    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert string to word embedding
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns
        word_encoding = F.pad(word_encoding, (0, max_word_len - word_encoding.size()[1] + 2)).data
        # create dimension
        word_encoding = word_encoding.unsqueeze(0)
        tensor_list.append(word_encoding)

    return torch.cat(tensor_list, 0)


def read_data(file_name):
    # Return: list of strings
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    import re
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    # vocabulary == dict of (word, int)
    # Return: dict of (char, int), starting from 1
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
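
Putting these helpers together, a minimal sketch of the preprocessing path from a raw text file to character-index batches (the file name `toy.txt` is a hypothetical stand-in for the repository's train/valid/test files):

```python
# Sketch of the preprocessing path: raw text file -> char-index tensor -> batches.
import torch
from utilities import read_data, create_word_char_dict, text2vec, batch_generator

# "toy.txt" is a hypothetical file; train.py uses train.txt / valid.txt / test.txt.
words = read_data("toy.txt")                              # lowercased whitespace tokens
word_dict, char_dict = create_word_char_dict("toy.txt")

# train.py adds the special character ids before vectorising:
num_char = len(char_dict)
char_dict["BOW"] = num_char + 1
char_dict["EOW"] = num_char + 2
char_dict["PAD"] = 0

max_word_len = max(len(w) for w in word_dict)
vec = torch.LongTensor(text2vec(words, char_dict, max_word_len))
# vec: [num_words, max_word_len + 2]

for batch in batch_generator(vec, batch_size=20):
    print(batch.size())                                   # [20, max_word_len + 2]
    break
```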

Char-aware_NLM/valid.txt (+3370 -0): file diff suppressed because it is too large

