@@ -0,0 +1,21 @@ | |||||
MIT License | |||||
Copyright (c) 2017 | |||||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
of this software and associated documentation files (the "Software"), to deal | |||||
in the Software without restriction, including without limitation the rights | |||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
copies of the Software, and to permit persons to whom the Software is | |||||
furnished to do so, subject to the following conditions: | |||||
The above copyright notice and this permission notice shall be included in all | |||||
copies or substantial portions of the Software. | |||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
SOFTWARE. |
@@ -0,0 +1,40 @@ | |||||
# PyTorch-Character-Aware-Neural-Language-Model | |||||
This is a PyTorch implementation of the character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim.
## Requirements
The code is run and tested with **Python 3.5.2** and **PyTorch 0.3.1**. | |||||
## HyperParameters | |||||
| HyperParam | value | | |||||
| ------ | :-------| | |||||
| LSTM batch size | 20 | | |||||
| LSTM sequence length | 35 | | |||||
| LSTM hidden units | 300 | | |||||
| epochs | 35 | | |||||
| initial learning rate | 1.0 | | |||||
| character embedding dimension | 15 | | |||||
## Demo | |||||
Train the model with split train/valid/test data. | |||||
`python train.py` | |||||
The trained model will be saved in `cache/net.pkl`.
Test the model. | |||||
`python test.py` | |||||
Best result on test set: | |||||
PPL=127.2163
cross entropy loss=4.8459 | |||||
## Acknowledgement | |||||
This implementation borrows ideas from
https://github.com/jarfo/kchar | |||||
https://github.com/cronos123/Character-Aware-Neural-Language-Models | |||||
@@ -0,0 +1,148 @@ | |||||
import torch | |||||
from torch.autograd import Variable | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
class Highway(nn.Module):
    """Highway network layer.

    Computes ``t * relu(fc2(x)) + (1 - t) * x`` where the transform gate is
    ``t = sigmoid(fc1(x))``; the gate decides, per feature, how much of the
    transformed signal replaces the untouched input.

    # Arguments:
        input_size: number of input features (the output has the same size)
    """

    def __init__(self, input_size):
        super(Highway, self).__init__()
        # fc1 produces the gate, fc2 the candidate transformation
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        # torch.sigmoid instead of the deprecated F.sigmoid
        t = torch.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
class charLM(nn.Module):
    """CNN + highway network + LSTM

    # Input:
        x: tensor of character indices with shape
            [num_seq, seq_len, max_word_len+2]; the last dimension must be at
            least as long as the widest convolution filter (6)
        hidden: (h, c) LSTM state, each with shape
            [lstm_num_layers, num_seq, word_emb_dim]
    # Output:
        2D Tensor with shape [num_seq*seq_len, vocab_size], and the new
        LSTM state
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # list of tuples: (the number of filter, width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        # convolutions of filters with different sizes.
        # They are held in an nn.ModuleList rather than a plain list so that
        # they are registered as sub-modules: otherwise their weights are
        # absent from parameters() / state_dict() and are never trained,
        # initialised or moved together with the rest of the model.
        self.convolutions = nn.ModuleList([
            nn.Conv2d(
                1,            # in_channel
                out_channel,  # out_channel
                kernel_size=(char_emb_dim, filter_width),  # (height, width)
                bias=True
            )
            for out_channel, filter_width in self.filter_num_width
        ])

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            # every layer is a registered sub-module, so one call moves them all
            self.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]  (the LSTM is batch_first)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size*lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        """Apply every filter bank and max-pool each feature map over time."""
        chosen_list = list()
        for conv in self.convolutions:
            # torch.tanh instead of the deprecated F.tanh
            feature_map = torch.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            # Squeeze only the height axis: a bare squeeze() would also drop
            # the batch axis when the batch holds a single word.
            chosen = chosen.squeeze(2)
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filers)
        return torch.cat(chosen_list, 1)
@@ -0,0 +1,123 @@ | |||||
import os | |||||
import torch | |||||
from torch.autograd import Variable | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
import numpy as np | |||||
from model import charLM | |||||
from utilities import * | |||||
from collections import namedtuple | |||||
def to_var(x):
    """Wrap ``x`` in a Variable, moving it to the GPU first when CUDA is available."""
    on_device = x.cuda() if torch.cuda.is_available() else x
    return Variable(on_device)
def test(net, data, opt):
    """Evaluate ``net`` on the test split and print the mean loss and perplexity.

    # Arguments:
        net: model called as ``net(batch, hidden)`` and returning
            ``(output, hidden)``
        data: object exposing ``test_input`` and ``test_label`` numpy arrays
        opt: object exposing ``lstm_seq_len``, ``lstm_batch_size``,
            ``max_word_len`` and ``word_embed_dim``
    # Raises:
        ValueError: if the test split does not fill a single batch
    """
    net.eval()

    test_input = torch.from_numpy(data.test_input)
    test_label = torch.from_numpy(data.test_label)

    # Drop the tail that does not fill a whole sequence, then fold the words
    # into [num_seq, seq_len, max_word_len+2]
    num_seq = test_input.size()[0] // opt.lstm_seq_len
    test_input = test_input[:num_seq*opt.lstm_seq_len, :]
    test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    criterion = nn.CrossEntropyLoss()

    iterations = test_input.size()[0] // opt.lstm_batch_size
    if iterations == 0:
        raise ValueError("test set is too small to form a single batch")

    test_generator = batch_generator(test_input, opt.lstm_batch_size)
    label_generator = batch_generator(test_label, opt.lstm_batch_size*opt.lstm_seq_len)

    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    add_loss = 0.0
    for t in range(iterations):
        batch_input = next(test_generator)
        batch_label = next(label_generator)

        # carry the LSTM state across batches without keeping the old graph
        hidden = [state.detach() for state in hidden]
        test_output, hidden = net(to_var(batch_input), hidden)

        # Accumulate a plain float: numpy cannot operate on (CUDA) tensors
        add_loss += float(criterion(test_output, to_var(batch_label)))

    mean_loss = add_loss / iterations
    print("Test Loss={0:.4f}".format(mean_loss))
    print("Test PPL={0:.4f}".format(float(np.exp(mean_loss))))
############################################################# | |||||
if __name__ == "__main__":

    word_embed_dim = 300

    # The dictionaries are produced by train.py; without them nothing can be
    # evaluated, so stop here instead of failing inside torch.load.
    if not os.path.exists("cache/prep.pt"):
        raise SystemExit("Cannot find prep.pt")

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    max_word_len = objects["max_word_len"]

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt"):
        data_sets = torch.load("cache/data_sets.pt")
        test_set = data_sets["test"]
        test_label = data_sets["tlabel"]
        # The training arrays are not needed for evaluation; tolerate a cache
        # that does not contain them.
        train_set = data_sets.get("tdata")
        train_label = data_sets.get("trlabel")
    else:
        test_text = read_data("./test.txt")
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word index in word_dict with the same length as inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        train_set = None
        train_label = None
        # Deliberately not written to cache/data_sets.pt: train.py reads the
        # train/valid arrays from that file, and a test-only cache would make
        # it fail with a KeyError.

    DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label ")
    data = DataTuple(test_input=test_set,
                     test_label=test_label,
                     train_label=train_label,
                     train_input=train_set)

    print("Loaded data sets. Start building network.")

    lstm_seq_len = 35
    lstm_batch_size = 20

    # NOTE: torch.load unpickles arbitrary objects; only load checkpoints you trust.
    if torch.cuda.is_available():
        net = torch.load("cache/net.pkl")
    else:
        # remap storages saved from a GPU onto the CPU
        net = torch.load("cache/net.pkl", map_location=lambda storage, loc: storage)

    Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len",
                                     "max_word_len", "lstm_batch_size", "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start testing.")

    test(net, data, opt)
@@ -0,0 +1,268 @@ | |||||
import torch | |||||
from torch.autograd import Variable | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
import torch.optim as optim | |||||
import numpy as np | |||||
import os | |||||
from model import charLM | |||||
from utilities import * | |||||
from collections import namedtuple | |||||
from test import test | |||||
def preprocess():
    """Build the word/char dictionaries from the corpus and cache them.

    Reads valid.txt, train.txt and test.txt from the working directory and
    writes ``cache/prep.pt`` containing ``word_dict``, ``char_dict``,
    ``reverse_word_dict`` and ``max_word_len``.
    """
    word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
    num_char = len(char_dict)

    # Special symbols are appended after the characters found in the corpus
    char_dict["BOW"] = num_char+1
    char_dict["EOW"] = num_char+2
    char_dict["PAD"] = 0

    # dict of (int, string)
    reverse_word_dict = {value: key for key, value in word_dict.items()}
    max_word_len = max([len(word) for word in word_dict])

    objects = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "reverse_word_dict": reverse_word_dict,
        "max_word_len": max_word_len
    }

    # torch.save does not create missing directories
    os.makedirs("cache", exist_ok=True)
    torch.save(objects, "cache/prep.pt")
    print("Preprocess done.")
def to_var(x):
    """Return ``x`` as a Variable, placed on the GPU whenever CUDA is available."""
    if not torch.cuda.is_available():
        return Variable(x)
    return Variable(x.cuda())
def train(net, data, opt):
    """Train ``net`` with SGD, running a validation pass before every epoch.

    The model with the lowest validation perplexity is written to
    ``cache/model.pt`` (state dict) and ``cache/net.pkl`` (whole module).
    The learning rate is halved whenever validation perplexity improves by
    at most 1.0 compared with the previous epoch.

    # Arguments:
        net: model called as ``net(batch, hidden)`` and returning
            ``(output, hidden)``
        data: object exposing ``train_input``, ``train_label``,
            ``valid_input`` and ``valid_label`` numpy arrays
        opt: object exposing ``lstm_seq_len``, ``lstm_batch_size``,
            ``max_word_len``, ``word_embed_dim``, ``epochs`` and ``init_lr``
    """
    torch.manual_seed(1024)

    train_input = torch.from_numpy(data.train_input)
    train_label = torch.from_numpy(data.train_label)
    valid_input = torch.from_numpy(data.valid_input)
    valid_label = torch.from_numpy(data.valid_label)

    # [num_seq, seq_len, max_word_len+2]
    num_seq = train_input.size()[0] // opt.lstm_seq_len
    train_input = train_input[:num_seq*opt.lstm_seq_len, :]
    train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_seq = valid_input.size()[0] // opt.lstm_seq_len
    valid_input = valid_input[:num_seq*opt.lstm_seq_len, :]
    valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size
    valid_iterations = valid_input.size()[0] // opt.lstm_batch_size

    learning_rate = opt.init_lr
    old_PPL = 100000
    best_PPL = 100000

    # Log-SoftMax
    criterion = nn.CrossEntropyLoss()

    # clip_grad_norm_ supersedes the deprecated clip_grad_norm; fall back to
    # the old name on releases that do not have it yet.
    clip_gradients = (getattr(torch.nn.utils, "clip_grad_norm_", None)
                      or torch.nn.utils.clip_grad_norm)

    # word_emb_dim == hidden_size / num of hidden units
    # The same LSTM state is threaded through validation and training.
    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    for epoch in range(opt.epochs):

        ################  Validation  ####################
        net.eval()
        loss_batch = []

        valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
        vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(valid_iterations):
            batch_input = next(valid_generator)
            batch_label = next(vlabel_generator)

            hidden = [state.detach() for state in hidden]
            valid_output, hidden = net(to_var(batch_input), hidden)

            # [num_sample-1, len(word_dict)] vs [num_sample-1]
            valid_loss = criterion(valid_output, to_var(batch_label))
            loss_batch.append(float(valid_loss))

        mean_loss = np.mean(loss_batch)
        # Perplexity is exp(mean loss), the same definition test() uses.
        # Averaging per-batch exp(loss) values instead overestimates it.
        PPL = float(np.exp(mean_loss))
        print("[epoch {}] valid PPL={}".format(epoch, PPL))
        print("valid loss={}".format(mean_loss))
        print("PPL decrease={}".format(float(old_PPL - PPL)))

        # Preserve the best model
        if best_PPL > PPL:
            best_PPL = PPL
            torch.save(net.state_dict(), "cache/model.pt")
            torch.save(net, "cache/net.pkl")

        # Adjust the learning rate
        if float(old_PPL - PPL) <= 1.0:
            learning_rate /= 2
            print("halved lr:{}".format(learning_rate))

        old_PPL = PPL

        ##################################################
        #################### Training ####################
        net.train()
        # Rebuilt every epoch so that it picks up the current learning rate
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              momentum=0.85)

        # split the first dim
        input_generator = batch_generator(train_input, opt.lstm_batch_size)
        label_generator = batch_generator(train_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(num_iter_per_epoch):
            batch_input = next(input_generator)
            batch_label = next(label_generator)

            # detach hidden state of LSTM from last batch
            hidden = [state.detach() for state in hidden]

            output, hidden = net(to_var(batch_input), hidden)
            # [num_word, vocab_size]

            loss = criterion(output, to_var(batch_label))

            net.zero_grad()
            loss.backward()
            clip_gradients(net.parameters(), 5, norm_type=2)
            optimizer.step()

            if (t+1) % 100 == 0:
                # convert to a float first: numpy cannot operate on (CUDA) tensors
                loss_value = float(loss.data)
                print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch+1,
                    t+1, loss_value, float(np.exp(loss_value))))

    # NOTE(review): this replaces the best-validation weights stored in
    # cache/model.pt above with the weights of the final epoch - confirm
    # that this is intended.
    torch.save(net.state_dict(), "cache/model.pt")
    print("Training finished.")
################################################################ | |||||
if __name__=="__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    # Every artefact is written below cache/; make sure it exists on a first run
    os.makedirs("cache", exist_ok=True)

    if not os.path.exists("cache/prep.pt"):
        preprocess()

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if not os.path.exists("cache/data_sets.pt"):
        train_text = read_data("./train.txt")
        valid_text = read_data("./valid.txt")
        test_text = read_data("./test.txt")

        train_set = np.array(text2vec(train_text, char_dict, max_word_len))
        valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word index in word_dict with the same length as inputs
        train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
        valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"tdata": train_set, "vdata": valid_set, "test": test_set,
                    "trlabel": train_label, "vlabel": valid_label, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        train_set = data_sets["tdata"]
        valid_set = data_sets["vdata"]
        test_set = data_sets["test"]
        train_label = data_sets["trlabel"]
        valid_label = data_sets["vlabel"]
        test_label = data_sets["tlabel"]

    DataTuple = namedtuple("DataTuple",
                           "train_input train_label valid_input valid_label test_input test_label")
    data = DataTuple(train_input=train_set,
                     train_label=train_label,
                     valid_input=valid_set,
                     valid_label=valid_label,
                     test_input=test_set,
                     test_label=test_label)

    print("Loaded data sets. Start building network.")

    # Only place the model on the GPU when one exists; to_var() applies the
    # same check to the inputs, so model and data always share a device.
    USE_GPU = torch.cuda.is_available()
    lstm_seq_len = 35
    lstm_batch_size = 20
    # cnn_batch_size == lstm_seq_len * lstm_batch_size

    net = charLM(char_embedding_dim,
                 word_embed_dim,
                 num_words,
                 len(char_dict),
                 use_gpu=USE_GPU)

    for param in net.parameters():
        # in-place tensor method; nn.init.uniform is deprecated
        param.data.uniform_(-0.05, 0.05)

    Options = namedtuple("Options", [
        "cnn_batch_size", "init_lr", "lstm_seq_len",
        "max_word_len", "lstm_batch_size", "epochs",
        "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  init_lr=1.0,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  epochs=35,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start training.")

    # You can stop training anytime by "ctrl+C"
    try:
        train(net, data, opt)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # NOTE(review): this stores the network as it is at the end of training,
    # replacing the best-validation copy that train() wrote to the same path.
    torch.save(net, "cache/net.pkl")
    print("save net")

    test(net, data, opt)
@@ -0,0 +1,86 @@ | |||||
import torch | |||||
from torch.autograd import Variable | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
def batch_generator(x, batch_size):
    """Yield consecutive slices of ``x`` along its first dimension.

    Each slice holds ``batch_size`` rows; a trailing remainder that does not
    fill a whole batch is never yielded.
    """
    full_batches = x.size()[0] // batch_size
    start = 0
    for _ in range(full_batches):
        stop = start + batch_size
        yield x[start:stop]
        start = stop
def text2vec(words, char_dict, max_word_len):
    """Encode each word as a list of character ids.

    Every word becomes ``[BOW, ids..., PAD..., EOW]``: words shorter than
    ``max_word_len`` are right-padded with the "PAD" id before the "EOW"
    marker is appended. Returns a list of lists of int.
    """
    encoded = []
    for word in words:
        char_ids = list(map(char_dict.__getitem__, word))
        missing = max_word_len - len(char_ids)
        if missing > 0:
            char_ids.extend(char_dict["PAD"] for _ in range(missing))
        encoded.append([char_dict["BOW"], *char_ids, char_dict["EOW"]])
    return encoded
def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """Convert the input strings into a stacked tensor of character embeddings.

    Each word is looked up with ``char_embedding_lookup``, framed by a start
    and an end column of ones, right-padded, and the per-word tensors are
    concatenated along a new leading dimension.

    NOTE(review): ``char_embedding_lookup`` is not defined in the visible
    part of this module - confirm where it comes from before using this
    function.
    """
    # input_words == list of string
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    #   (presumably the lookup returns [char_embedding_dim, len(word)] - TODO confirm)
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []

    # both markers are a single column of ones with char_embedding_dim rows
    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert string to word embedding
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns so every word ends up max_word_len+2 columns wide
        word_encoding = F.pad(word_encoding, (0, max_word_len-word_encoding.size()[1]+2)).data
        # create dimension
        word_encoding = word_encoding.unsqueeze(0)

        tensor_list.append(word_encoding)

    # stack the per-word tensors along the new first dimension
    return torch.cat(tensor_list, 0)
def read_data(file_name):
    """Read a corpus file and return its lower-cased, whitespace-split tokens.

    The ``<unk>`` marker is rewritten to the plain token ``unk``.
    Return: list of strings
    """
    with open(file_name, 'r') as handle:
        lowered = handle.read().lower()
    return lowered.replace("<unk>", "unk").split()
def get_char_dict(vocabulary):
    """Number every distinct character of the vocabulary, starting from 1.

    vocabulary == dict of (word, int); only its words are iterated.
    Return: dict of (char, int), ids assigned in order of first appearance.
    """
    table = {}
    for word in vocabulary:
        for ch in word:
            # an unseen character receives the next free id
            table.setdefault(ch, len(table) + 1)
    return table
def create_word_char_dict(*file_name):
    """Build the word and character dictionaries from one or more corpus files.

    # Arguments:
        *file_name: paths of the text files to read with ``read_data``
    # Returns:
        (word_dict, char_dict): ``word_dict`` maps every distinct word to an
        index, ``char_dict`` is the result of ``get_char_dict`` for those words
    """
    text = []
    for path in file_name:
        text += read_data(path)

    # Sort the vocabulary so the ids are reproducible: iterating a bare set
    # of strings follows hash order, which changes between interpreter runs
    # and would make cached data sets and models disagree with a rebuilt
    # dictionary.
    vocabulary = sorted(set(text))
    word_dict = {word: ix for ix, word in enumerate(vocabulary)}
    char_dict = get_char_dict(vocabulary)
    return word_dict, char_dict