prototype and self-attention model (tag v0.1.0)
@@ -0,0 +1,41 @@ README.md
# Prototype
## Word2Idx.py
A mapping between words and indices.
## embedding.py
Embedding modules.
Contains a simple encapsulation of torch.nn.Embedding.
## encoder.py
Encoder modules.
Contains a simple encapsulation of torch.nn.LSTM.
## aggregation.py
Aggregation modules.
Contains a self-attention module, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130; the equations are summarized below.
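For reference, the attention weights, sentence embedding, and penalization computed there follow the paper's formulation: $A = \mathrm{softmax}(W_{s2}\tanh(W_{s1}H^\top))$, $M = AH$, and $P = \lVert AA^\top - I \rVert_F^2$, where $H$ holds the LSTM hidden states and the penalty encourages the $r$ attention hops to attend to different parts of the sentence.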
## predict.py
Prediction modules.
Contains a two-layer perceptron for classification.
## example.py
An example showing how to combine the above modules into a full model.
Contains a model for sentiment analysis on the Yelp dataset, together with its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.
## prepare.py
A script that uses Word2Idx to build the Yelp train/test sets.
## dataloader.py
A dataloader for the Yelp dataset.
It is an iterable object that returns a zero-padded batch on every iteration.
@@ -0,0 +1,63 @@ Word2Idx.py
import collections
import pickle

class Word2Idx():
    """
    Build a word index according to word frequency.
    If "min_freq" is given, only words with a frequency not less than min_freq are kept.
    If "max_num" is given, at most the max_num most frequent words are kept.
    "words" should be a list [ w_1, w_2, ..., w_i, ..., w_n ] where each w_i is a string representing a word.
    num is the size of the lookup table.
    w2i is a lookup table assigning each word an index.
    i2w is a vector which serves as an inverse mapping of w2i.
    Note that index 0 is the token "<PAD>" for padding
    and index 1 is the token "<UNK>" for unregistered words,
    e.g. i2w[w2i["word"]] == "word"
    """
    def __init__(self):
        self.__w2i = dict()
        self.__i2w = []
        self.num = 0

    def build(self, words, min_freq=0, max_num=None):
        """build a model from words"""
        counter = collections.Counter(words)
        if max_num is not None:
            most_common = counter.most_common(max_num)
        else:
            most_common = counter.most_common()
        # indices 0 and 1 are reserved for <PAD> and <UNK>, so regular words start at 2
        self.__w2i = dict((w[0], i + 2) for i, w in enumerate(most_common) if w[1] >= min_freq)
        self.__w2i["<PAD>"] = 0
        self.__w2i["<UNK>"] = 1
        self.__i2w = ["<PAD>", "<UNK>"] + [w[0] for w in most_common if w[1] >= min_freq]
        self.num = len(self.__i2w)

    def w2i(self, word):
        """word to index"""
        if word in self.__w2i:
            return self.__w2i[word]
        return 1  # unregistered words map to <UNK>

    def i2w(self, idx):
        """index to word"""
        if idx >= self.num:
            raise Exception("out of range\n")
        return self.__i2w[idx]

    def save(self, addr):
        """save the model to a file with address "addr" """
        with open(addr, "wb") as f:
            pickle.dump([self.__i2w, self.__w2i, self.num], f)

    def load(self, addr):
        """load a model from a file with address "addr" """
        with open(addr, "rb") as f:
            self.__i2w, self.__w2i, self.num = pickle.load(f)
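A minimal usage sketch for Word2Idx (the toy corpus below is made up for illustration):

from Word2Idx import Word2Idx

corpus = "the food was great but the service was slow".split()
vocab = Word2Idx()
vocab.build(corpus, min_freq=1)
print(vocab.num)                      # vocabulary size, including <PAD> and <UNK>
print(vocab.w2i("food"))              # index of a registered word
print(vocab.w2i("sushi"))             # 1, i.e. <UNK>, since "sushi" was never seen
print(vocab.i2w(vocab.w2i("food")))   # "food"
vocab.save("vocab.pkl")               # can be restored later via load("vocab.pkl")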
@@ -0,0 +1,40 @@ aggregation.py
import torch
import torch.nn as nn
from torch.autograd import Variable

class Selfattention(nn.Module):
    """
    Self-attention module.
    Args:
        input_size : the size of each input vector
        d_a        : the width of the weight matrix W_s1
        r          : the number of attention hops, i.e. encoded vectors per sentence
    """
    def __init__(self, input_size, d_a, r):
        super(Selfattention, self).__init__()
        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
        self.softmax = nn.Softmax(dim=2)
        self.tanh = nn.Tanh()

    def penalization(self, A):
        """
        compute the penalization term ||A A^T - I||_F^2 for the attention matrix
        """
        if self.W_s1.is_cuda:
            I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
        else:
            I = Variable(torch.eye(A.size(1)), requires_grad=False)
        M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
        M = M.view(M.size(0), -1)
        return torch.sum(M ** 2, dim=1)

    def forward(self, x):
        # x: (batch, seq_len, input_size)
        inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))  # (batch, d_a, seq_len)
        A = self.softmax(torch.matmul(self.W_s2, inter))                      # (batch, r, seq_len)
        out = torch.matmul(A, x)                                              # (batch, r, input_size)
        out = out.view(out.size(0), -1)                                       # (batch, r * input_size)
        penalty = self.penalization(A)
        return out, penalty
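A shape-level sketch of the module in isolation (random input, hypothetical sizes):

import torch
from torch.autograd import Variable
from aggregation import Selfattention

attn = Selfattention(input_size=600, d_a=350, r=10)
x = Variable(torch.randn(4, 25, 600))   # (batch, seq_len, input_size)
out, penalty = attn(x)
print(out.size())       # (4, 6000): the r encoded vectors, concatenated per sample
print(penalty.size())   # (4,): one penalization term per sample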
@@ -0,0 +1,81 @@ dataloader.py
import random
import pickle
import torch
from torch.autograd import Variable

def float_wrapper(x, requires_grad=True, using_cuda=True):
    """
    transform a float-type list into a pytorch Variable
    """
    if using_cuda:
        return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.FloatTensor(x), requires_grad=requires_grad)

def long_wrapper(x, requires_grad=True, using_cuda=True):
    """
    transform a long-type list into a pytorch Variable
    """
    if using_cuda:
        return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.LongTensor(x), requires_grad=requires_grad)

def pad(X, using_cuda):
    """
    zero-pad sequences to the same length, then stack them into one batch
    """
    maxlen = max([x.size(0) for x in X])
    Y = []
    for x in X:
        padlen = maxlen - x.size(0)
        if padlen > 0:
            if using_cuda:
                paddings = Variable(torch.zeros(padlen).long()).cuda()
            else:
                paddings = Variable(torch.zeros(padlen).long())
            Y.append(torch.cat((x, paddings), 0))
        else:
            Y.append(x)
    return torch.stack(Y)

class DataLoader(object):
    """
    Load samples of the form {"sent", "class"} and yield batches of the form {"feature", "class"}.
    Args:
        fdir       : path to the pickled data file
        batch_size : batch size
        shuffle    : if True, shuffle the dataset every epoch
        using_cuda : if True, return tensors on the GPU
    """
    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
        with open(fdir, "rb") as f:
            self.data = pickle.load(f)
        self.batch_size = batch_size
        self.num = len(self.data)
        self.count = 0
        self.iters = self.num // batch_size
        self.shuffle = shuffle
        self.using_cuda = using_cuda

    def __iter__(self):
        return self

    def __next__(self):
        if self.count == self.iters:
            self.count = 0
            if self.shuffle:
                random.shuffle(self.data)
            raise StopIteration()
        batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
        self.count += 1
        X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
        X = pad(X, self.using_cuda)
        y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
        return {"feature": X, "class": y}
@@ -0,0 +1,23 @@ embedding.py
import torch
import torch.nn as nn

class Lookuptable(nn.Module):
    """
    A simple lookup table
    Args:
        nums        : the size of the lookup table
        dims        : the size of each embedding vector
        padding_idx : outputs a zero vector whenever this index is encountered (used for <PAD>)
        sparse      : if True, the gradient w.r.t. the weight matrix will be a sparse tensor;
                      in this case only optim.SGD (CUDA and CPU) and optim.Adagrad (CPU) can be used
    """
    def __init__(self, nums, dims, padding_idx=0, sparse=False):
        super(Lookuptable, self).__init__()
        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)

    def forward(self, x):
        return self.embed(x)

if __name__ == "__main__":
    model = Lookuptable(10, 20)
@@ -0,0 +1,22 @@ encoder.py
import torch
import torch.nn as nn

class Lstm(nn.Module):
    """
    LSTM module
    Args:
        input_size    : input size
        hidden_size   : hidden size
        num_layers    : number of stacked LSTM layers
        dropout       : dropout rate (applied between layers, so it only takes effect when num_layers > 1)
        bidirectional : if True, becomes a bidirectional LSTM and the output size doubles
    """
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(Lstm, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,
                            dropout=dropout, bidirectional=bidirectional)

    def forward(self, x):
        # x: (batch, seq_len, input_size) -> (batch, seq_len, hidden_size * num_directions)
        x, _ = self.lstm(x)
        return x
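A shape-level sketch (hypothetical sizes) showing that the bidirectional encoder doubles the feature dimension:

import torch
from torch.autograd import Variable
from encoder import Lstm

enc = Lstm(input_size=100, hidden_size=300, num_layers=1, dropout=0.5, bidirectional=True)
x = Variable(torch.randn(4, 25, 100))   # (batch, seq_len, input_size)
h = enc(x)
print(h.size())                         # (4, 25, 600): a hidden state for every position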
@@ -0,0 +1,129 @@ example.py
import time

import torch
import torch.nn as nn
import torch.optim as optim

import aggregation
import dataloader
import embedding
import encoder
import predict

WORD_NUM = 357361
WORD_SIZE = 100
HIDDEN_SIZE = 300
D_A = 350
R = 10
MLP_HIDDEN = 2000
CLASSES_NUM = 5

class Net(nn.Module):
    """
    A model for sentiment analysis using an LSTM encoder and self-attention
    """
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

    def forward(self, x):
        x = self.embedding(x)
        x = self.encoder(x)
        x, penalty = self.aggregation(x)
        x = self.predict(x)
        return x, penalty

def train(model_dict=None, using_cuda=True, learning_rate=0.06,
          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """
    training procedure
    Args:
        If model_dict is given (a file address), training continues from the given model.
        Otherwise, a new model is trained from scratch.
        If using_cuda is True, training is conducted on the GPU.
        learning_rate and momentum are passed to the SGD optimizer.
        coef is the coefficient balancing the cross-entropy loss and the penalization term.
        interval is the reporting frequency (in iterations).
        The result is saved as "model_dict_<current time>.dict", which can be used for further training.
    """
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    if model_dict is not None:
        net.load_state_dict(torch.load(model_dict))
    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)
    # statistics
    loss_count = 0
    prepare_time = 0
    run_time = 0
    count = 0
    for epoch in range(epochs):
        print("epoch: %d" % (epoch))
        for i, batch in enumerate(dataset):
            t1 = time.time()
            X = batch["feature"]
            y = batch["class"]
            t2 = time.time()
            y_pred, y_penl = net(X)
            # total loss = cross-entropy + coef * mean penalization term
            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(net.parameters(), 0.5)
            optimizer.step()
            t3 = time.time()
            loss_count += torch.sum(y_penl).data[0]
            prepare_time += (t2 - t1)
            run_time += (t3 - t2)
            p, idx = torch.max(y_pred.data, dim=1)
            count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
            if (i + 1) % interval == 0:
                print("epoch: %d, iters: %d" % (epoch, i + 1))
                print("loss count:" + str(loss_count / (interval * batch_size)))
                print("accuracy:" + str(count / (interval * batch_size)))
                print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
                print("prepare time:" + str(prepare_time))
                print("run time:" + str(run_time))
                prepare_time = 0
                run_time = 0
                loss_count = 0
                count = 0
    string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
    torch.save(net.state_dict(), "model_dict_%s.dict" % (string))

def test(model_dict, using_cuda=True):
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    net.load_state_dict(torch.load(model_dict))
    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
    count = 0
    for i, batch in enumerate(dataset):
        X = batch["feature"]
        y = batch["class"]
        y_pred, _ = net(X)
        p, idx = torch.max(y_pred.data, dim=1)
        count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
    print("accuracy: %f" % (count / dataset.num))

if __name__ == "__main__":
    train(using_cuda=torch.cuda.is_available())
@@ -0,0 +1,25 @@ predict.py
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    """
    A two-layer perceptron for classification.
    Output: unnormalized class scores (logits)
    Args:
        input_size  : the size of the input
        hidden_size : the size of the hidden layer
        output_size : the number of classes
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.L1 = nn.Linear(input_size, hidden_size)
        self.L2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out = self.L2(F.relu(self.L1(x)))
        return out

if __name__ == "__main__":
    MLP(20, 30, 20)
@@ -0,0 +1,50 @@ prepare.py
import pickle

import Word2Idx

def get_sets(m, n):
    """
    get a train set containing m samples and a test set containing n samples
    """
    with open("tuples.pkl", "rb") as f:
        samples = pickle.load(f)
    if m + n > len(samples):
        raise ValueError("asking for too many tuples")
    train_samples = samples[:m]
    test_samples = samples[m:m + n]
    return train_samples, test_samples

def build_wordidx():
    """
    build a word index with Word2Idx from the training samples
    """
    train, test = get_sets(500000, 2000)
    words = []
    for x in train:
        words += x[0]
    wordidx = Word2Idx.Word2Idx()
    wordidx.build(words)
    print(wordidx.num)
    print(wordidx.i2w(0))
    wordidx.save("wordidx.pkl")

def build_sets():
    """
    build the train set and the test set, mapping each word to its index
    """
    train, test = get_sets(500000, 2000)
    wordidx = Word2Idx.Word2Idx()
    wordidx.load("wordidx.pkl")
    train_set = []
    for x in train:
        sent = [wordidx.w2i(w) for w in x[0]]
        train_set.append({"sent": sent, "class": x[1]})
    test_set = []
    for x in test:
        sent = [wordidx.w2i(w) for w in x[0]]
        test_set.append({"sent": sent, "class": x[1]})
    with open("train_set.pkl", "wb") as f:
        pickle.dump(train_set, f)
    with open("test_set.pkl", "wb") as f:
        pickle.dump(test_set, f)

if __name__ == "__main__":
    build_wordidx()
    build_sets()
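prepare.py assumes a pre-built tuples.pkl in which each entry pairs a tokenized review with its label. The real file comes from the Yelp dataset; below is a made-up sketch of the expected structure (assuming 0-indexed star labels, as required by CrossEntropyLoss in example.py):

import pickle

samples = [
    (["the", "food", "was", "great"], 4),   # tokenized review, class label in [0, 4]
    (["slow", "service"], 0),
]
with open("tuples.pkl", "wb") as f:
    pickle.dump(samples, f)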