@@ -0,0 +1,62 @@
import collections
import pickle


class Word2Idx:
    """
    Build a word index according to word frequency.
    If "min_freq" is given, only words with a frequency not less than min_freq are kept.
    If "max_num" is given, at most the max_num most frequent words are kept.
    "words" should be a list [w_1, w_2, ..., w_i, ..., w_n] where each w_i is a string representing a word.
    num is the size of the lookup table.
    w2i is a lookup table assigning each word an index.
    Note that index 0 is returned for any unregistered word.
    i2w is a vector which serves as the inverse mapping of w2i.
    The token "<UNK>" is returned for index 0,
    e.g. i2w[w2i["word"]] == "word"
    """
    def __init__(self):
        self.__w2i = dict()
        self.__i2w = []
        self.num = 0

    def build(self, words, min_freq=0, max_num=None):
        """build a model from words"""
        counter = collections.Counter(words)
        word_set = set(words)
        if max_num is not None:
            # reserve one slot for "<UNK>" so the table holds at most max_num entries
            most_common = counter.most_common(min(len(word_set), max_num - 1))
        else:
            most_common = counter.most_common()
        self.__w2i = dict((w[0], i + 1) for i, w in enumerate(most_common) if w[1] >= min_freq)
        self.__w2i["<UNK>"] = 0
        self.__i2w = ["<UNK>"] + [w[0] for w in most_common if w[1] >= min_freq]
        self.num = len(self.__i2w)

    def w2i(self, word):
        """word to index"""
        if word in self.__w2i:
            return self.__w2i[word]
        return 0

    def i2w(self, idx):
        """index to word"""
        if idx >= self.num:
            raise IndexError("index out of range")
        return self.__i2w[idx]

    def save(self, addr):
        """save the model to a file at path "addr" """
        with open(addr, "wb") as f:
            pickle.dump([self.__i2w, self.__w2i, self.num], f)

    def load(self, addr):
        """load a model from a file at path "addr" """
        with open(addr, "rb") as f:
            self.__i2w, self.__w2i, self.num = pickle.load(f)
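

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, toy corpus made up for this demo):
    # build an index, look a word up both ways, and check the <UNK> fallback.
    corpus = ["the", "cat", "sat", "on", "the", "mat"]
    vocab = Word2Idx()
    vocab.build(corpus, min_freq=1)
    assert vocab.i2w(vocab.w2i("the")) == "the"
    assert vocab.w2i("unseen-word") == 0  # unregistered words map to <UNK>
    print(vocab.num)  # expected: 6 (five distinct words plus "<UNK>")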
@@ -0,0 +1,41 @@
import torch
import torch.nn as nn
from torch.autograd import Variable


class Selfattention(nn.Module):
    """
    Self-attention module.
    Args:
        input_size : the size of each input vector
        d_a        : the width of the weight matrix W_s1
        r          : the number of encoded vectors (attention hops)
    """
    def __init__(self, input_size, d_a, r):
        super(Selfattention, self).__init__()
        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
        self.softmax = nn.Softmax(dim=2)
        self.tanh = nn.Tanh()

    def penalization(self, A):
        """
        compute the penalization term ||A A^T - I||_F^2 for the attention matrix
        """
        if self.W_s1.is_cuda:
            I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
        else:
            I = Variable(torch.eye(A.size(1)), requires_grad=False)
        M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
        M = M.view(M.size(0), -1)
        return torch.sum(M ** 2, dim=1)

    def forward(self, x):
        # x: (batch, seq_len, input_size)
        inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
        A = self.softmax(torch.matmul(self.W_s2, inter))  # (batch, r, seq_len)
        out = torch.matmul(A, x)                          # (batch, r, input_size)
        out = out.view(out.size(0), -1)
        penalty = self.penalization(A)
        return out, penalty


if __name__ == "__main__":
    model = Selfattention(100, 10, 20)
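    # Illustrative forward pass on random data (shapes assumed, not from the repo):
    # a batch of 4 sequences, 15 steps each, 100-dimensional features.
    x = Variable(torch.randn(4, 15, 100))
    out, penalty = model(x)
    print(out.size())      # expected: (4, 20 * 100) = (4, 2000)
    print(penalty.size())  # expected: (4,)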
@@ -0,0 +1,82 @@
import random
import pickle
import torch
import numpy as np
from torch.autograd import Variable


def float_wrapper(x, requires_grad=True, using_cuda=True):
    """
    transform a list of floats into a pytorch Variable
    """
    if using_cuda:
        return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.FloatTensor(x), requires_grad=requires_grad)


def long_wrapper(x, requires_grad=False, using_cuda=True):
    """
    transform a list of ints into a pytorch Variable
    (index tensors do not need gradients, so requires_grad defaults to False)
    """
    if using_cuda:
        return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
    else:
        return Variable(torch.LongTensor(x), requires_grad=requires_grad)


def pad(X, using_cuda):
    """
    zero-pad sequences to the same length, then stack them into one tensor
    """
    maxlen = max([x.size(0) for x in X])
    Y = []
    for x in X:
        padlen = maxlen - x.size(0)
        if padlen > 0:
            # pad with index 0, matching the default padding_idx of the lookup table
            paddings = long_wrapper([0] * padlen, requires_grad=False, using_cuda=using_cuda)
            Y.append(torch.cat([x, paddings]))
        else:
            Y.append(x)
    return torch.stack(Y)


class DataLoader(object):
    """
    Load data and yield batches of the form {"feature", "class"}.
    Each sample in the pickled file is a dict with keys "sent" (a list of word
    indices) and "class" (an integer label).
    Args:
        fdir       : path to the pickled data file
        batch_size : batch size
        shuffle    : if True, shuffle the dataset every epoch
        using_cuda : if True, return tensors on GPU
    """
    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
        with open(fdir, "rb") as f:
            self.data = pickle.load(f)
        self.batch_size = batch_size
        self.num = len(self.data)
        self.count = 0
        self.iters = self.num // batch_size
        self.shuffle = shuffle
        self.using_cuda = using_cuda

    def __iter__(self):
        return self

    def __next__(self):
        if self.count == self.iters:
            self.count = 0
            if self.shuffle:
                random.shuffle(self.data)
            raise StopIteration()
        batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
        self.count += 1
        X = [long_wrapper(sample["sent"], using_cuda=self.using_cuda) for sample in batch]
        X = pad(X, self.using_cuda)
        y = long_wrapper([sample["class"] for sample in batch], using_cuda=self.using_cuda)
        return {"feature": X, "class": y}
@@ -0,0 +1,23 @@
import torch
import torch.nn as nn


class Lookuptable(nn.Module):
    """
    A simple lookup table.
    Args:
        nums        : the size of the lookup table
        dims        : the size of each vector
        padding_idx : the embedding at this index is all zeros (used for padding)
        sparse      : if True, the gradient w.r.t. the weight matrix will be a sparse tensor;
                      in this case only optim.SGD (CUDA and CPU) and optim.Adagrad (CPU) can be used
    """
    def __init__(self, nums, dims, padding_idx=0, sparse=False):
        super(Lookuptable, self).__init__()
        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)

    def forward(self, x):
        return self.embed(x)


if __name__ == "__main__":
    model = Lookuptable(10, 20)
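    # Illustrative lookup on a small batch of index sequences (indices assumed < 10);
    # index 0 is the padding index, so its embedding row is all zeros.
    from torch.autograd import Variable
    idx = Variable(torch.LongTensor([[1, 2, 0], [3, 4, 0]]))
    print(model(idx).size())  # expected: (2, 3, 20)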
@@ -0,0 +1,25 @@
import torch
import torch.nn as nn


class Lstm(nn.Module):
    """
    LSTM module
    Args:
        input_size    : input size
        hidden_size   : hidden size
        num_layers    : number of hidden layers
        dropout       : dropout rate
        bidirectional : if True, becomes a bidirectional RNN
    """
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(Lstm, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True,
                            batch_first=True, dropout=dropout, bidirectional=bidirectional)

    def forward(self, x):
        x, _ = self.lstm(x)
        return x


if __name__ == "__main__":
    model = Lstm(20, 30, 1, 0.5, False)
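    # Illustrative forward pass (shapes assumed): a batch of 4 sequences, 7 steps,
    # 20 features each (batch_first=True); this unidirectional LSTM returns (4, 7, 30).
    from torch.autograd import Variable
    x = Variable(torch.randn(4, 7, 20))
    print(model(x).size())  # expected: (4, 7, 30)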
@@ -0,0 +1,108 @@
import time
import torch
import torch.nn as nn
import torch.optim as optim
import encoder
import aggregation
import embedding
import predict
import dataloader

WORD_SIZE = 100
HIDDEN_SIZE = 300
D_A = 350
R = 20
MLP_HIDDEN = 2000
CLASSES_NUM = 5
WORD_NUM = 357361


class Net(nn.Module):
    """
    A model for sentiment analysis using an LSTM encoder and self-attention
    """
    def __init__(self):
        super(Net, self).__init__()
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)

    def forward(self, x):
        x = self.embedding(x)
        x = self.encoder(x)
        x, penalty = self.aggregation(x)
        x = self.predict(x)
        return x, penalty

def train(model_dict=None, using_cuda=True, learning_rate=0.06,
          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """
    training procedure
    Args:
        If model_dict is given (a file path), training continues from the given model.
        Otherwise, a new model is trained from scratch.
        If using_cuda is True, training is run on the GPU.
        learning_rate and momentum are passed to the SGD optimizer.
        coef is the coefficient between the cross-entropy loss and the penalization term.
        interval is the reporting frequency (in iterations).
        The result is saved as "model_dict_<current time>.pkl", which can be used for further training.
    """
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    if model_dict is not None:
        net.load_state_dict(torch.load(model_dict))

    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    dataset = dataloader.DataLoader("trainset.pkl", batch_size, using_cuda=using_cuda)

    # statistics
    loss_count = 0
    prepare_time = 0
    run_time = 0
    count = 0

    for epoch in range(epochs):
        for i, batch in enumerate(dataset):
            t1 = time.time()
            X = batch["feature"]
            y = batch["class"]
            t2 = time.time()

            y_pred, y_penl = net(X)
            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(net.parameters(), 0.5)
            optimizer.step()
            t3 = time.time()

            loss_count += loss.data[0]
            prepare_time += (t2 - t1)
            run_time += (t3 - t2)
            p, idx = torch.max(y_pred, dim=1)
            count += torch.sum(torch.eq(idx.data, y.data))

            if i % interval == 0:
                print("iteration: " + str(i))
                print("loss: " + str(loss_count / interval))
                print("accuracy: " + str(count / (interval * batch_size)))
                print("penalty: " + str(torch.sum(y_penl).data[0] / batch_size))
                print("prepare time: " + str(prepare_time / interval))
                print("run time: " + str(run_time / interval))
                prepare_time = 0
                run_time = 0
                loss_count = 0
                count = 0

    torch.save(net.state_dict(), "model_dict_%s.pkl" % (str(time.time())))


if __name__ == "__main__":
    train(using_cuda=torch.cuda.is_available())
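
# Data preparation sketch (not part of the training script): "trainset.pkl" is
# assumed to be a pickled list of samples in the layout DataLoader expects,
# i.e. {"sent": [word indices], "class": label in [0, CLASSES_NUM)}. Assuming the
# Word2Idx helper in this repo is importable as word2idx, a hypothetical
# preprocessing step could look like:
#
#   vocab = word2idx.Word2Idx()
#   vocab.build(all_tokens, max_num=WORD_NUM)
#   samples = [{"sent": [vocab.w2i(w) for w in tokens], "class": label}
#              for tokens, label in corpus]
#   with open("trainset.pkl", "wb") as f:
#       pickle.dump(samples, f)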
@@ -0,0 +1,25 @@
import torch
import torch.nn as nn
import torch.nn.functional as F


class MLP(nn.Module):
    """
    A two-layer perceptron for classification.
    Output : unnormalized class scores (softmax is left to the loss function)
    Args:
        input_size  : the size of the input
        hidden_size : the size of the hidden layer
        output_size : the size of the output
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.L1 = nn.Linear(input_size, hidden_size)
        self.L2 = nn.Linear(hidden_size, output_size)
        # available for callers that need normalized probabilities;
        # forward() returns raw scores so nn.CrossEntropyLoss can be applied
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        out = self.L2(F.relu(self.L1(x)))
        return out


if __name__ == "__main__":
    model = MLP(20, 30, 20)
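    # Illustrative forward pass (shapes assumed): the output holds unnormalized
    # class scores of size (batch, output_size).
    from torch.autograd import Variable
    x = Variable(torch.randn(4, 20))
    print(model(x).size())  # expected: (4, 20)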