@@ -0,0 +1 @@
+*.pyc
@@ -20,16 +20,20 @@ class HAN(nn.Module):
                                 sent_num_layers,
                                 sent_context_size)
         self.output_layer = nn.Linear(2 * sent_hidden_size, output_size)
-        self.softmax = nn.Softmax()
+        self.softmax = nn.LogSoftmax(dim=1)
 
-    def forward(self, doc):
+    def forward(self, batch_doc):
         # input is a sequence of vectors
         # if level == w, a seq of words (a sent); if level == s, a seq of sents (a doc)
-        s_list = []
-        for sent in doc:
-            s_list.append(self.word_layer(sent))
-        s_vec = torch.cat(s_list, dim=1).t()
-        doc_vec = self.sent_layer(s_vec)
+        doc_vec_list = []
+        for doc in batch_doc:
+            s_list = []
+            for sent in doc:
+                s_list.append(self.word_layer(sent))
+            s_vec = torch.cat(s_list, dim=0)
+            vec = self.sent_layer(s_vec)
+            doc_vec_list.append(vec)
+        doc_vec = torch.cat(doc_vec_list, dim=0)
         output = self.softmax(self.output_layer(doc_vec))
         return output
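The switch from `Softmax` to `LogSoftmax(dim=1)` matters because the training script below pairs the model with `nn.NLLLoss`, which expects log-probabilities; together the two are equivalent to cross-entropy on raw logits. A quick standalone check (a sketch, not part of the patch; sizes are illustrative):

```python
import torch
import torch.nn as nn

logits = torch.randn(4, 5)            # (batch, num_classes): 4 docs, 5 star ratings
target = torch.tensor([0, 2, 4, 1])   # gold class indices

# LogSoftmax over dim=1 (the class dimension) feeds NLLLoss...
loss_a = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), target)
# ...which matches CrossEntropyLoss applied to the raw logits.
loss_b = nn.CrossEntropyLoss()(logits, target)
assert torch.allclose(loss_a, loss_b)
```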
@@ -51,7 +55,7 @@ class AttentionNet(nn.Module):
         # Attention
         self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size)
         self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax()
+        self.softmax = nn.Softmax(dim=0)
         # context vector
         self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
         self.context_vec.data.uniform_(-0.1, 0.1)
@@ -63,47 +67,8 @@ class AttentionNet(nn.Module):
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
-        output = torch.mm(h_t.t(), alpha)
-        # output's dim (2*hidden_size, 1)
+        output = torch.mm(h_t.t(), alpha).t()
+        # output's dim (1, 2*hidden_size)
         return output
-
-'''
-    Train process
-'''
-import math
-import os
-import copy
-import pickle
-
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-import numpy as np
-import json
-import nltk
-
-optimizer = torch.optim.SGD(lr=0.01)
-criterion = nn.NLLLoss()
-
-epoch = 1
-batch_size = 10
-
-net = HAN(input_size=100, output_size=5,
-          word_hidden_size=50, word_num_layers=1, word_context_size=100,
-          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
-
-def dataloader(filename):
-    samples = pickle.load(open(filename, 'rb'))
-    return samples
-
-def gen_doc(text):
-    pass
-
-class SampleDoc:
-    def __init__(self, doc, label):
-        self.doc = doc
-        self.label = label
-
-    def __iter__(self):
-        for sent in self.doc:
-            for word in sent:
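Two things happen in this hunk: the in-file training stub is removed (it moves to the new training script below), and the attention output is transposed to (1, 2*hidden_size), which is what lets the new HAN.forward concatenate sentence and document vectors along dim=0. Softmax(dim=0) normalizes the attention weights across time steps. A shape walk-through (a sketch with made-up sizes, not from the patch):

```python
import torch

seq_len, hidden, context = 7, 50, 100          # illustrative sizes
h_t = torch.randn(seq_len, 2 * hidden)         # one bi-GRU state per time step
fc = torch.nn.Linear(2 * hidden, context)
context_vec = torch.randn(context, 1)

u = torch.tanh(fc(h_t))                          # (seq_len, context)
alpha = torch.softmax(u.mm(context_vec), dim=0)  # (seq_len, 1), sums to 1 over steps
output = h_t.t().mm(alpha).t()                   # (1, 2*hidden): weighted sum of states
assert output.shape == (1, 2 * hidden)
```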
@@ -0,0 +1,138 @@
+import gensim
+from gensim import models
+import os
+import pickle
+
+
+class SampleIter:
+    def __init__(self, dirname):
+        self.dirname = dirname
+
+    def __iter__(self):
+        for f in os.listdir(self.dirname):
+            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
+                yield x, y
+
+
+class SentIter:
+    def __init__(self, dirname, count):
+        self.dirname = dirname
+        self.count = int(count)
+
+    def __iter__(self):
+        for f in os.listdir(self.dirname)[:self.count]:
+            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
+                for sent in x:
+                    yield sent
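Both iterators assume a specific on-disk layout: every pickle under the data directory holds a list of (label, doc) pairs, where a doc is a list of sentences and each sentence is a list of word tokens. A sketch of that format (file name and tokens invented for illustration):

```python
import os
import pickle

os.makedirs('reviews', exist_ok=True)
samples = [
    (5, [['great', 'food'], ['would', 'come', 'back']]),   # (label, doc)
    (1, [['terrible', 'service']]),
]
with open(os.path.join('reviews', 'part0.pkl'), 'wb') as f:
    pickle.dump(samples, f)

# SentIter streams one sentence at a time, so gensim can train
# without loading the whole corpus into memory:
for sent in SentIter('reviews', 1):
    print(sent)   # ['great', 'food'], then ['would', 'come', 'back'], ...
```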
+
+def train_word_vec():
+    # load data
+    dirname = 'reviews'
+    sents = SentIter(dirname, 238)
+    # define model and train
+    model = models.Word2Vec(sentences=sents, size=200, sg=0, workers=4, min_count=5)
+    model.save('yelp.word2vec')
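Once saved, the vectors reload with `Word2Vec.load`; note that `size=` is the gensim 3.x-era keyword (gensim 4+ renamed it to `vector_size=`). A loading sketch ('pizza' is an illustrative query word, not from the patch):

```python
from gensim.models import Word2Vec

model = Word2Vec.load('yelp.word2vec')
print(model.wv.vector_size)                        # 200
if 'pizza' in model.wv:                            # min_count=5 may drop rare words
    print(model.wv.most_similar('pizza', topn=3))
```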
+
+'''
+    Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+from gensim.models import Word2Vec
+
+import torch
+import torch.nn as nn
+from torch.autograd import Variable
+from torch.utils.data import DataLoader, Dataset
+from model import *
+
+net = HAN(input_size=200, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
+criterion = nn.NLLLoss()
+
+num_epoch = 1
+batch_size = 64
+
+class Embedding_layer:
+    """Thin lookup wrapper: trained vector if the word is in-vocabulary,
+    a zero vector of the same size otherwise."""
+    def __init__(self, wv, vector_size):
+        self.wv = wv
+        self.vector_size = vector_size
+
+    def get_vec(self, w):
+        try:
+            v = self.wv[w]
+        except KeyError:
+            v = np.zeros(self.vector_size)
+        return v
+
+embed_model = Word2Vec.load('yelp.word2vec')
+embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
+del embed_model
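The wrapper keeps only the KeyedVectors and frees the full Word2Vec model. A usage sketch ('food' and 'qqqxyz' are illustrative tokens):

```python
v = embedding.get_vec('food')     # trained 200-d vector if 'food' is in vocab
z = embedding.get_vec('qqqxyz')   # hits the KeyError branch: np.zeros(200)
assert v.shape == z.shape == (200,)
```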
+
+class YelpDocSet(Dataset):
+    def __init__(self, dirname, num_files, embedding):
+        self.dirname = dirname
+        self.num_files = num_files
+        self._len = num_files * 5000   # each pickle shard holds 5000 samples
+        self._files = os.listdir(dirname)[:num_files]
+        self.embedding = embedding
+
+    def __len__(self):
+        return self._len
+
+    def __getitem__(self, n):
+        file_id = n // 5000
+        sample_list = pickle.load(open(
+            os.path.join(self.dirname, self._files[file_id]), 'rb'))
+        y, x = sample_list[n % 5000]
+        return x, y - 1   # shift star labels 1..5 to class indices 0..4
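One thing to note: as written, every `__getitem__` call un-pickles an entire 5000-sample shard. A minimal caching tweak (a sketch, not part of the patch) keeps the most recently loaded shard in memory, which helps substantially when the DataLoader reads indices in order:

```python
class CachedYelpDocSet(YelpDocSet):
    """Same contract as YelpDocSet, but caches the last-loaded shard."""
    def __init__(self, dirname, num_files, embedding):
        super().__init__(dirname, num_files, embedding)
        self._cache_id, self._cache = None, None

    def __getitem__(self, n):
        file_id = n // 5000
        if file_id != self._cache_id:
            with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                self._cache = pickle.load(f)
            self._cache_id = file_id
        y, x = self._cache[n % 5000]
        return x, y - 1
```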
+
+def collate(iterable):
+    y_list = []
+    x_list = []
+    for x, y in iterable:
+        y_list.append(y)
+        x_list.append(x)
+    return x_list, torch.LongTensor(y_list)
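A custom `collate_fn` is needed here because documents are ragged nested lists of tokens; the DataLoader's default collate tries to stack every field into a tensor and would fail. This one passes the docs through untouched and only tensorizes the labels (tokens below invented for illustration):

```python
batch = [([['a', 'b'], ['c']], 3), ([['d']], 0)]   # two (doc, label) pairs
x_list, y_tensor = collate(batch)
print(len(x_list))   # 2 -- ragged docs, unchanged
print(y_tensor)      # tensor([3, 0])
```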
+
+if __name__ == '__main__':
+    dirname = 'reviews'
+    dataloader = DataLoader(YelpDocSet(dirname, 238, embedding),
+                            batch_size=batch_size, collate_fn=collate)
+    running_loss = 0.0
+    print_size = 10
+
+    for epoch in range(num_epoch):
+        for i, batch_samples in enumerate(dataloader):
+            x, y = batch_samples
+            # rebuild each doc as a list of (sent_len, 200) sentence tensors
+            doc_list = []
+            for sample in x:
+                doc = []
+                for sent in sample:
+                    sent_vec = []
+                    for word in sent:
+                        vec = embedding.get_vec(word)
+                        sent_vec.append(torch.Tensor(vec.reshape((1, -1))))
+                    sent_vec = torch.cat(sent_vec, dim=0)
+                    doc.append(Variable(sent_vec))
+                doc_list.append(doc)
+            y = Variable(y)
+            predict = net(doc_list)
+            loss = criterion(predict, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            running_loss += loss.data[0]
+            print(loss.data[0])
+            if i % print_size == print_size - 1:
+                print(running_loss / print_size)
+                running_loss = 0.0
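The loop uses PyTorch 0.3-era idioms. On PyTorch >= 0.4 the `Variable` wrapper is unnecessary and scalar losses are read with `.item()`; the equivalent update step would be (a sketch, not part of the patch):

```python
predict = net(doc_list)       # plain tensors, no Variable wrapper needed
loss = criterion(predict, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.item()   # replaces the deprecated loss.data[0]
```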