@@ -1,42 +0,0 @@
-import pickle
-import json
-import nltk
-from nltk.tokenize import stanford
-
-# f = open('dataset/review.json', encoding='utf-8')
-# samples = []
-# j = 0
-# for i, line in enumerate(f.readlines()):
-#     review = json.loads(line)
-#     samples.append((review['stars'], review['text']))
-#     if (i+1) % 5000 == 0:
-#         print(i)
-#         pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))
-#         j += 1
-#         samples = []
-# pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))
-samples = pickle.load(open('review/samples0.pkl', 'rb'))
-# print(samples[0])
-
-import os
-os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
-path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
-tokenizer = stanford.CoreNLPTokenizer()
-
-dirname = 'review'
-dirname1 = 'reviews'
-
-for fn in os.listdir(dirname):
-    print(fn)
-    precessed = []
-    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
-        tokens = []
-        sents = nltk.tokenize.sent_tokenize(text)
-        for s in sents:
-            tokens.append(tokenizer.tokenize(s))
-        precessed.append((stars, tokens))
-        # print(tokens)
-        if len(precessed) % 100 == 0:
-            print(len(precessed))
-    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
@@ -1,2 +0,0 @@
-# Implementation of the model in
-Hierarchical Attention Networks for Document Classification
@@ -0,0 +1,36 @@
+## Introduction
+This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
+* The dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
+* Documents are split into sentences and tokenized with [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/)
+* Supports both CPU and GPU
+* The best accuracy is 71%, matching the performance reported in the paper
+
+## Requirements
+* Python 3.6
+* PyTorch >= 0.3.0
+* numpy
+* gensim
+* nltk
+* Stanford CoreNLP
+
+## Parameters
+Following the paper and my experiments, I set the model parameters as follows:
+
+|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
+|---|---|---|---|
+|200|50|1|100|
+
+And the training parameters:
+
+|epochs|learning rate|momentum|batch size|
+|---|---|---|---|
+|3|0.01|0.9|64|
+
+## Run
+1. Prepare the dataset. Download the [dataset](https://www.yelp.com/dataset), extract the customer reviews to a single file, and run preprocess.py to transform that file into the dataset the model expects.
+2. Train the model. The trained model is saved automatically to `model.dict`:
+```
+python train.py
+```
+3. Test the model:
+```
+python evaluate.py
+```
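
For concreteness, here is a minimal sketch of the training-side plumbing implied by the tables and steps above: SGD with the listed learning rate and momentum, and the save/reload cycle around `model.dict`. An `nn.Linear` stands in for the real `HAN` network, since its constructor arguments are not shown in this diff, and `NLLLoss` is an assumption based on the log-probability outputs used in evaluate.py.

```
import torch
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(200, 5)  # stand-in for HAN: 200-dim word vectors in, 5 star classes out
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)  # values from the training table
criterion = nn.NLLLoss()  # assumption: the model ends in a log-softmax

# Step 2 saves the weights to 'model.dict'; step 3 would reload them like this.
torch.save(net.state_dict(), 'model.dict')
net.load_state_dict(torch.load('model.dict'))
net.eval()  # switch off training-only behaviour before evaluation
```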
@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         if use_cuda:
             y = y.cuda()
         predicts = net(doc_list)
-        # idx = []
-        # for p in predicts.data:
-        #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-        # idx = torch.LongTensor(idx)
         p, idx = torch.max(predicts, dim=1)
         idx = idx.data
         count += torch.sum(torch.eq(idx, y))
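
The second hunk drops an experimental sampling decoder (`np.random.choice` over the exponentiated log-probabilities) in favour of a deterministic argmax, so the accuracy count reduces to comparing the argmax index with the labels. A tiny self-contained illustration of that counting step, using made-up predictions:

```
import torch

# Fake log-probabilities for 3 documents over 5 star classes.
predicts = torch.log(torch.tensor([[0.1, 0.2, 0.4, 0.2, 0.1],
                                   [0.7, 0.1, 0.1, 0.05, 0.05],
                                   [0.2, 0.2, 0.2, 0.2, 0.2]]))
y = torch.tensor([2, 0, 4])          # gold labels
p, idx = torch.max(predicts, dim=1)  # deterministic argmax, as evaluate() now does
count = torch.sum(torch.eq(idx, y))  # number of correct predictions
print(count.item())  # 2
```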
@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
@@ -72,18 +69,13 @@
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
         # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
 
         # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)
 
 if __name__ == '__main__':
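
To make the shape comments in the attention hunk easier to follow, here is a standalone walkthrough of the same computation, with random tensors standing in for the learned `fc` layer and `context_vec` (hidden size 50 and context dimension 100 are taken from the README; the `2*hidden_size` in the comments implies a bidirectional GRU):

```
import torch
import torch.nn.functional as F

batch_size, seq_len, hidden_size, context_dim = 2, 7, 50, 100
h_t = torch.randn(batch_size, seq_len, 2 * hidden_size)  # bidirectional GRU output
fc_weight = torch.randn(2 * hidden_size, context_dim)    # stands in for self.fc
context_vec = torch.randn(context_dim, 1)                # stands in for self.context_vec

u = torch.tanh(h_t @ fc_weight)                 # (batch, seq_len, context_dim)
alpha = F.softmax(u @ context_vec, dim=1)       # (batch, seq_len, 1), sums to 1 over seq_len
output = torch.bmm(h_t.transpose(1, 2), alpha)  # (batch, 2*hidden_size, 1)
print(output.squeeze(2).shape)                  # torch.Size([2, 100])
```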
@@ -0,0 +1,51 @@
+'''
+Tokenize the Yelp dataset's documents using Stanford CoreNLP
+'''
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+import os
+
+input_filename = 'review.json'
+
+# config for Stanford CoreNLP
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+in_dirname = 'review'
+out_dirname = 'reviews'
+
+f = open(input_filename, encoding='utf-8')
+samples = []
+j = 0
+for i, line in enumerate(f.readlines()):
+    review = json.loads(line)
+    samples.append((review['stars'], review['text']))
+    if (i+1) % 5000 == 0:
+        print(i)
+        pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
+        j += 1
+        samples = []
+pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
+# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
+# print(samples[0])
+
+for fn in os.listdir(in_dirname):
+    print(fn)
+    processed = []
+    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        processed.append((stars, tokens))
+        # print(tokens)
+        if len(processed) % 100 == 0:
+            print(len(processed))
+    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
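
Given the layout this script writes (each `reviews/samples<k>.pkl` holds up to 5000 `(stars, sentences)` pairs, where `sentences` is a list of token lists), a quick sanity check might look like this; the shard path assumes the directory names above:

```
import pickle

with open('reviews/samples0.pkl', 'rb') as f:
    shard = pickle.load(f)

stars, sentences = shard[0]
print(len(shard))         # up to 5000 reviews per shard
print(stars)              # rating label, 1-5
print(sentences[0][:10])  # first 10 tokens of the first sentence
```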
@@ -1,9 +1,6 @@
 import os
 import pickle
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
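
The context lines above show how `YelpDocSet` maps a global sample index to a shard: 5000 reviews per pickle file, with loaded shards kept in a 5-slot cache keyed by `file_id % 5`. The arithmetic in isolation:

```
# Sketch of YelpDocSet's index arithmetic (values from the diff above).
def locate(n):
    file_id = n // 5000   # which samples<k>.pkl holds sample n
    slot = file_id % 5    # cache slot that shard would occupy
    offset = n % 5000     # position of the sample within the shard
    return file_id, slot, offset

print(locate(12345))  # (2, 2, 2345)
```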
@@ -90,7 +86,6 @@
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1, 200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))