diff --git a/model_inplement/preprocess.py b/model_inplement/preprocess.py
deleted file mode 100644
index 37f6eb25..00000000
--- a/model_inplement/preprocess.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pickle
-import json
-import nltk
-from nltk.tokenize import stanford
-
-# f = open('dataset/review.json', encoding='utf-8')
-# samples = []
-# j = 0
-# for i, line in enumerate(f.readlines()):
-#     review = json.loads(line)
-#     samples.append((review['stars'], review['text']))
-#     if (i+1) % 5000 == 0:
-#         print(i)
-#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
-#         j += 1
-#         samples = []
-# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
-samples = pickle.load(open('review/samples0.pkl', 'rb'))
-# print(samples[0])
-
-import os
-os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
-path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
-tokenizer = stanford.CoreNLPTokenizer()
-
-dirname = 'review'
-dirname1 = 'reviews'
-
-for fn in os.listdir(dirname):
-    print(fn)
-    precessed = []
-    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
-        tokens = []
-        sents = nltk.tokenize.sent_tokenize(text)
-        for s in sents:
-            tokens.append(tokenizer.tokenize(s))
-        precessed.append((stars, tokens))
-        # print(tokens)
-        if len(precessed) % 100 == 0:
-            print(len(precessed))
-    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
-
diff --git a/model_inplement/readme.md b/model_inplement/readme.md
deleted file mode 100644
index 9da1bde6..00000000
--- a/model_inplement/readme.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Implementation of the model in
-Hierarchical Attention Networks for Document Classification
diff --git a/model_inplement/.gitignore b/reproduction/.gitignore
similarity index 100%
rename from model_inplement/.gitignore
rename to reproduction/.gitignore
diff --git a/reproduction/README.md b/reproduction/README.md
new file mode 100644
index 00000000..cf90cb7d
--- /dev/null
+++ b/reproduction/README.md
@@ -0,0 +1,38 @@
+## Introduction
+This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
+* The dataset consists of 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
+* Uses [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to split documents into sentences and tokenize them
+* Supports both CPU and GPU
+* The best accuracy is 71%, matching the performance reported in the paper
+
+## Requirements
+* python 3.6
+* pytorch >= 0.3.0
+* numpy
+* gensim
+* nltk
+* Stanford CoreNLP
+
+## Parameters
+Following the paper and my experiments, I set the model parameters as follows:
+
+|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
+|---|---|---|---|
+|200|50|1|100|
+
+And the training parameters:
+
+|epochs|learning rate|momentum|batch size|
+|---|---|---|---|
+|3|0.01|0.9|64|
+
+## Run
+1. Prepare the dataset. Download the [Yelp dataset](https://www.yelp.com/dataset), unzip it, and extract the customer reviews file (`review.json`). Run `preprocess.py` to transform the raw file into the dataset format the model expects.
+2. Train the model. The trained model is automatically saved to `model.dict`:
+```
+python train.py
+```
+3. Test the model:
+```
+python evaluate.py
+```
\ No newline at end of file
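For orientation, here is a minimal sketch of how the pieces described in this README fit together. The `HAN` keyword arguments below are assumptions read off the parameter tables, not the verbatim signature from `model.py`, and the loss choice assumes the model's final layer emits log-probabilities:

```python
# Hypothetical training setup matching the README tables above.
import torch.nn as nn
import torch.optim as optim

from model import HAN  # the module added in this patch

net = HAN(input_size=200,         # word embedding dimension
          output_class_num=5,     # Yelp star ratings 1-5 (assumed)
          word_hidden_size=50,    # GRU hidden size, word level
          word_num_layers=1,      # GRU layers
          word_context_size=100,  # word context vector dimension
          sent_hidden_size=50,    # GRU hidden size, sentence level
          sent_num_layers=1,
          sent_context_size=100)  # sentence context vector dimension

# Training parameters from the second table: 3 epochs, lr 0.01, momentum 0.9, batch 64.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.NLLLoss()  # assumes the model outputs log-softmax scores
num_epoch, batch_size = 3, 64
```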
diff --git a/model_inplement/evaluate.py b/reproduction/evaluate.py
similarity index 88%
rename from model_inplement/evaluate.py
rename to reproduction/evaluate.py
index 7384428a..1f10a9a2 100644
--- a/model_inplement/evaluate.py
+++ b/reproduction/evaluate.py
@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@
         if use_cuda:
             y = y.cuda()
         predicts = net(doc_list)
-        # idx = []
-        # for p in predicts.data:
-        #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-        # idx = torch.LongTensor(idx)
         p, idx = torch.max(predicts, dim=1)
         idx = idx.data
         count += torch.sum(torch.eq(idx, y))
diff --git a/model_inplement/model.py b/reproduction/model.py
similarity index 83%
rename from model_inplement/model.py
rename to reproduction/model.py
index e1e52f97..ce047f8e 100644
--- a/model_inplement/model.py
+++ b/reproduction/model.py
@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
@@ -72,18 +69,13 @@
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
         # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
         # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)
 
 
 if __name__ == '__main__':
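The attention arithmetic in `AttentionNet.forward` is dense; as a reading aid, here is a self-contained sketch of the same computation with the shapes from the inline comments spelled out (all sizes are illustrative, and the random `context_vec` stands in for the learned parameter):

```python
# Standalone sketch of the attention step in AttentionNet.forward.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len = 4, 10
hidden_size, context_vec_size = 50, 100

h_t = torch.randn(batch_size, seq_len, 2 * hidden_size)  # bidirectional GRU outputs
fc = nn.Linear(2 * hidden_size, context_vec_size)
context_vec = torch.randn(context_vec_size, 1)            # learned context vector in the model

u = torch.tanh(fc(h_t))                                   # (batch, seq_len, context_vec_size)
alpha = F.softmax(torch.matmul(u, context_vec), dim=1)    # (batch, seq_len, 1) attention weights
output = torch.bmm(h_t.transpose(1, 2), alpha)            # (batch, 2*hidden_size, 1) weighted sum
print(output.squeeze(2).shape)                            # torch.Size([4, 100])
```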
diff --git a/reproduction/preprocess.py b/reproduction/preprocess.py
new file mode 100644
index 00000000..0de0d74e
--- /dev/null
+++ b/reproduction/preprocess.py
@@ -0,0 +1,50 @@
+'''
+    Tokenize the Yelp dataset's documents using Stanford CoreNLP.
+'''
+
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+import os
+
+input_filename = 'review.json'
+
+# config for Stanford CoreNLP
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+in_dirname = 'review'
+out_dirname = 'reviews'
+
+
+f = open(input_filename, encoding='utf-8')
+samples = []
+j = 0
+for i, line in enumerate(f.readlines()):
+    review = json.loads(line)
+    samples.append((review['stars'], review['text']))
+    if (i+1) % 5000 == 0:
+        print(i)
+        pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
+        j += 1
+        samples = []
+pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
+# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
+# print(samples[0])
+
+
+for fn in os.listdir(in_dirname):
+    print(fn)
+    processed = []
+    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        processed.append((stars, tokens))
+        if len(processed) % 100 == 0:
+            print(len(processed))
+    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
+
diff --git a/model_inplement/train.py b/reproduction/train.py
similarity index 96%
rename from model_inplement/train.py
rename to reproduction/train.py
index a4a64b32..add570c1 100644
--- a/model_inplement/train.py
+++ b/reproduction/train.py
@@ -1,9 +1,6 @@
 import os
 import pickle
 
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
@@ -90,7 +86,6 @@
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1,200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))
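The shard cache in `YelpDocSet.__getitem__` (the `@@ -60,7 +57,6 @@` hunk above) relies on `preprocess.py` storing 5000 samples per pickle file. Its index arithmetic, isolated as a small illustrative helper that is not part of the repo:

```python
# Illustrative helper (not in the repo): how YelpDocSet maps a global
# sample index to a shard file, a cache slot, and an in-shard offset.
def locate(n, shard_size=5000, cache_slots=5):
    file_id = n // shard_size     # which samples<file_id>.pkl holds sample n
    slot = file_id % cache_slots  # the cache slot that shard is loaded into
    offset = n % shard_size       # position of the sample inside the shard
    return file_id, slot, offset

print(locate(12345))  # (2, 2, 2345): sample 12345 sits in samples2.pkl
```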