
add readme

tags/v0.1.0
choosewhatulike committed 6 years ago
commit 428ca505f2

8 changed files with 94 additions and 70 deletions
  1. model_inplement/preprocess.py (+0, -42)
  2. model_inplement/readme.md (+0, -2)
  3. reproduction/.gitignore (+0, -0)
  4. reproduction/README.md (+36, -0)
  5. reproduction/evaluate.py (+0, -5)
  6. reproduction/model.py (+7, -15)
  7. reproduction/preprocess.py (+51, -0)
  8. reproduction/train.py (+0, -6)

model_inplement/preprocess.py (+0, -42)

@@ -1,42 +0,0 @@
```python
import pickle
import json
import nltk
from nltk.tokenize import stanford

# f = open('dataset/review.json', encoding='utf-8')
# samples = []
# j = 0
# for i, line in enumerate(f.readlines()):
#     review = json.loads(line)
#     samples.append((review['stars'], review['text']))
#     if (i+1) % 5000 == 0:
#         print(i)
#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
#         j += 1
#         samples = []
# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
samples = pickle.load(open('review/samples0.pkl', 'rb'))
# print(samples[0])

import os
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

dirname = 'review'
dirname1 = 'reviews'

for fn in os.listdir(dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
```

model_inplement/readme.md (+0, -2)

@@ -1,2 +0,0 @@
```markdown
# Implementation of the model in
Hierarchical Attention Networks for Document Classification
```

model_inplement/.gitignore → reproduction/.gitignore


reproduction/README.md (+36, -0)

@@ -0,0 +1,36 @@
## Introduction
This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
* The dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
* Uses [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to split documents into sentences and tokenize them
* Supports both CPU and GPU
* The best accuracy is 71%, matching the performance reported in the paper

## Requirements
* python 3.6
* pytorch >= 0.3.0
* numpy
* gensim
* nltk
* Stanford CoreNLP

## Parameters
Following the paper and my experiments, I set the model parameters as follows:

|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
|---|---|---|---|
|200|50|1|100|

And the training parameters:

|Epochs|learning rate|momentum|batch size|
|---|---|---|---|
|3|0.01|0.9|64|
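
For reference, here is a minimal sketch of how these values could be wired up. The `HAN` keyword names below are assumptions for illustration, not necessarily the exact constructor signature in model.py:
```
import torch.nn as nn
import torch.optim as optim

from model import HAN  # the model defined in this repo

# Hypothetical keyword names; check model.py for the real signature.
net = HAN(input_size=200,         # word embedding dimension
          output_size=5,          # 1-5 star ratings
          word_hidden_size=50,    # GRU hidden size (word level)
          word_num_layers=1,      # GRU layers
          word_context_size=100,  # word context vector dimension
          sent_hidden_size=50,    # GRU hidden size (sentence level)
          sent_num_layers=1,
          sent_context_size=100)  # sentence context vector dimension

# Training parameters from the table above.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.NLLLoss()  # the model's output layer ends in a softmax
```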

## Run
1. Prepare the dataset. Download the [dataset](https://www.yelp.com/dataset), unzip it, and extract the customer reviews as a file (review.json). Use preprocess.py to transform the file into a dataset for model input.
2. Train the model. The model will be trained and automatically saved to 'model.dict':
```
python train.py
```
3. Test the model:
```
python evaluate.py
```

model_inplement/evaluate.py → reproduction/evaluate.py

```diff
@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
             if use_cuda:
                 y = y.cuda()
             predicts = net(doc_list)
-            # idx = []
-            # for p in predicts.data:
-            #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-            # idx = torch.LongTensor(idx)
             p, idx = torch.max(predicts, dim=1)
             idx = idx.data
             count += torch.sum(torch.eq(idx, y))
```
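
Taken on its own, the prediction step in this loop is just an argmax over the model's per-class log-probabilities. A small self-contained sketch with fake outputs standing in for `net(doc_list)`:
```python
import torch

# Fake (batch_size, 5) log-probabilities standing in for net(doc_list).
predicts = torch.log_softmax(torch.randn(4, 5), dim=1)
y = torch.tensor([0, 2, 4, 1])       # true star labels (0-indexed)

p, idx = torch.max(predicts, dim=1)  # most likely class per review
count = torch.sum(torch.eq(idx, y))  # how many predictions match
print(count.item())
```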

model_inplement/model.py → reproduction/model.py

```diff
@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
```
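
For context, `pack_sequence` (from `torch.nn.utils.rnn`) is what lets the sentence-level layer consume documents with different numbers of sentences. A tiny standalone demo of the packing:
```python
import torch
from torch.nn.utils.rnn import pack_sequence

# Three "documents" with 3, 2, and 1 sentence vectors of dimension 100,
# sorted by decreasing length as pack_sequence expects by default.
docs = [torch.randn(3, 100), torch.randn(2, 100), torch.randn(1, 100)]
packed = pack_sequence(docs)
print(packed.data.shape)   # torch.Size([6, 100]) -- all 6 steps, flattened
print(packed.batch_sizes)  # tensor([3, 2, 1])
```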
```diff
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
```
```diff
@@ -72,18 +69,13 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
         # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
         # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)
 
 
 if __name__ == '__main__':
```
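
As a sanity check on the shape comments above, here is a small standalone sketch of the same attention arithmetic with random placeholder tensors, dimensions chosen to match the README's parameters:
```python
import torch
import torch.nn as nn

batch_size, seq_len, word_dim = 4, 12, 200
hidden_size, context_vec_size = 50, 100

gru = nn.GRU(word_dim, hidden_size, num_layers=1,
             batch_first=True, bidirectional=True)
fc = nn.Linear(2 * hidden_size, context_vec_size)
context_vec = torch.randn(context_vec_size, 1)

inputs = torch.randn(batch_size, seq_len, word_dim)
h_t, _ = gru(inputs)                                    # (4, 12, 100)
u = torch.tanh(fc(h_t))                                 # (4, 12, 100)
alpha = torch.softmax(torch.matmul(u, context_vec), 1)  # (4, 12, 1)
output = torch.bmm(h_t.transpose(1, 2), alpha)          # (4, 100, 1)
print(output.squeeze(2).shape)                          # torch.Size([4, 100])
```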

reproduction/preprocess.py (+51, -0)

@@ -0,0 +1,51 @@
```python
'''
Tokenize yelp dataset's documents using stanford core nlp
'''

import pickle
import json
import nltk
from nltk.tokenize import stanford
import os

input_filename = 'review.json'

# config for stanford core nlp
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

in_dirname = 'review'
out_dirname = 'reviews'


f = open(input_filename, encoding='utf-8')
samples = []
j = 0
for i, line in enumerate(f.readlines()):
    review = json.loads(line)
    samples.append((review['stars'], review['text']))
    if (i+1) % 5000 == 0:
        print(i)
        pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
        j += 1
        samples = []
pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])


for fn in os.listdir(in_dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
```
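
After this script runs, each pickle in `reviews/` holds a list of `(stars, tokens)` pairs, where `tokens` is a list of sentences and each sentence is a list of word tokens. Note that NLTK's `CoreNLPTokenizer` talks to a Stanford CoreNLP server over HTTP, so a server typically needs to be running for the tokenize calls to succeed. A quick sketch of loading one sample (the review content shown is made up):
```python
import pickle

processed = pickle.load(open('reviews/samples0.pkl', 'rb'))
stars, tokens = processed[0]
# stars  -> e.g. 5.0 (hypothetical)
# tokens -> e.g. [['Great', 'food', '!'], ['Would', 'come', 'again', '.']]
```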

model_inplement/train.py → reproduction/train.py

```diff
@@ -1,9 +1,6 @@
 import os
 import pickle
 
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
@@ -90,7 +86,6 @@ class YelpDocSet(Dataset):
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1,200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))
```
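
One detail worth noting in the `@@ -60,7 +57,6 @@` hunk above: reviews are pickled 5000 per file, and `YelpDocSet` keeps five files in memory using a direct-mapped cache. A minimal illustration of the slot arithmetic:
```python
def cache_slot(n):
    file_id = n // 5000  # which pickle file holds review n
    idx = file_id % 5    # which of the 5 cache slots that file maps to
    return file_id, idx

print(cache_slot(0))      # (0, 0)
print(cache_slot(12345))  # (2, 2)
print(cache_slot(30000))  # (6, 1) -- evicts whatever file occupied slot 1
```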
