
add readme

commit 428ca505f2 by choosewhatulike, 6 years ago (tags/v0.1.0)
8 changed files with 94 additions and 70 deletions
  1. model_inplement/preprocess.py (+0, -42)
  2. model_inplement/readme.md (+0, -2)
  3. reproduction/.gitignore (+0, -0)
  4. reproduction/README.md (+36, -0)
  5. reproduction/evaluate.py (+0, -5)
  6. reproduction/model.py (+7, -15)
  7. reproduction/preprocess.py (+51, -0)
  8. reproduction/train.py (+0, -6)

model_inplement/preprocess.py (+0, -42)

@@ -1,42 +0,0 @@
import pickle
import json
import nltk
from nltk.tokenize import stanford

# f = open('dataset/review.json', encoding='utf-8')
# samples = []
# j = 0
# for i, line in enumerate(f.readlines()):
#     review = json.loads(line)
#     samples.append((review['stars'], review['text']))
#     if (i+1) % 5000 == 0:
#         print(i)
#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
#         j += 1
#         samples = []
# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
samples = pickle.load(open('review/samples0.pkl', 'rb'))
# print(samples[0])

import os
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

dirname = 'review'
dirname1 = 'reviews'

for fn in os.listdir(dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))

model_inplement/readme.md (+0, -2)

@@ -1,2 +0,0 @@
# Implementation of the model in
Hierarchical Attention Networks for Document Classification

model_inplement/.gitignore → reproduction/.gitignore


reproduction/README.md (+36, -0)

@@ -0,0 +1,36 @@
## Introduction
This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
* The dataset consists of 600k documents extracted from the [Yelp 2018](https://www.yelp.com/dataset) customer reviews
* [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) are used to split documents into sentences and tokenize them
* Both CPU & GPU are supported
* The best accuracy is 71%, matching the performance reported in the paper

## Requirements
* python 3.6
* pytorch >= 0.3.0
* numpy
* gensim
* nltk
* Stanford CoreNLP

## Parameters
Following the paper and my experiments, the model parameters are set as follows (see the sketch after the tables):

|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
|---|---|---|---|
|200|50|1|100|

And the training parameters:

|epochs|learning rate|momentum|batch size|
|---|---|---|---|
|3|0.01|0.9|64|
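
As a rough illustration of how these values would be wired up, here is a minimal sketch; it is not taken from the repository, and the `HAN` constructor argument names, the 5-class output size, and the choice of plain SGD are assumptions:

```
import torch.optim as optim

from model import HAN  # the HAN module defined in reproduction/model.py

# Hypothetical constructor arguments mirroring the tables above:
# 200-dim word embeddings, 1-layer GRUs with hidden size 50,
# 100-dim word/sentence context vectors, 5 output classes (1-5 stars).
net = HAN(input_size=200, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)

# Training settings from the second table, assuming plain SGD:
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
```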

## Run
1. Prepare the dataset. Download the [dataset](https://www.yelp.com/dataset), unzip it, and extract the customer reviews file (review.json). Run preprocess.py to turn the raw reviews into the dataset used as model input.
2. Train the model. The model is trained and automatically saved to 'model.dict':
```
python train.py
```
3. Test the model:
```
python evaluate.py
```

model_inplement/evaluate.py → reproduction/evaluate.py

@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         if use_cuda:
             y = y.cuda()
         predicts = net(doc_list)
-        # idx = []
-        # for p in predicts.data:
-        #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-        # idx = torch.LongTensor(idx)
         p, idx = torch.max(predicts, dim=1)
         idx = idx.data
         count += torch.sum(torch.eq(idx, y))
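
The accuracy count above reduces to an argmax over the class dimension followed by an elementwise comparison. A minimal self-contained sketch of just that step, with random tensors standing in for the network output and the gold labels:

```
import torch

# Stand-ins for one evaluation batch: 64 documents, 5 star classes.
predicts = torch.randn(64, 5)     # scores from the network
y = torch.randint(0, 5, (64,))    # gold labels

# Same reduction as in evaluate(): most likely class per document, then count matches.
p, idx = torch.max(predicts, dim=1)
count = torch.sum(torch.eq(idx, y))
print('correct in batch:', int(count))
```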

model_inplement/model.py → reproduction/model.py

@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
@@ -72,18 +69,13 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
-        # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
-        # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)


if __name__ == '__main__':
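
The dimension comments in the rewritten `forward` determine all the attention shapes. The following is a small shape check with random tensors standing in for the GRU output, the `fc`+`tanh` projection, and the learned context vector; sizes follow the README parameters, so it is an illustration, not the module itself:

```
import torch

# Batch of 2 sequences of length 7; a bidirectional GRU with hidden size 50
# gives 2*50 = 100 features; the context vector size is 100.
h_t = torch.randn(2, 7, 100)         # GRU output, (batch_size, seq_len, 2*hidden_size)
context_vec = torch.randn(100, 1)    # learned context vector, assumed shape (context_vec_size, 1)

u = torch.tanh(h_t)                                         # stands in for self.tanh(self.fc(h_t))
alpha = torch.softmax(torch.matmul(u, context_vec), dim=1)  # (2, 7, 1): one weight per time step
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)       # (2, 100, 1): attention-weighted sum
print(torch.squeeze(output, dim=2).shape)                   # torch.Size([2, 100])
```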

reproduction/preprocess.py (+51, -0)

@@ -0,0 +1,51 @@
'''
Tokenize the Yelp dataset's documents using Stanford CoreNLP.
'''

import pickle
import json
import nltk
from nltk.tokenize import stanford
import os

input_filename = 'review.json'

# config for Stanford CoreNLP
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

in_dirname = 'review'
out_dirname = 'reviews'


f = open(input_filename, encoding='utf-8')
samples = []
j = 0
for i, line in enumerate(f.readlines()):
    review = json.loads(line)
    samples.append((review['stars'], review['text']))
    if (i+1) % 5000 == 0:
        print(i)
        pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
        j += 1
        samples = []
pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])


for fn in os.listdir(in_dirname):
    print(fn)
    precessed = []
    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
        tokens = []
        sents = nltk.tokenize.sent_tokenize(text)
        for s in sents:
            tokens.append(tokenizer.tokenize(s))
        precessed.append((stars, tokens))
        # print(tokens)
        if len(precessed) % 100 == 0:
            print(len(precessed))
    pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
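
Each output pickle written above holds a list of `(stars, tokenized_sentences)` tuples. A quick way to inspect one chunk, assuming preprocessing has already produced `reviews/samples0.pkl`:

```
import pickle

# Load one preprocessed chunk and look at the first review.
with open('reviews/samples0.pkl', 'rb') as f:
    processed = pickle.load(f)

stars, sentences = processed[0]
print(stars)         # star rating of the review, e.g. 5
print(sentences[0])  # first sentence as a list of tokens
```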

model_inplement/train.py → reproduction/train.py

@@ -1,9 +1,6 @@
 import os
 import pickle
 
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
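
The indexing above maps a global sample index to a pickle file, a cache slot, and an offset. A small standalone sketch of the same arithmetic (5000 samples per file and 5 cache slots, as in the code):

```
# Same arithmetic as YelpDocSet.__getitem__ above.
def locate(n, samples_per_file=5000, cache_slots=5):
    file_id = n // samples_per_file   # which samplesN.pkl holds the sample
    slot = file_id % cache_slots      # which cache slot that file occupies
    offset = n % samples_per_file     # position of the sample inside the file
    return file_id, slot, offset

print(locate(12345))  # (2, 2, 2345): sample 12345 sits in samples2.pkl at offset 2345
```
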
@@ -90,7 +86,6 @@ class YelpDocSet(Dataset):
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1,200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))
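
The loop above collects one document as a list of per-sentence embedding matrices, and the model's forward pass (see model.py earlier in this diff) packs variable-length lists of matrices with `pack_sequence` before the sentence-level GRU. A toy illustration of what `pack_sequence` does to such a list; the sentence lengths are made up, and the 200-dim size comes from the README parameters:

```
import torch
from torch.nn.utils.rnn import pack_sequence

# One document as 3 sentences of 12, 7 and 19 words, each word a 200-dim vector.
doc = [torch.randn(n_words, 200) for n_words in (12, 7, 19)]
packed = pack_sequence(sorted(doc, key=len, reverse=True))  # pack_sequence expects decreasing lengths
print(packed.data.shape)  # torch.Size([38, 200]): all words concatenated, 12 + 7 + 19 = 38
```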
