@@ -1,42 +0,0 @@
-import pickle
-import json
-import nltk
-from nltk.tokenize import stanford
-
-# f = open('dataset/review.json', encoding='utf-8')
-# samples = []
-# j = 0
-# for i, line in enumerate(f.readlines()):
-#     review = json.loads(line)
-#     samples.append((review['stars'], review['text']))
-#     if (i+1) % 5000 == 0:
-#         print(i)
-#         pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))
-#         j += 1
-#         samples = []
-# pickle.dump(samples, open('review/samples%d.pkl' % j, 'wb'))
-samples = pickle.load(open('review/samples0.pkl', 'rb'))
-# print(samples[0])
-
-import os
-os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
-path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
-tokenizer = stanford.CoreNLPTokenizer()
-
-dirname = 'review'
-dirname1 = 'reviews'
-
-for fn in os.listdir(dirname):
-    print(fn)
-    precessed = []
-    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
-        tokens = []
-        sents = nltk.tokenize.sent_tokenize(text)
-        for s in sents:
-            tokens.append(tokenizer.tokenize(s))
-        precessed.append((stars, tokens))
-        # print(tokens)
-        if len(precessed) % 100 == 0:
-            print(len(precessed))
-    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
@@ -1,2 +0,0 @@
-# Implementation of the model in
-Hierarchical Attention Networks for Document Classification
@@ -0,0 +1,36 @@
+## Introduction
+This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
+* The dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
+* Documents are split into sentences and tokenized with [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/)
+* Supports both CPU and GPU
+* The best accuracy is 71%, matching the performance reported in the paper
+
+## Requirements
+* Python 3.6
+* PyTorch >= 0.3.0
+* numpy
+* gensim
+* nltk
+* Stanford CoreNLP
+
+## Parameters
+Following the paper and my experiments, I set the model parameters as follows:
+
+|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
+|---|---|---|---|
+|200|50|1|100|
+
+And the training parameters:
+
+|epochs|learning rate|momentum|batch size|
+|---|---|---|---|
+|3|0.01|0.9|64|
+
+## Run
+1. Prepare the dataset. Download the [dataset](https://www.yelp.com/dataset), extract the customer reviews to a single file, and run preprocess.py to transform that file into the dataset the model expects.
+2. Train the model. The trained model is saved automatically to `model.dict`:
+```
+python train.py
+```
+3. Test the model:
+```
+python evaluate.py
+```
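
For concreteness, here is a minimal sketch of the training-side plumbing implied by the tables and steps above: SGD with the listed learning rate and momentum, and the save/reload cycle around `model.dict`. An `nn.Linear` stands in for the real `HAN` network, since its constructor arguments are not shown in this diff, and `NLLLoss` is an assumption based on the log-probability outputs used in evaluate.py.

```
import torch
import torch.nn as nn
import torch.optim as optim

net = nn.Linear(200, 5)  # stand-in for HAN: 200-dim word vectors in, 5 star classes out
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)  # values from the training table
criterion = nn.NLLLoss()  # assumption: the model ends in a log-softmax

# Step 2 saves the weights to 'model.dict'; step 3 would reload them like this.
torch.save(net.state_dict(), 'model.dict')
net.load_state_dict(torch.load('model.dict'))
net.eval()  # switch off training-only behaviour before evaluation
```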
@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         if use_cuda:
             y = y.cuda()
         predicts = net(doc_list)
-        # idx = []
-        # for p in predicts.data:
-        #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-        # idx = torch.LongTensor(idx)
         p, idx = torch.max(predicts, dim=1)
         idx = idx.data
         count += torch.sum(torch.eq(idx, y))
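
The second hunk drops an experimental sampling decoder (`np.random.choice` over the exponentiated log-probabilities) in favour of a deterministic argmax, so the accuracy count reduces to comparing the argmax index with the labels. A tiny self-contained illustration of that counting step, using made-up predictions:

```
import torch

# Fake log-probabilities for 3 documents over 5 star classes.
predicts = torch.log(torch.tensor([[0.1, 0.2, 0.4, 0.2, 0.1],
                                   [0.7, 0.1, 0.1, 0.05, 0.05],
                                   [0.2, 0.2, 0.2, 0.2, 0.2]]))
y = torch.tensor([2, 0, 4])          # gold labels
p, idx = torch.max(predicts, dim=1)  # deterministic argmax, as evaluate() now does
count = torch.sum(torch.eq(idx, y))  # number of correct predictions
print(count.item())  # 2
```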
@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
@@ -72,18 +69,13 @@
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
         # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
 
         # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)
 
 if __name__ == '__main__':
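
To make the shape comments in the attention hunk easier to follow, here is a standalone walkthrough of the same computation, with random tensors standing in for the learned `fc` layer and `context_vec` (hidden size 50 and context dimension 100 are taken from the README; the `2*hidden_size` in the comments implies a bidirectional GRU):

```
import torch
import torch.nn.functional as F

batch_size, seq_len, hidden_size, context_dim = 2, 7, 50, 100
h_t = torch.randn(batch_size, seq_len, 2 * hidden_size)  # bidirectional GRU output
fc_weight = torch.randn(2 * hidden_size, context_dim)    # stands in for self.fc
context_vec = torch.randn(context_dim, 1)                # stands in for self.context_vec

u = torch.tanh(h_t @ fc_weight)                 # (batch, seq_len, context_dim)
alpha = F.softmax(u @ context_vec, dim=1)       # (batch, seq_len, 1), sums to 1 over seq_len
output = torch.bmm(h_t.transpose(1, 2), alpha)  # (batch, 2*hidden_size, 1)
print(output.squeeze(2).shape)                  # torch.Size([2, 100])
```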
@@ -0,0 +1,51 @@
+'''
+Tokenize the Yelp dataset's documents using Stanford CoreNLP
+'''
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+import os
+
+input_filename = 'review.json'
+
+# config for Stanford CoreNLP
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+in_dirname = 'review'
+out_dirname = 'reviews'
+
+f = open(input_filename, encoding='utf-8')
+samples = []
+j = 0
+for i, line in enumerate(f.readlines()):
+    review = json.loads(line)
+    samples.append((review['stars'], review['text']))
+    if (i+1) % 5000 == 0:
+        print(i)
+        pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
+        j += 1
+        samples = []
+pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb'))
+# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
+# print(samples[0])
+
+for fn in os.listdir(in_dirname):
+    print(fn)
+    processed = []
+    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        processed.append((stars, tokens))
+        # print(tokens)
+        if len(processed) % 100 == 0:
+            print(len(processed))
+    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
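
Given the layout this script writes (each `reviews/samples<k>.pkl` holds up to 5000 `(stars, sentences)` pairs, where `sentences` is a list of token lists), a quick sanity check might look like this; the shard path assumes the directory names above:

```
import pickle

with open('reviews/samples0.pkl', 'rb') as f:
    shard = pickle.load(f)

stars, sentences = shard[0]
print(len(shard))         # up to 5000 reviews per shard
print(stars)              # rating label, 1-5
print(sentences[0][:10])  # first 10 tokens of the first sentence
```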
@@ -1,9 +1,6 @@
 import os
 import pickle
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
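
The context lines above show how `YelpDocSet` maps a global sample index to a shard: 5000 reviews per pickle file, with loaded shards kept in a 5-slot cache keyed by `file_id % 5`. The arithmetic in isolation:

```
# Sketch of YelpDocSet's index arithmetic (values from the diff above).
def locate(n):
    file_id = n // 5000   # which samples<k>.pkl holds sample n
    slot = file_id % 5    # cache slot that shard would occupy
    offset = n % 5000     # position of the sample within the shard
    return file_id, slot, offset

print(locate(12345))  # (2, 2, 2345)
```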
@@ -90,7 +86,6 @@
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1, 200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))