@@ -147,7 +147,7 @@ if __name__ == '__main__':
     import gensim
     from gensim import models
-    # train_word_vec()
+    train_word_vec()
     embed_model = Word2Vec.load('yelp.word2vec')
     embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
@@ -158,6 +158,10 @@ if __name__ == '__main__':
     net = HAN(input_size=200, output_size=5,
               word_hidden_size=50, word_num_layers=1, word_context_size=100,
               sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
-    net.load_state_dict(torch.load('model.dict'))
+    try:
+        net.load_state_dict(torch.load('model.dict'))
+        print("loaded model from the previous training run")
+    except Exception:
+        print("cannot load model, training the initial model from scratch")
     train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True)
@@ -1 +0,0 @@
-*.pyc
@@ -1,36 +0,0 @@
## Introduction
This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
* The dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
* [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) are used to split documents into sentences and sentences into word tokens (see the sketch below)
* Both CPU & GPU are supported
* The best accuracy is 71%, matching the accuracy reported in the paper
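
A minimal sketch of that tokenization step using NLTK alone (the repo's actual preprocessing may also go through Stanford CoreNLP, and `tokenize_document` is a hypothetical helper, not a function from this repo):
```
import nltk

# One-time setup: the sentence tokenizer needs the punkt models.
# nltk.download('punkt')

def tokenize_document(text):
    """Split a review into sentences, then each sentence into word tokens."""
    sentences = nltk.sent_tokenize(text)               # document -> sentences
    return [nltk.word_tokenize(s) for s in sentences]  # sentence -> words

print(tokenize_document("The food was great. Service was slow, though."))
```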
## Requirements
* python 3.6
* pytorch 0.3.0
* numpy
* gensim
* nltk
* Stanford CoreNLP
## Parameters
Following the paper and my experiments, I set the model parameters as follows:

|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
|---|---|---|---|
|200|50|1|100|
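
These values map directly onto the HAN constructor seen in the train.py diff above; a minimal instantiation sketch, where the `from model import HAN` import path is an assumption:
```
from model import HAN  # assumption: the actual module name may differ

# input_size is the 200-d word embedding; output_size is 5 Yelp star classes
net = HAN(input_size=200, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
```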
And the training parameters:

|Epochs|learning rate|momentum|batch size|
|---|---|---|---|
|3|0.01|0.9|64|
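
The momentum column implies SGD with momentum, the optimizer used in the paper; a runnable sketch of one optimization step with these settings, using a stand-in linear model since HAN itself lives in this repo (written against a current PyTorch API rather than 0.3.0):
```
import torch
import torch.nn as nn
import torch.optim as optim

model = nn.Linear(200, 5)           # stand-in for HAN: 200-d input, 5 classes
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

x = torch.randn(64, 200)            # one batch of 64 document representations
y = torch.randint(0, 5, (64,))      # gold star ratings in 0..4
loss = criterion(model(x), y)       # forward pass + loss
optimizer.zero_grad()               # clear stale gradients
loss.backward()                     # backpropagate
optimizer.step()                    # SGD update with lr=0.01, momentum=0.9
```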
## Run
1. Prepare the dataset. Download the [data set](https://www.yelp.com/dataset) and unzip the customer reviews into a single file. Use preprocess.py to transform that file into a dataset for model input.
2. Train the model. The model is trained and automatically saved to 'model.dict':
```
python train.py
```
3. Test the model:
```
python evaluate.py
```
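
After training, the saved weights can be restored the same way the train.py diff above does; a minimal sketch of loading 'model.dict' for inference (again, the HAN import path is an assumption):
```
import torch
from model import HAN  # assumption: the actual module name may differ

net = HAN(input_size=200, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
net.load_state_dict(torch.load('model.dict'))
net.eval()  # switch off dropout and other training-only behavior
```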