From 73e87b5db1bbd583ade83f3ae50b10de859ad9c1 Mon Sep 17 00:00:00 2001 From: choosewhatulike <1901722105@qq.com> Date: Sat, 23 Jun 2018 17:36:53 +0800 Subject: [PATCH] add some sample data --- .../evaluate.py | 0 .../model.py | 0 .../preprocess.py | 0 .../train.py | 10 ++++-- reproduction/.gitignore | 1 - reproduction/README.md | 36 ------------------- 6 files changed, 7 insertions(+), 40 deletions(-) rename {reproduction => HAN-document_classification}/evaluate.py (100%) rename {reproduction => HAN-document_classification}/model.py (100%) rename {reproduction => HAN-document_classification}/preprocess.py (100%) rename {reproduction => HAN-document_classification}/train.py (95%) delete mode 100644 reproduction/.gitignore delete mode 100644 reproduction/README.md diff --git a/reproduction/evaluate.py b/HAN-document_classification/evaluate.py similarity index 100% rename from reproduction/evaluate.py rename to HAN-document_classification/evaluate.py diff --git a/reproduction/model.py b/HAN-document_classification/model.py similarity index 100% rename from reproduction/model.py rename to HAN-document_classification/model.py diff --git a/reproduction/preprocess.py b/HAN-document_classification/preprocess.py similarity index 100% rename from reproduction/preprocess.py rename to HAN-document_classification/preprocess.py diff --git a/reproduction/train.py b/HAN-document_classification/train.py similarity index 95% rename from reproduction/train.py rename to HAN-document_classification/train.py index add570c1..72e2a696 100644 --- a/reproduction/train.py +++ b/HAN-document_classification/train.py @@ -147,7 +147,7 @@ if __name__ == '__main__': import gensim from gensim import models - # train_word_vec() + train_word_vec() embed_model = Word2Vec.load('yelp.word2vec') embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) @@ -158,6 +158,10 @@ if __name__ == '__main__': net = HAN(input_size=200, output_size=5, word_hidden_size=50, word_num_layers=1, word_context_size=100, sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) - - net.load_state_dict(torch.load('model.dict')) + try: + net.load_state_dict(torch.load('model.dict')) + print("last time trained model has loaded") + except Exception: + print("cannot load model, train the inital model") + train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True) \ No newline at end of file diff --git a/reproduction/.gitignore b/reproduction/.gitignore deleted file mode 100644 index 7e99e367..00000000 --- a/reproduction/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.pyc \ No newline at end of file diff --git a/reproduction/README.md b/reproduction/README.md deleted file mode 100644 index c5c197bd..00000000 --- a/reproduction/README.md +++ /dev/null @@ -1,36 +0,0 @@ -## Introduction -This is the implementation of [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf) paper in PyTorch. -* Dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews -* Use [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to tokenize documents and sentences -* Both CPU & GPU support -* The best accuracy is 71%, reaching the same performance in the paper - -## Requirement -* python 3.6 -* pytorch = 0.3.0 -* numpy -* gensim -* nltk -* coreNLP - -## Parameters -According to the paper and experiment, I set model parameters: -|word embedding dimension|GRU hidden size|GRU layer|word/sentence context vector dimension| -|---|---|---|---| -|200|50|1|100| - -And the training parameters: -|Epoch|learning rate|momentum|batch size| -|---|---|---|---| -|3|0.01|0.9|64| - -## Run -1. Prepare dataset. Download the [data set](https://www.yelp.com/dataset), and unzip the custom reviews as a file. Use preprocess.py to transform file into data set foe model input. -2. Train the model. The model will trained and autosaved in 'model.dict' -``` -python train -``` -3. Test the model. -``` -python evaluate -``` \ No newline at end of file