diff --git a/model_inplement/preprocess.py b/model_inplement/preprocess.py
deleted file mode 100644
index 37f6eb25..00000000
--- a/model_inplement/preprocess.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pickle
-import json
-import nltk
-from nltk.tokenize import stanford
-
-# f = open('dataset/review.json', encoding='utf-8')
-# samples = []
-# j = 0
-# for i, line in enumerate(f.readlines()):
-#     review = json.loads(line)
-#     samples.append((review['stars'], review['text']))
-#     if (i+1) % 5000 == 0:
-#         print(i)
-#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
-#         j += 1
-#         samples = []
-# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
-samples = pickle.load(open('review/samples0.pkl', 'rb'))
-# print(samples[0])
-
-import os
-os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
-path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
-tokenizer = stanford.CoreNLPTokenizer()
-
-dirname = 'review'
-dirname1 = 'reviews'
-
-for fn in os.listdir(dirname):
-    print(fn)
-    precessed = []
-    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
-        tokens = []
-        sents = nltk.tokenize.sent_tokenize(text)
-        for s in sents:
-            tokens.append(tokenizer.tokenize(s))
-        precessed.append((stars, tokens))
-        # print(tokens)
-        if len(precessed) % 100 == 0:
-            print(len(precessed))
-    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
-
diff --git a/model_inplement/readme.md b/model_inplement/readme.md
deleted file mode 100644
index 9da1bde6..00000000
--- a/model_inplement/readme.md
+++ /dev/null
@@ -1,2 +0,0 @@
-# Implementation of the model in
-Hierarchical Attention Networks for Document Classification
diff --git a/model_inplement/.gitignore b/reproduction/.gitignore
similarity index 100%
rename from model_inplement/.gitignore
rename to reproduction/.gitignore
diff --git a/reproduction/README.md b/reproduction/README.md
new file mode 100644
index 00000000..cf90cb7d
--- /dev/null
+++ b/reproduction/README.md
@@ -0,0 +1,38 @@
+## Introduction
+This is a PyTorch implementation of the paper [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf).
+* The dataset consists of 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
+* Uses [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to split documents into sentences and tokenize them
+* Supports both CPU and GPU
+* The best accuracy is 71%, matching the performance reported in the paper
+
+## Requirements
+* python 3.6
+* pytorch >= 0.3.0
+* numpy
+* gensim
+* nltk
+* Stanford CoreNLP
+
+## Parameters
+Following the paper and my experiments, I set the model parameters as follows:
+
+|word embedding dimension|GRU hidden size|GRU layers|word/sentence context vector dimension|
+|---|---|---|---|
+|200|50|1|100|
+
+And the training parameters:
+
+|epochs|learning rate|momentum|batch size|
+|---|---|---|---|
+|3|0.01|0.9|64|
+
+## Run
+1. Prepare the dataset. Download the [Yelp dataset](https://www.yelp.com/dataset), unzip it, and extract the customer reviews file (`review.json`). Run `preprocess.py` to transform the raw file into the dataset format the model expects.
+2. Train the model. The trained model is automatically saved to `model.dict`:
+```
+python train.py
+```
+3. Test the model:
+```
+python evaluate.py
+```
\ No newline at end of file
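For orientation, here is a minimal sketch of how the pieces described in this README fit together. The `HAN` keyword arguments below are assumptions read off the parameter tables, not the verbatim signature from `model.py`, and the loss choice assumes the model's final layer emits log-probabilities:

```python
# Hypothetical training setup matching the README tables above.
import torch.nn as nn
import torch.optim as optim

from model import HAN  # the module added in this patch

net = HAN(input_size=200,         # word embedding dimension
          output_class_num=5,     # Yelp star ratings 1-5 (assumed)
          word_hidden_size=50,    # GRU hidden size, word level
          word_num_layers=1,      # GRU layers
          word_context_size=100,  # word context vector dimension
          sent_hidden_size=50,    # GRU hidden size, sentence level
          sent_num_layers=1,
          sent_context_size=100)  # sentence context vector dimension

# Training parameters from the second table: 3 epochs, lr 0.01, momentum 0.9, batch 64.
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.NLLLoss()  # assumes the model outputs log-softmax scores
num_epoch, batch_size = 3, 64
```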
diff --git a/model_inplement/evaluate.py b/reproduction/evaluate.py
similarity index 88%
rename from model_inplement/evaluate.py
rename to reproduction/evaluate.py
index 7384428a..1f10a9a2 100644
--- a/model_inplement/evaluate.py
+++ b/reproduction/evaluate.py
@@ -12,7 +12,6 @@ def evaluate(net, dataset, bactch_size=64, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec, volatile=True))
@@ -20,10 +19,6 @@
         if use_cuda:
             y = y.cuda()
         predicts = net(doc_list)
-        # idx = []
-        # for p in predicts.data:
-        #     idx.append(np.random.choice(5, p=torch.exp(p).numpy()))
-        # idx = torch.LongTensor(idx)
         p, idx = torch.max(predicts, dim=1)
         idx = idx.data
         count += torch.sum(torch.eq(idx, y))
diff --git a/model_inplement/model.py b/reproduction/model.py
similarity index 83%
rename from model_inplement/model.py
rename to reproduction/model.py
index e1e52f97..ce047f8e 100644
--- a/model_inplement/model.py
+++ b/reproduction/model.py
@@ -38,11 +38,9 @@ class HAN(nn.Module):
     def forward(self, batch_doc):
         # input is a sequence of matrix
         doc_vec_list = []
-        for doc in batch_doc:
-            # doc's dim (num_sent, seq_len, word_dim)
-            sent_mat = self.word_layer(doc)
-            # sent_mat's dim (num_sent, vec_dim)
-            doc_vec_list.append(sent_mat)
+        for doc in batch_doc:
+            sent_mat = self.word_layer(doc)  # doc's dim (num_sent, seq_len, word_dim)
+            doc_vec_list.append(sent_mat)  # sent_mat's dim (num_sent, vec_dim)
         doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
@@ -55,7 +53,6 @@ class AttentionNet(nn.Module):
         self.gru_hidden_size = gru_hidden_size
         self.gru_num_layers = gru_num_layers
         self.context_vec_size = context_vec_size
-        self.last_alpha = None
 
         # Encoder
         self.gru = nn.GRU(input_size=input_size,
@@ -72,18 +69,13 @@
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (batch_size, seq_len, word_dim)
         # GRU part
-        h_t, hidden = self.gru(inputs)
+        h_t, hidden = self.gru(inputs)  # inputs's dim (batch_size, seq_len, word_dim)
         u = self.tanh(self.fc(h_t))
         # Attention part
-        # u's dim (batch_size, seq_len, context_vec_size)
-        alpha = self.softmax(torch.matmul(u, self.context_vec))
-        self.last_alpha = alpha.data
-        # alpha's dim (batch_size, seq_len, 1)
-        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
-        # output's dim (batch_size, 2*hidden_size, 1)
-        return torch.squeeze(output, dim=2)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))  # u's dim (batch_size, seq_len, context_vec_size)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)  # alpha's dim (batch_size, seq_len, 1)
+        return torch.squeeze(output, dim=2)  # output's dim (batch_size, 2*hidden_size, 1)
 
 
 if __name__ == '__main__':
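The attention arithmetic in `AttentionNet.forward` is dense; as a reading aid, here is a self-contained sketch of the same computation with the shapes from the inline comments spelled out (all sizes are illustrative, and the random `context_vec` stands in for the learned parameter):

```python
# Standalone sketch of the attention step in AttentionNet.forward.
import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, seq_len = 4, 10
hidden_size, context_vec_size = 50, 100

h_t = torch.randn(batch_size, seq_len, 2 * hidden_size)  # bidirectional GRU outputs
fc = nn.Linear(2 * hidden_size, context_vec_size)
context_vec = torch.randn(context_vec_size, 1)            # learned context vector in the model

u = torch.tanh(fc(h_t))                                   # (batch, seq_len, context_vec_size)
alpha = F.softmax(torch.matmul(u, context_vec), dim=1)    # (batch, seq_len, 1) attention weights
output = torch.bmm(h_t.transpose(1, 2), alpha)            # (batch, 2*hidden_size, 1) weighted sum
print(output.squeeze(2).shape)                            # torch.Size([4, 100])
```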
diff --git a/reproduction/preprocess.py b/reproduction/preprocess.py
new file mode 100644
index 00000000..0de0d74e
--- /dev/null
+++ b/reproduction/preprocess.py
@@ -0,0 +1,50 @@
+'''
+    Tokenize the Yelp dataset's documents using Stanford CoreNLP.
+'''
+
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+import os
+
+input_filename = 'review.json'
+
+# config for Stanford CoreNLP
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+in_dirname = 'review'
+out_dirname = 'reviews'
+
+
+f = open(input_filename, encoding='utf-8')
+samples = []
+j = 0
+for i, line in enumerate(f.readlines()):
+    review = json.loads(line)
+    samples.append((review['stars'], review['text']))
+    if (i+1) % 5000 == 0:
+        print(i)
+        pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
+        j += 1
+        samples = []
+pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
+# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
+# print(samples[0])
+
+
+for fn in os.listdir(in_dirname):
+    print(fn)
+    processed = []
+    for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        processed.append((stars, tokens))
+        if len(processed) % 100 == 0:
+            print(len(processed))
+    pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb'))
+
diff --git a/model_inplement/train.py b/reproduction/train.py
similarity index 96%
rename from model_inplement/train.py
rename to reproduction/train.py
index a4a64b32..add570c1 100644
--- a/model_inplement/train.py
+++ b/reproduction/train.py
@@ -1,9 +1,6 @@
 import os
 import pickle
 
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-
 import nltk
 import numpy as np
 import torch
@@ -60,7 +57,6 @@ class YelpDocSet(Dataset):
         file_id = n // 5000
         idx = file_id % 5
         if self._cache[idx][0] != file_id:
-            # print('load {} to {}'.format(file_id, idx))
             with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
                 self._cache[idx] = (file_id, pickle.load(f))
         y, x = self._cache[idx][1][n % 5000]
@@ -90,7 +86,6 @@
                 vec = self.embedding.get_vec(word)
                 sent_vec.append(vec.tolist())
             sent_vec = torch.Tensor(sent_vec)
-            # print(sent_vec.size())
             doc.append(sent_vec)
         if len(doc) == 0:
             doc = [torch.zeros(1,200)]
@@ -124,7 +119,6 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
         for sample in x:
             doc = []
             for sent_vec in sample:
-                # print(sent_vec.size())
                 if use_cuda:
                     sent_vec = sent_vec.cuda()
                 doc.append(Variable(sent_vec))
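The shard cache in `YelpDocSet.__getitem__` (the `@@ -60,7 +57,6 @@` hunk above) relies on `preprocess.py` storing 5000 samples per pickle file. Its index arithmetic, isolated as a small illustrative helper that is not part of the repo:

```python
# Illustrative helper (not in the repo): how YelpDocSet maps a global
# sample index to a shard file, a cache slot, and an in-shard offset.
def locate(n, shard_size=5000, cache_slots=5):
    file_id = n // shard_size     # which samples<file_id>.pkl holds sample n
    slot = file_id % cache_slots  # the cache slot that shard is loaded into
    offset = n % shard_size       # position of the sample inside the shard
    return file_id, slot, offset

print(locate(12345))  # (2, 2, 2345): sample 12345 sits in samples2.pkl
```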