From 544ca8631b4ea13203005abeee4c27efc830e7a3 Mon Sep 17 00:00:00 2001
From: choocewhatulike <1901722105@qq.com>
Date: Mon, 12 Mar 2018 00:54:28 +0800
Subject: [PATCH] tokenize data

---
 .../code/__pycache__/model.cpython-36.pyc |  Bin 0 -> 2223 bytes
 model_inplement/code/model.py             |  63 ++++++++++++++----
 model_inplement/code/preprocess.py        |  42 ++++++++++++
 model_inplement/code/train.py             |   0
 4 files changed, 93 insertions(+), 12 deletions(-)
 create mode 100644 model_inplement/code/__pycache__/model.cpython-36.pyc
 create mode 100644 model_inplement/code/preprocess.py
 create mode 100644 model_inplement/code/train.py

diff --git a/model_inplement/code/__pycache__/model.cpython-36.pyc b/model_inplement/code/__pycache__/model.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e18ae7f3f1addfca6601d873620127d292e35aa
Binary files /dev/null and b/model_inplement/code/__pycache__/model.cpython-36.pyc differ
diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index f73cabe3..32ebdbf9 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()
 
-    def forward(self, x, level='w'):
+    def forward(self, doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknow level in Parameter!')
-
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec)
+        output = self.softmax(self.output_layer(doc_vec))
+        return output
 
 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs's dim (seq_len, word_dim)
         inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim (2*hidden_size, 1)
         return output
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+optimizer = torch.optim.SGD(lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:
+
diff --git a/model_inplement/code/preprocess.py b/model_inplement/code/preprocess.py
new file mode 100644
index 00000000..37f6eb25
--- /dev/null
+++ b/model_inplement/code/preprocess.py
@@ -0,0 +1,42 @@
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+import os
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    precessed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        precessed.append((stars, tokens))
+        # print(tokens)
+        if len(precessed) % 100 == 0:
+            print(len(precessed))
+    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
+
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
new file mode 100644
index 00000000..e69de29b
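
For reference, a minimal sketch (not part of the commit) of how the tokenized pickles written by preprocess.py could be read back: each file in the reviews/ directory holds a list of (stars, sentences) pairs, where sentences is a list of token lists. The exact file name reviews/samples0.pkl is an assumption inferred from the samples%d.pkl chunking used above.

    # Hypothetical inspection of preprocess.py output; assumes preprocess.py has
    # already been run and that 'reviews/samples0.pkl' exists.
    import os
    import pickle

    with open(os.path.join('reviews', 'samples0.pkl'), 'rb') as f:
        reviews = pickle.load(f)      # list of (stars, sentences) pairs

    stars, sentences = reviews[0]
    print(stars)                      # star label of the first review
    print(sentences[0][:10])          # first ten tokens of its first sentence

Before such a document can be fed to the new HAN.forward, each sentence would still need to be mapped to a (sentence_length, input_size) matrix of word vectors, since forward calls self.word_layer(sent) on every sentence; that embedding step (and train.py itself) is not included in this commit.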