From 544ca8631b4ea13203005abeee4c27efc830e7a3 Mon Sep 17 00:00:00 2001
From: choocewhatulike <1901722105@qq.com>
Date: Mon, 12 Mar 2018 00:54:28 +0800
Subject: [PATCH] tokenize data

---
 .../code/__pycache__/model.cpython-36.pyc |  Bin 0 -> 2223 bytes
 model_inplement/code/model.py             |  63 ++++++++++++++----
 model_inplement/code/preprocess.py        |  42 ++++++++++++
 model_inplement/code/train.py             |   0
 4 files changed, 93 insertions(+), 12 deletions(-)
 create mode 100644 model_inplement/code/__pycache__/model.cpython-36.pyc
 create mode 100644 model_inplement/code/preprocess.py
 create mode 100644 model_inplement/code/train.py

diff --git a/model_inplement/code/__pycache__/model.cpython-36.pyc b/model_inplement/code/__pycache__/model.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e18ae7f3f1addfca6601d873620127d292e35aa
Binary files /dev/null and b/model_inplement/code/__pycache__/model.cpython-36.pyc differ
diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index f73cabe3..32ebdbf9 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()
 
-    def forward(self, x, level='w'):
+    def forward(self, doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknow level in Parameter!')
-
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec)
+        output = self.softmax(self.output_layer(doc_vec))
+        return output
 
 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs's dim (seq_len, word_dim)
         inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim (2*hidden_size, 1)
         return output
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+optimizer = torch.optim.SGD(lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:
+
diff --git a/model_inplement/code/preprocess.py b/model_inplement/code/preprocess.py
new file mode 100644
index 00000000..37f6eb25
--- /dev/null
+++ b/model_inplement/code/preprocess.py
@@ -0,0 +1,42 @@
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+import os
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    precessed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        precessed.append((stars, tokens))
+        # print(tokens)
+        if len(precessed) % 100 == 0:
+            print(len(precessed))
+    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
+
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
new file mode 100644
index 00000000..e69de29b
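
For reference, a minimal sketch (not part of the commit) of how the tokenized pickles written by preprocess.py could be read back: each file in the reviews/ directory holds a list of (stars, sentences) pairs, where sentences is a list of token lists. The exact file name reviews/samples0.pkl is an assumption inferred from the samples%d.pkl chunking used above.

    # Hypothetical inspection of preprocess.py output; assumes preprocess.py has
    # already been run and that 'reviews/samples0.pkl' exists.
    import os
    import pickle

    with open(os.path.join('reviews', 'samples0.pkl'), 'rb') as f:
        reviews = pickle.load(f)      # list of (stars, sentences) pairs

    stars, sentences = reviews[0]
    print(stars)                      # star label of the first review
    print(sentences[0][:10])          # first ten tokens of its first sentence

Before such a document can be fed to the new HAN.forward, each sentence would still need to be mapped to a (sentence_length, input_size) matrix of word vectors, since forward calls self.word_layer(sent) on every sentence; that embedding step (and train.py itself) is not included in this commit.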