From 58ddc2d26759154c82dba54121992f94adfe9631 Mon Sep 17 00:00:00 2001
From: choocewhatulike <1901722105@qq.com>
Date: Mon, 12 Mar 2018 17:38:22 +0800
Subject: [PATCH] add train

---
 model_inplement/.gitignore                    |   1 +
 .../code/__pycache__/model.cpython-36.pyc     | Bin 2223 -> 2342 bytes
 model_inplement/code/model.py                 |  63 ++------
 model_inplement/code/train.py                 | 138 ++++++++++++++++++
 4 files changed, 153 insertions(+), 49 deletions(-)
 create mode 100644 model_inplement/.gitignore

diff --git a/model_inplement/.gitignore b/model_inplement/.gitignore
new file mode 100644
index 00000000..7e99e367
--- /dev/null
+++ b/model_inplement/.gitignore
@@ -0,0 +1 @@
+*.pyc
\ No newline at end of file
diff --git a/model_inplement/code/__pycache__/model.cpython-36.pyc b/model_inplement/code/__pycache__/model.cpython-36.pyc
index 2e18ae7f3f1addfca6601d873620127d292e35aa..167f6ff9d4401d8077c23d66909e6c96b6049a11 100644
GIT binary patch
(base85 binary delta data omitted)
diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index 32ebdbf9..a4c2e59b 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -20,16 +20,20 @@ class HAN(nn.Module):
                                     sent_num_layers, sent_context_size)
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
-        self.softmax = nn.Softmax()
+        self.softmax = nn.LogSoftmax(dim=1)
 
-    def forward(self, doc):
+    def forward(self, batch_doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        s_list = []
-        for sent in doc:
-            s_list.append(self.word_layer(sent))
-        s_vec = torch.cat(s_list, dim=1).t()
-        doc_vec = self.sent_layer(s_vec)
+        doc_vec_list = []
+        for doc in batch_doc:
+            s_list = []
+            for sent in doc:
+                s_list.append(self.word_layer(sent))
+            s_vec = torch.cat(s_list, dim=0)
+            vec = self.sent_layer(s_vec)
+            doc_vec_list.append(vec)
+        doc_vec = torch.cat(doc_vec_list, dim=0)
         output = self.softmax(self.output_layer(doc_vec))
         return output
 
@@ -51,7 +55,7 @@ class AttentionNet(nn.Module):
         # Attention
         self.fc = nn.Linear(2* gru_hidden_size, context_vec_size)
         self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax()
+        self.softmax = nn.Softmax(dim=0)
         # context vector
         self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
         self.context_vec.data.uniform_(-0.1, 0.1)
@@ -63,47 +67,8 @@
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
-        output = torch.mm(h_t.t(), alpha)
-        # output's dim (2*hidden_size, 1)
+        output = torch.mm(h_t.t(), alpha).t()
+        # output's dim (1, 2*hidden_size)
         return output
-
-'''
-Train process
-'''
-import math
-import os
-import copy
-import pickle
-
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-import numpy as np
-import json
-import nltk
-
-optimizer = torch.optim.SGD(lr=0.01)
-criterion = nn.NLLLoss()
-epoch = 1
-batch_size = 10
-
-net = HAN(input_size=100, output_size=5,
-          word_hidden_size=50, word_num_layers=1, word_context_size=100,
-          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
-
-def dataloader(filename):
-    samples = pickle.load(open(filename, 'rb'))
-    return samples
-
-def gen_doc(text):
-    pass
-
-class SampleDoc:
-    def __init__(self, doc, label):
-        self.doc = doc
-        self.label = label
-
-    def __iter__(self):
-        for sent in self.doc:
-            for word in sent:
-
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
index e69de29b..ae7ee925 100644
--- a/model_inplement/code/train.py
+++ b/model_inplement/code/train.py
@@ -0,0 +1,138 @@
+import gensim
+from gensim import models
+
+import os
+import pickle
+
+class SampleIter:
+    def __init__(self, dirname):
+        self.dirname = dirname
+
+    def __iter__(self):
+        for f in os.listdir(self.dirname):
+            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
+                yield x, y
+
+class SentIter:
+    def __init__(self, dirname, count):
+        self.dirname = dirname
+        self.count = int(count)
+
+    def __iter__(self):
+        for f in os.listdir(self.dirname)[:self.count]:
+            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
+                for sent in x:
+                    yield sent
+
+def train_word_vec():
+    # load data
+    dirname = 'reviews'
+    sents = SentIter(dirname, 238)
+    # define model and train
+    model = models.Word2Vec(sentences=sents, size=200, sg=0, workers=4, min_count=5)
+    model.save('yelp.word2vec')
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+from gensim.models import Word2Vec
+import torch
+from torch.utils.data import DataLoader, Dataset
+
+from model import *
+
+net = HAN(input_size=200, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
+criterion = nn.NLLLoss()
+num_epoch = 1
+batch_size = 64
+
+class Embedding_layer:
+    def __init__(self, wv, vector_size):
+        self.wv = wv
+        self.vector_size = vector_size
+
+    def get_vec(self, w):
+        try:
+            v = self.wv[w]
+        except KeyError as e:
+            v = np.zeros(self.vector_size)
+        return v
+
+embed_model = Word2Vec.load('yelp.word2vec')
+embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
+del embed_model
+
+class YelpDocSet(Dataset):
+    def __init__(self, dirname, num_files, embedding):
+        self.dirname = dirname
+        self.num_files = num_files
+        self._len = num_files*5000
+        self._files = os.listdir(dirname)[:num_files]
+        self.embedding = embedding
+
+    def __len__(self):
+        return self._len
+
+    def __getitem__(self, n):
+        file_id = n // 5000
+        sample_list = pickle.load(open(
+            os.path.join(self.dirname, self._files[file_id]), 'rb'))
+        y, x = sample_list[n % 5000]
+        return x, y-1
+
+def collate(iterable):
+    y_list = []
+    x_list = []
+    for x, y in iterable:
+        y_list.append(y)
+        x_list.append(x)
+    return x_list, torch.LongTensor(y_list)
+
+if __name__ == '__main__':
+    dirname = 'reviews'
+    dataloader = DataLoader(YelpDocSet(dirname, 238, embedding), batch_size=batch_size, collate_fn=collate)
+    running_loss = 0.0
+    print_size = 10
+
+    for epoch in range(num_epoch):
+        for i, batch_samples in enumerate(dataloader):
+            x, y = batch_samples
+            doc_list = []
+            for sample in x:
+                doc = []
+                for sent in sample:
+                    sent_vec = []
+                    for word in sent:
+                        vec = embedding.get_vec(word)
+                        sent_vec.append(torch.Tensor(vec.reshape((1, -1))))
+                    sent_vec = torch.cat(sent_vec, dim=0)
+                    # print(sent_vec.size())
+                    doc.append(Variable(sent_vec))
+                doc_list.append(doc)
+            y = Variable(y)
+            predict = net(doc_list)
+            loss = criterion(predict, y)
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            running_loss += loss.data[0]
+            print(loss.data[0])
+            if i % print_size == print_size-1:
+                print(running_loss/print_size)
+                running_loss = 0.0
+
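The model.py hunks above change AttentionNet so that the attention-pooled sentence vector comes back as a row vector of shape (1, 2*hidden_size), which is what the new HAN.forward concatenates along dim 0. The following is a minimal standalone sketch (not part of the patch) of that pooling step; the sequence length and the random context vector are illustrative assumptions, not values taken from the repository.

import torch
import torch.nn as nn

seq_len, hidden2, context_size = 7, 100, 100   # hidden2 = 2 * gru_hidden_size
h_t = torch.randn(seq_len, hidden2)            # GRU outputs for one sentence, already squeezed
fc = nn.Linear(hidden2, context_size)          # the attention projection (self.fc in the patch)
context_vec = torch.randn(context_size, 1)     # stand-in for self.context_vec

u = torch.tanh(fc(h_t))                        # (seq_len, context_size)
alpha = torch.softmax(u @ context_vec, dim=0)  # (seq_len, 1) attention weights over words
output = (h_t.t() @ alpha).t()                 # (1, hidden2) attention-pooled sentence vector
print(output.shape)                            # torch.Size([1, 100])

With word_hidden_size=50, as configured in train.py, 2*hidden_size is 100, matching the (1, 100) sentence vectors that HAN.forward stacks before applying the sentence-level attention layer.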