diff --git a/model_inplement/code/__pycache__/model.cpython-36.pyc b/model_inplement/code/__pycache__/model.cpython-36.pyc
new file mode 100644
index 00000000..2e18ae7f
Binary files /dev/null and b/model_inplement/code/__pycache__/model.cpython-36.pyc differ
diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index f73cabe3..32ebdbf9 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()
 
-    def forward(self, x, level='w'):
+    def forward(self, doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknow level in Parameter!')
-
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec)
+        output = self.softmax(self.output_layer(doc_vec.t()))
+        return output
 
 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs's dim (seq_len, word_dim)
        inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim (2*hidden_size, 1)
         return output
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:
+                pass
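The reworked HAN.forward above consumes a whole document at once: a list of per-sentence word-embedding matrices, each run through the word-level AttentionNet, concatenated, and fed to the sentence-level AttentionNet, whose (2*sent_hidden_size, 1) column output is transposed before the output layer so nn.Linear sees the feature dimension last. The train-process block appended to model.py sets up the loss and optimizer but stops short of an actual loop; below is a minimal forward/backward sketch against this interface, not part of the diff. The sentence sizes are made up, it assumes the class is importable as model.HAN, and the Softmax output is wrapped in torch.log because nn.NLLLoss expects log-probabilities.

    # Sketch only: exercises the new HAN.forward with fake data (all sizes are assumptions).
    import torch
    import torch.nn as nn
    from model import HAN

    net = HAN(input_size=100, output_size=5,
              word_hidden_size=50, word_num_layers=1, word_context_size=100,
              sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
    criterion = nn.NLLLoss()

    # One fake document: three sentences, each a (num_words, input_size) embedding matrix.
    doc = [torch.randn(12, 100), torch.randn(7, 100), torch.randn(9, 100)]
    label = torch.tensor([3])                  # gold class in 0..4 (e.g. a 4-star review)

    probs = net(doc)                           # (1, output_size) probabilities from the Softmax
    loss = criterion(torch.log(probs), label)  # NLLLoss expects log-probabilities
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()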
diff --git a/model_inplement/code/preprocess.py b/model_inplement/code/preprocess.py
new file mode 100644
index 00000000..37f6eb25
--- /dev/null
+++ b/model_inplement/code/preprocess.py
@@ -0,0 +1,42 @@
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+import os
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    precessed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        precessed.append((stars, tokens))
+        # print(tokens)
+        if len(precessed) % 100 == 0:
+            print(len(precessed))
+    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))
+
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
new file mode 100644
index 00000000..e69de29b
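preprocess.py stores each review as a (stars, tokens) pair, where tokens is a list of sentences and each sentence a list of word strings, and train.py is created empty (the index line shows the empty-blob hash e69de29b). Below is a rough sketch of the glue a future train.py would need to feed those pairs into HAN.forward; the embedding table, the doc_to_tensors helper, and the star-to-class mapping are illustrative assumptions, since this commit ships no word vectors.

    # Hypothetical glue between preprocess.py's output and HAN.forward; `embedding`
    # (a {word: 100-dim vector} dict) and the helper name are assumptions, not repo code.
    import pickle
    import torch

    def doc_to_tensors(tokens, embedding, dim=100):
        # Turn [[word, ...], ...] into the list of (num_words, dim) float tensors
        # that HAN.forward expects; unknown words fall back to zero vectors.
        doc = []
        for sent in tokens:
            vecs = [list(embedding.get(w.lower(), [0.0] * dim)) for w in sent]
            if vecs:                                 # skip empty sentences
                doc.append(torch.tensor(vecs, dtype=torch.float))
        return doc

    samples = pickle.load(open('reviews/samples0.pkl', 'rb'))  # [(stars, tokens), ...]
    stars, tokens = samples[0]
    doc = doc_to_tensors(tokens, embedding={})                  # placeholder embedding table
    label = torch.tensor([int(stars) - 1])                      # 1..5 stars -> classes 0..4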