
tokenize data

tags/v0.1.0
choocewhatulike, 6 years ago · commit 544ca8631b
4 changed files with 93 additions and 12 deletions
  1. BIN      model_inplement/code/__pycache__/model.cpython-36.pyc
  2. +51 -12  model_inplement/code/model.py
  3. +42 -0   model_inplement/code/preprocess.py
  4. +0 -0    model_inplement/code/train.py

BIN  model_inplement/code/__pycache__/model.cpython-36.pyc


+51 -12  model_inplement/code/model.py

@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()
 
-    def forward(self, x, level='w'):
+    def forward(self, doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknow level in Parameter!')
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec)
+        output = self.softmax(self.output_layer(doc_vec))
+        return output
 
 
 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
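The rewritten forward consumes a whole document at once: each sentence (a matrix of word vectors) is encoded by the word-level AttentionNet, the resulting sentence vectors are stacked, and the sentence-level AttentionNet plus the output layer map them to class scores. A rough sketch of the expected input layout, not part of this commit; the word-vector dimension (100) and the sentence lengths are assumptions:

import torch

# Hypothetical input for the new forward(doc): a list of per-sentence
# word-vector matrices; 100-dim vectors and 3 sentences are assumed here.
doc = [torch.randn(n_words, 100) for n_words in (7, 4, 9)]

# Inside forward, word_layer(sent) yields a (2*word_hidden_size, 1) column per
# sentence; torch.cat(s_list, dim=1).t() stacks them into a
# (num_sents, 2*word_hidden_size) matrix that sent_layer and output_layer consume.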
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs's dim (seq_len, word_dim)
         inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim (2*hidden_size, 1)
         return output
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+optimizer = torch.optim.SGD(lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:

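The training scaffold appended to model.py is still a stub: torch.optim.SGD is constructed without a parameter list (it requires one), gen_doc and SampleDoc.__iter__ have no bodies yet, and nn.NLLLoss expects log-probabilities rather than the softmax output the model returns. A minimal, self-contained sketch of how the optimizer and criterion are normally wired, using a hypothetical stand-in module rather than HAN:

import torch
import torch.nn as nn

# TinyClassifier is a stand-in for illustration, not part of the commit.
class TinyClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(100, 5)

    def forward(self, x):
        return torch.log_softmax(self.fc(x), dim=-1)  # NLLLoss wants log-probs

model = TinyClassifier()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # parameters are required
criterion = nn.NLLLoss()

doc_vec = torch.randn(1, 100)   # stand-in document representation
label = torch.tensor([3])       # star rating mapped to a class index 0..4

optimizer.zero_grad()
loss = criterion(model(doc_vec), label)
loss.backward()
optimizer.step()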
+42 -0  model_inplement/code/preprocess.py

@@ -0,0 +1,42 @@
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+import os
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    precessed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        precessed.append((stars, tokens))
+        # print(tokens)
+        if len(precessed) % 100 == 0:
+            print(len(precessed))
+    pickle.dump(precessed, open(os.path.join(dirname1, fn), 'wb'))

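preprocess.py splits each review into sentences with nltk.tokenize.sent_tokenize and then into words with the Stanford CoreNLP tokenizer, which depends on a local Java install and the CoreNLP jar. If that setup is unavailable, a swapped-in fallback that keeps the same (stars, [[token, ...], ...]) layout can be sketched with NLTK's own tokenizers; this is an assumption for illustration, not what the commit uses:

import nltk

nltk.download('punkt', quiet=True)  # sentence tokenizer models for the NLTK fallback

def tokenize_review(text):
    # one token list per sentence, mirroring the structure written to 'reviews/'
    return [nltk.word_tokenize(s) for s in nltk.sent_tokenize(text)]

print(tokenize_review("Great food. Friendly staff!"))
# [['Great', 'food', '.'], ['Friendly', 'staff', '!']]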
+0 -0  model_inplement/code/train.py

