
add train

Tag: tags/v0.1.0
Author: choocewhatulike (6 years ago)
Commit: 58ddc2d267

4 changed files with 153 additions and 49 deletions:
  1. model_inplement/.gitignore (+1, -0)
  2. model_inplement/code/__pycache__/model.cpython-36.pyc (BIN)
  3. model_inplement/code/model.py (+14, -49)
  4. model_inplement/code/train.py (+138, -0)

model_inplement/.gitignore (+1, -0)

@@ -0,0 +1 @@
*.pyc

model_inplement/code/__pycache__/model.cpython-36.pyc (BIN)


model_inplement/code/model.py (+14, -49)

@@ -20,16 +20,20 @@ class HAN(nn.Module):
                                        sent_num_layers,
                                        sent_context_size)
         self.output_layer = nn.Linear(2 * sent_hidden_size, output_size)
-        self.softmax = nn.Softmax()
+        self.softmax = nn.LogSoftmax(dim=1)

-    def forward(self, doc):
+    def forward(self, batch_doc):
         # input is a sequence of vectors
         # if level == w, a seq of words (a sent); if level == s, a seq of sents (a doc)
-        s_list = []
-        for sent in doc:
-            s_list.append(self.word_layer(sent))
-        s_vec = torch.cat(s_list, dim=1).t()
-        doc_vec = self.sent_layer(s_vec)
+        doc_vec_list = []
+        for doc in batch_doc:
+            s_list = []
+            for sent in doc:
+                s_list.append(self.word_layer(sent))
+            s_vec = torch.cat(s_list, dim=0)
+            vec = self.sent_layer(s_vec)
+            doc_vec_list.append(vec)
+        doc_vec = torch.cat(doc_vec_list, dim=0)
         output = self.softmax(self.output_layer(doc_vec))
         return output
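
The switch from nn.Softmax() to nn.LogSoftmax(dim=1) pairs the model with the
nn.NLLLoss criterion used in train.py, which expects log-probabilities rather
than probabilities. A minimal sketch of the pairing (shapes and values here are
illustrative, not taken from the commit):

import torch
import torch.nn as nn

logits = torch.randn(4, 5)            # (batch, classes), e.g. 5 star ratings
targets = torch.tensor([0, 2, 4, 1])  # gold class indices in 0..4

log_probs = nn.LogSoftmax(dim=1)(logits)
loss = nn.NLLLoss()(log_probs, targets)

# CrossEntropyLoss fuses LogSoftmax and NLLLoss and yields the same value.
assert torch.isclose(loss, nn.CrossEntropyLoss()(logits, targets))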

@@ -51,7 +55,7 @@ class AttentionNet(nn.Module):
         # Attention
         self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size)
         self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax()
+        self.softmax = nn.Softmax(dim=0)
         # context vector
         self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
         self.context_vec.data.uniform_(-0.1, 0.1)
@@ -63,47 +67,8 @@
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
-        output = torch.mm(h_t.t(), alpha)
-        # output's dim (2*hidden_size, 1)
+        output = torch.mm(h_t.t(), alpha).t()
+        # output's dim (1, 2*hidden_size)
         return output
-
-
-'''
-Train process
-'''
-import math
-import os
-import copy
-import pickle
-
-import matplotlib.pyplot as plt
-import matplotlib.ticker as ticker
-import numpy as np
-import json
-import nltk
-
-optimizer = torch.optim.SGD(lr=0.01)
-criterion = nn.NLLLoss()
-epoch = 1
-batch_size = 10
-
-net = HAN(input_size=100, output_size=5,
-          word_hidden_size=50, word_num_layers=1, word_context_size=100,
-          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
-
-def dataloader(filename):
-    samples = pickle.load(open(filename, 'rb'))
-    return samples
-
-def gen_doc(text):
-    pass
-
-class SampleDoc:
-    def __init__(self, doc, label):
-        self.doc = doc
-        self.label = label
-
-    def __iter__(self):
-        for sent in self.doc:
-            for word in sent:
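
The .t() added to the attention output is what lets the rewritten HAN.forward
concatenate per-sentence vectors along dim=0: each AttentionNet call now pools a
(seq_len, 2*hidden) matrix of GRU states into a single (1, 2*hidden) row. A shape
sketch with made-up sizes (7 steps; the hidden and context sizes mirror the
constructor arguments above but are assumptions here):

import torch
import torch.nn as nn

seq_len, hidden, ctx = 7, 50, 100
h_t = torch.randn(seq_len, 2 * hidden)           # bidirectional GRU states
fc = nn.Linear(2 * hidden, ctx)
context_vec = torch.randn(ctx, 1)

u = torch.tanh(fc(h_t))                          # (seq_len, ctx)
alpha = torch.softmax(u.mm(context_vec), dim=0)  # (seq_len, 1), weights sum to 1
output = h_t.t().mm(alpha).t()                   # (1, 2*hidden) attention-pooled row

assert output.shape == (1, 2 * hidden)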


model_inplement/code/train.py (+138, -0)

@@ -0,0 +1,138 @@
import gensim
from gensim import models

import os
import pickle

class SampleIter:
    '''Iterates (document, label) pairs from every pickle file in a directory.'''
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for f in os.listdir(self.dirname):
            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
                yield x, y

class SentIter:
    '''Iterates sentences (token lists) from the first `count` pickle files.'''
    def __init__(self, dirname, count):
        self.dirname = dirname
        self.count = int(count)

    def __iter__(self):
        for f in os.listdir(self.dirname)[:self.count]:
            for y, x in pickle.load(open(os.path.join(self.dirname, f), 'rb')):
                for sent in x:
                    yield sent

def train_word_vec():
    # load data
    dirname = 'reviews'
    sents = SentIter(dirname, 238)
    # define model and train
    model = models.Word2Vec(sentences=sents, size=200, sg=0, workers=4, min_count=5)
    model.save('yelp.word2vec')


'''
Train process
'''
import math
import os
import copy
import pickle

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import json
import nltk
from gensim.models import Word2Vec
import torch
import torch.nn as nn                 # needed for nn.NLLLoss below
from torch.autograd import Variable   # needed in the training loop
from torch.utils.data import DataLoader, Dataset

from model import *

net = HAN(input_size=200, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)

optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
criterion = nn.NLLLoss()
num_epoch = 1
batch_size = 64

class Embedding_layer:
    '''Looks up word vectors; out-of-vocabulary words map to the zero vector.'''
    def __init__(self, wv, vector_size):
        self.wv = wv
        self.vector_size = vector_size

    def get_vec(self, w):
        try:
            v = self.wv[w]
        except KeyError:
            v = np.zeros(self.vector_size)
        return v

embed_model = Word2Vec.load('yelp.word2vec')
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
del embed_model

class YelpDocSet(Dataset):
    '''Reads (label, document) samples from pickle files of 5000 samples each.'''
    def __init__(self, dirname, num_files, embedding):
        self.dirname = dirname
        self.num_files = num_files
        self._len = num_files * 5000
        self._files = os.listdir(dirname)[:num_files]
        self.embedding = embedding

    def __len__(self):
        return self._len

    def __getitem__(self, n):
        file_id = n // 5000
        sample_list = pickle.load(open(
            os.path.join(self.dirname, self._files[file_id]), 'rb'))
        y, x = sample_list[n % 5000]
        # shift labels from 1..5 to 0..4 for NLLLoss
        return x, y - 1

def collate(iterable):
    y_list = []
    x_list = []
    for x, y in iterable:
        y_list.append(y)
        x_list.append(x)
    return x_list, torch.LongTensor(y_list)

if __name__ == '__main__':
    dirname = 'reviews'
    dataloader = DataLoader(YelpDocSet(dirname, 238, embedding),
                            batch_size=batch_size, collate_fn=collate)
    running_loss = 0.0
    print_size = 10

    for epoch in range(num_epoch):
        for i, batch_samples in enumerate(dataloader):
            x, y = batch_samples
            doc_list = []
            for sample in x:
                # embed each word and stack the rows into one matrix per sentence
                doc = []
                for sent in sample:
                    sent_vec = []
                    for word in sent:
                        vec = embedding.get_vec(word)
                        sent_vec.append(torch.Tensor(vec.reshape((1, -1))))
                    sent_vec = torch.cat(sent_vec, dim=0)
                    doc.append(Variable(sent_vec))
                doc_list.append(doc)
            y = Variable(y)
            predict = net(doc_list)
            loss = criterion(predict, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.data[0]  # PyTorch 0.3-era scalar access
            print(loss.data[0])
            if i % print_size == print_size - 1:
                print(running_loss / print_size)
                running_loss = 0.0
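
From the way SampleIter, SentIter, and YelpDocSet unpack the pickles, each file
under reviews/ appears to hold a list of (label, document) pairs, where a document
is a list of sentences and a sentence is a list of token strings. A smoke-test
sketch that fabricates one such file (the file name and contents are invented, and
YelpDocSet hard-codes 5000 samples per file, so this toy file only exercises the
iterators):

import os
import pickle

os.makedirs('reviews', exist_ok=True)
fake_samples = [
    # (label in 1..5, document = list of sentences = lists of tokens)
    (5, [['great', 'food'], ['would', 'come', 'back']]),
    (1, [['terrible', 'service']]),
]
with open(os.path.join('reviews', 'part-000.pkl'), 'wb') as f:
    pickle.dump(fake_samples, f)

for doc, label in SampleIter('reviews'):
    print(label, doc)  # prints each fabricated review with its star label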
