diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index a4c2e59b..068eaa55 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -1,10 +1,23 @@
-import numpy
-
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
 import torch.nn.functional as F
 
+def pack_sequence(tensor_seq, padding_value=0.0):
+    if len(tensor_seq) <= 0:
+        return
+    length = [v.size(0) for v in tensor_seq]
+    max_len = max(length)
+    size = [len(tensor_seq), max_len]
+    size.extend(list(tensor_seq[0].size()[1:]))
+    ans = torch.Tensor(*size).fill_(padding_value)
+    if tensor_seq[0].data.is_cuda:
+        ans = ans.cuda()
+    ans = Variable(ans)
+    for i, v in enumerate(tensor_seq):
+        ans[i, :length[i], :] = v
+    return ans
+
 class HAN(nn.Module):
     def __init__(self, input_size, output_size,
                  word_hidden_size, word_num_layers, word_context_size,
@@ -23,17 +36,14 @@ class HAN(nn.Module):
         self.softmax = nn.LogSoftmax(dim=1)
 
     def forward(self, batch_doc):
-        # input is a sequence of vector
-        # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
+        # input is a sequence of matrix
         doc_vec_list = []
         for doc in batch_doc:
-            s_list = []
-            for sent in doc:
-                s_list.append(self.word_layer(sent))
-            s_vec = torch.cat(s_list, dim=0)
-            vec = self.sent_layer(s_vec)
-            doc_vec_list.append(vec)
-        doc_vec = torch.cat(doc_vec_list, dim=0)
+            # doc's dim (num_sent, seq_len, word_dim)
+            sent_mat = self.word_layer(doc)
+            # sent_mat's dim (num_sent, vec_dim)
+            doc_vec_list.append(sent_mat)
+        doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
 
@@ -50,25 +60,52 @@ class AttentionNet(nn.Module):
         self.gru = nn.GRU(input_size=input_size,
                           hidden_size=gru_hidden_size,
                           num_layers=gru_num_layers,
-                          batch_first=False,
+                          batch_first=True,
                           bidirectional=True)
 
         # Attention
         self.fc = nn.Linear(2* gru_hidden_size, context_vec_size)
         self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax(dim=0)
+        self.softmax = nn.Softmax(dim=1)
 
         # context vector
         self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (seq_len, word_dim)
-        inputs = torch.unsqueeze(inputs, 1)
+        # inputs's dim (batch_size, seq_len, word_dim)
         h_t, hidden = self.gru(inputs)
-        h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
-        alpha = self.softmax(torch.mm(u, self.context_vec))
-        output = torch.mm(h_t.t(), alpha).t()
-        # output's dim (1, 2*hidden_size)
-        return output
+        # u's dim (batch_size, seq_len, context_vec_size)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))
+        # alpha's dim (batch_size, seq_len, 1)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
+        # output's dim (batch_size, 2*hidden_size, 1)
+        return torch.squeeze(output, dim=2)
+
+if __name__ == '__main__':
+    import numpy as np
+    use_cuda = True
+    net = HAN(input_size=200, output_size=5,
+              word_hidden_size=50, word_num_layers=1, word_context_size=100,
+              sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+    optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
+    criterion = nn.NLLLoss()
+    test_time = 10
+    batch_size = 64
+    if use_cuda:
+        net.cuda()
+    print('test training')
+    for step in range(test_time):
+        x_data = [torch.randn(np.random.randint(1,10), 200, 200) for i in range(batch_size)]
+        y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)])
+        if use_cuda:
+            x_data = [x_i.cuda() for x_i in x_data]
+            y_data = y_data.cuda()
+        x = [Variable(x_i) for x_i in x_data]
+        y = Variable(y_data)
+        predict = net(x)
+        loss = criterion(predict, y)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        print(loss.data[0])
\ No newline at end of file
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
index 13fdd784..1533a34a 100644
--- a/model_inplement/code/train.py
+++ b/model_inplement/code/train.py
@@ -136,7 +136,7 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
             if use_cuda:
                 sent_vec = sent_vec.cuda()
             doc.append(Variable(sent_vec))
-            doc_list.append(doc)
+            doc_list.append(pack_sequence(doc))
         if use_cuda:
             y = y.cuda()
         y = Variable(y)
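
Note (not part of the patch): a minimal sketch of how the new pack_sequence helper is expected to behave, assuming model.py is importable as `model` and the same PyTorch 0.3-style Variable API used in the diff. It right-pads a list of variable-length matrices into a single batch tensor, which is what train.py now builds per document before the batched AttentionNet forward pass.

import torch
from torch.autograd import Variable
from model import pack_sequence   # assumes model.py is on the import path

# Three "documents" whose sentence-vector matrices have different numbers of rows.
docs = [Variable(torch.randn(n, 4)) for n in (2, 5, 3)]

batch = pack_sequence(docs)   # shorter documents are right-padded with 0.0
print(batch.size())           # torch.Size([3, 5, 4]) -> (batch, max_len, vec_dim)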