diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index a4c2e59b..068eaa55 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -1,10 +1,23 @@
-import numpy
-
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
 import torch.nn.functional as F
 
+def pack_sequence(tensor_seq, padding_value=0.0):
+    if len(tensor_seq) <= 0:
+        return
+    length = [v.size(0) for v in tensor_seq]
+    max_len = max(length)
+    size = [len(tensor_seq), max_len]
+    size.extend(list(tensor_seq[0].size()[1:]))
+    ans = torch.Tensor(*size).fill_(padding_value)
+    if tensor_seq[0].data.is_cuda:
+        ans = ans.cuda()
+    ans = Variable(ans)
+    for i, v in enumerate(tensor_seq):
+        ans[i, :length[i], :] = v
+    return ans
+
 class HAN(nn.Module):
     def __init__(self, input_size, output_size,
                  word_hidden_size, word_num_layers, word_context_size,
@@ -23,17 +36,14 @@ class HAN(nn.Module):
         self.softmax = nn.LogSoftmax(dim=1)
 
     def forward(self, batch_doc):
-        # input is a sequence of vector
-        # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
+        # input is a sequence of matrix
         doc_vec_list = []
         for doc in batch_doc:
-            s_list = []
-            for sent in doc:
-                s_list.append(self.word_layer(sent))
-            s_vec = torch.cat(s_list, dim=0)
-            vec = self.sent_layer(s_vec)
-            doc_vec_list.append(vec)
-        doc_vec = torch.cat(doc_vec_list, dim=0)
+            # doc's dim (num_sent, seq_len, word_dim)
+            sent_mat = self.word_layer(doc)
+            # sent_mat's dim (num_sent, vec_dim)
+            doc_vec_list.append(sent_mat)
+        doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
         output = self.softmax(self.output_layer(doc_vec))
         return output
 
@@ -50,25 +60,52 @@ class AttentionNet(nn.Module):
         self.gru = nn.GRU(input_size=input_size,
                           hidden_size=gru_hidden_size,
                           num_layers=gru_num_layers,
-                          batch_first=False,
+                          batch_first=True,
                           bidirectional=True)
 
         # Attention
         self.fc = nn.Linear(2* gru_hidden_size, context_vec_size)
         self.tanh = nn.Tanh()
-        self.softmax = nn.Softmax(dim=0)
+        self.softmax = nn.Softmax(dim=1)
 
         # context vector
         self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim (seq_len, word_dim)
-        inputs = torch.unsqueeze(inputs, 1)
+        # inputs's dim (batch_size, seq_len, word_dim)
         h_t, hidden = self.gru(inputs)
-        h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
-        alpha = self.softmax(torch.mm(u, self.context_vec))
-        output = torch.mm(h_t.t(), alpha).t()
-        # output's dim (1, 2*hidden_size)
-        return output
+        # u's dim (batch_size, seq_len, context_vec_size)
+        alpha = self.softmax(torch.matmul(u, self.context_vec))
+        # alpha's dim (batch_size, seq_len, 1)
+        output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)
+        # output's dim (batch_size, 2*hidden_size, 1)
+        return torch.squeeze(output, dim=2)
+
+if __name__ == '__main__':
+    import numpy as np
+    use_cuda = True
+    net = HAN(input_size=200, output_size=5,
+              word_hidden_size=50, word_num_layers=1, word_context_size=100,
+              sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+    optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
+    criterion = nn.NLLLoss()
+    test_time = 10
+    batch_size = 64
+    if use_cuda:
+        net.cuda()
+    print('test training')
+    for step in range(test_time):
+        x_data = [torch.randn(np.random.randint(1,10), 200, 200) for i in range(batch_size)]
+        y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)])
+        if use_cuda:
+            x_data = [x_i.cuda() for x_i in x_data]
+            y_data = y_data.cuda()
+        x = [Variable(x_i) for x_i in x_data]
+        y = Variable(y_data)
+        predict = net(x)
+        loss = criterion(predict, y)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        print(loss.data[0])
\ No newline at end of file
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
index 13fdd784..1533a34a 100644
--- a/model_inplement/code/train.py
+++ b/model_inplement/code/train.py
@@ -136,7 +136,7 @@ def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
             if use_cuda:
                 sent_vec = sent_vec.cuda()
             doc.append(Variable(sent_vec))
-            doc_list.append(doc)
+            doc_list.append(pack_sequence(doc))
         if use_cuda:
             y = y.cuda()
         y = Variable(y)
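
Note (not part of the patch): a minimal sketch of how the new pack_sequence helper is expected to behave, assuming model.py is importable as `model` and the same PyTorch 0.3-style Variable API used in the diff. It right-pads a list of variable-length matrices into a single batch tensor, which is what train.py now builds per document before the batched AttentionNet forward pass.

import torch
from torch.autograd import Variable
from model import pack_sequence   # assumes model.py is on the import path

# Three "documents" whose sentence-vector matrices have different numbers of rows.
docs = [Variable(torch.randn(n, 4)) for n in (2, 5, 3)]

batch = pack_sequence(docs)   # shorter documents are right-padded with 0.0
print(batch.size())           # torch.Size([3, 5, 4]) -> (batch, max_len, vec_dim)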