From b93cf0869122058dece64d732ba8128f1deca460 Mon Sep 17 00:00:00 2001
From: HENRY L
Date: Mon, 2 Jul 2018 01:40:17 +0800
Subject: [PATCH 1/3] initial commit

---
 fastNLP/modules/prototype/Word2Idx.py    |  62 +++++++++++++
 fastNLP/modules/prototype/aggregation.py |  41 +++++++++
 fastNLP/modules/prototype/dataloader.py  |  82 +++++++++++++++++
 fastNLP/modules/prototype/embedding.py   |  23 +++++
 fastNLP/modules/prototype/encoder.py     |  25 ++++++
 fastNLP/modules/prototype/example.py     | 108 +++++++++++++++++++++++
 fastNLP/modules/prototype/predict.py     |  25 ++++++
 7 files changed, 366 insertions(+)
 create mode 100644 fastNLP/modules/prototype/Word2Idx.py
 create mode 100644 fastNLP/modules/prototype/aggregation.py
 create mode 100644 fastNLP/modules/prototype/dataloader.py
 create mode 100644 fastNLP/modules/prototype/embedding.py
 create mode 100644 fastNLP/modules/prototype/encoder.py
 create mode 100644 fastNLP/modules/prototype/example.py
 create mode 100644 fastNLP/modules/prototype/predict.py

diff --git a/fastNLP/modules/prototype/Word2Idx.py b/fastNLP/modules/prototype/Word2Idx.py
new file mode 100644
index 00000000..544126be
--- /dev/null
+++ b/fastNLP/modules/prototype/Word2Idx.py
@@ -0,0 +1,62 @@
+import collections
+import pickle
+
+class Word2Idx():
+    """
+    Build a word index according to word frequency.
+    If "min_freq" is given, then only words with a frequency not less than min_freq will be kept.
+    If "max_num" is given, then at most the most frequent $max_num words will be kept.
+    "words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
+
+    num is the size of the lookup table.
+    w2i is a lookup table assigning each word an index.
+    Note that index 0 will be returned for any unregistered words.
+    i2w is a vector which serves as an inverse mapping of w2i.
+    Token "" will be returned for index 0
+    e.g. i2w[w2i["word"]] == "word"
+    """
+    def __init__(self):
+        self.__w2i = dict()
+        self.__i2w = []
+        self.num = 0
+
+    def build(self, words, min_freq=0, max_num=None):
+        """build a model from words"""
+        counter = collections.Counter(words)
+        word_set = set(words)
+        if max_num is not None:
+            most_common = counter.most_common(min(len(word_set), max_num - 1))
+        else:
+            most_common = counter.most_common()
+        self.__w2i = dict((w[0],i + 1) for i,w in enumerate(most_common) if w[1] >= min_freq)
+        self.__w2i[""] = 0
+        self.__i2w = [""] + [ w[0] for w in most_common if w[1] >= min_freq ]
+        self.num = len(self.__i2w)
+
+    def w2i(self,word):
+        """word to index"""
+        if word in self.__w2i:
+            return self.__w2i[word]
+        return 0
+
+    def i2w(self,idx):
+        """index to word"""
+        if idx >= self.num:
+            raise Exception("out of range\n")
+        return self.__i2w[idx]
+
+    def save(self,addr):
+        """save the model to a file with address "addr" """
+        f = open(addr,"wb")
+        pickle.dump([self.__i2w, self.__w2i, self.num], f)
+        f.close()
+
+    def load(self,addr):
+        """load a model from a file with address "addr" """
+        f = open(addr,"rb")
+        paras = pickle.load(f)
+        self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]
+        f.close()
+
+
+
diff --git a/fastNLP/modules/prototype/aggregation.py b/fastNLP/modules/prototype/aggregation.py
new file mode 100644
index 00000000..e87862b8
--- /dev/null
+++ b/fastNLP/modules/prototype/aggregation.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+class Selfattention(nn.Module):
+    """
+    Self Attention Module.
+
+    Args:
+    input_size : the size of the input vector
+    d_a : the width of the weight matrix
+    r : the number of encoded vectors
+    """
+    def __init__(self, input_size, d_a, r):
+        super(Selfattention, self).__init__()
+        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
+        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
+        self.softmax = nn.Softmax(dim=2)
+        self.tanh = nn.Tanh()
+
+    def penalization(self, A):
+        """
+        compute the penalization term for the attention module
+        """
+        if self.W_s1.is_cuda:
+            I = Variable(torch.eye(A.size(1)).cuda(), requires_grad=False)
+        else:
+            I = Variable(torch.eye(A.size(1)), requires_grad=False)
+        M = torch.matmul(A, torch.transpose(A, 1, 2)) - I
+        M = M.view(M.size(0), -1)
+        return torch.sum(M ** 2, dim=1)
+
+    def forward(self, x):
+        inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
+        A = self.softmax(torch.matmul(self.W_s2, inter))
+        out = torch.matmul(A, H)
+        out = out.view(out.size(0), -1)
+        penalty = self.penalization(A)
+        return out, penalty
+
+if __name__ == "__main__":
+    model = Selfattention(100, 10, 20)
diff --git a/fastNLP/modules/prototype/dataloader.py b/fastNLP/modules/prototype/dataloader.py
new file mode 100644
index 00000000..a7eafdc2
--- /dev/null
+++ b/fastNLP/modules/prototype/dataloader.py
@@ -0,0 +1,82 @@
+import random
+import pickle
+import torch
+import numpy as np
+from torch.autograd import Variable
+
+def float_wrapper(x, requires_grad=True, using_cuda=True):
+    """
+    transform a float-type list into a pytorch Variable
+    """
+    if using_cuda==True:
+        return Variable(torch.FloatTensor(x).cuda(), requires_grad=requires_grad)
+    else:
+        return Variable(torch.FloatTensor(x), requires_grad=requires_grad)
+
+def long_wrapper(x, requires_grad=True, using_cuda=True):
+    """
+    transform a long-type list into a pytorch Variable
+    """
+    if using_cuda==True:
+        return Variable(torch.LongTensor(x).cuda(), requires_grad=requires_grad)
+    else:
+        return Variable(torch.LongTensor(x), requires_grad=requires_grad)
+
+def pad(X, using_cuda):
+    """
+    zero-pad sequences to the same length, then stack them together
+    """
+    maxlen = max([x.size(0) for x in X])
+    Y = []
+    for x in X:
+        padlen = maxlen - x.size(0)
+        if padlen > 0:
+            if using_cuda:
+                paddings = torch.zeros(padlen).cuda()
+            else:
+                paddings = torch.zeros(padlen)
+            x_ = torch.cat(x, paddings)
+            Y.append(x_)
+        else:
+            Y.append(x)
+    return torch.stack(Y)
+
+class DataLoader(object):
+    """
+    load data in the form {"feature", "class"}
+
+    Args:
+    fdir : data file address
+    batch_size : batch_size
+    shuffle : if True, shuffle the dataset every epoch
+    using_cuda : if True, return tensors on GPU
+    """
+    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
+        with open(fdir, "rb") as f:
+            self.data = pickle.load(f)
+        self.batch_size = batch_size
+        self.num = len(self.data)
+        self.count = 0
+        self.iters = int(self.num / batch_size)
+        self.shuffle = shuffle
+        self.using_cuda = using_cuda
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.count == self.iters:
+            self.count = 0
+            if self.shuffle:
+                random.shuffle(self.data)
+            raise StopIteration()
+        else:
+            X = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
+            self.count += 1
+            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda) for x in X]
+            X = pad(X, self.using_cuda)
+            y = [long_wrapper(x["class"], using_cuda=self.using_cuda) for x in X]
+            y = torch.stack(y)
+            return {"feature" : X, "class" : y}
+
+
diff --git a/fastNLP/modules/prototype/embedding.py b/fastNLP/modules/prototype/embedding.py
new file mode 100644
index 00000000..1ee88a92
--- /dev/null
+++ b/fastNLP/modules/prototype/embedding.py
@@ -0,0 +1,23 @@
+import torch
+import torch.nn as nn
+
+class Lookuptable(nn.Module):
+    """
+    A simple lookup table
+
+    Args:
+    nums : the size of the lookup table
+    dims : the size of each vector
+    padding_idx : pads the tensor with zeros whenever it encounters this index
+    sparse : If True, the gradient matrix will be a sparse tensor. In this case,
+        only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
+    """
+    def __init__(self, nums, dims, padding_idx=0, sparse=False):
+        super(Lookuptable, self).__init__()
+        self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse)
+
+    def forward(self, x):
+        return self.embed(x)
+
+if __name__ == "__main__":
+    model = Lookuptable(10, 20)
diff --git a/fastNLP/modules/prototype/encoder.py b/fastNLP/modules/prototype/encoder.py
new file mode 100644
index 00000000..249eaf8c
--- /dev/null
+++ b/fastNLP/modules/prototype/encoder.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+class Lstm(nn.Module):
+    """
+    LSTM module
+
+    Args:
+    input_size : input size
+    hidden_size : hidden size
+    num_layers : number of hidden layers
+    dropout : dropout rate
+    bidirectional : If True, becomes a bidirectional RNN
+    """
+    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
+        super(Lstm, self).__init__()
+        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=True, batch_first=True,\
+            dropout=dropout, bidirectional=bidirectional)
+
+    def forward(self, x):
+        x, _ = self.lstm(x)
+        return x
+
+if __name__ == "__main__":
+    model = Lstm(20, 30, 1, 0.5, False)
diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
new file mode 100644
index 00000000..9dffc59a
--- /dev/null
+++ b/fastNLP/modules/prototype/example.py
@@ -0,0 +1,108 @@
+import torch
+import torch.nn as nn
+import encoder
+import aggregation
+import embedding
+import predict
+import torch.optim as optim
+import time
+import dataloader
+
+WORD_SIZE = 100
+HIDDEN_SIZE = 300
+D_A = 350
+R = 20
+MLP_HIDDEN = 2000
+CLASSES_NUM = 5
+WORD_NUM = 357361
+
+class Net(nn.Module):
+    """
+    A model for sentiment analysis using LSTM and self-attention
+    """
+    def __init__(self):
+        super(Net, self).__init__()
+        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
+        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
+        self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R)
+        self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM)
+
+    def forward(self, x):
+        x = self.embedding(x)
+        x = self.encoder(x)
+        x, penalty = self.aggregation(x)
+        x = self.predict(x)
+        return r, x
+
+def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
+          momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
+    """
+    training procedure
+
+    Args:
+    If model_dict is given (a file address), it will continue training on the given model.
+    Otherwise, it will train a new model from scratch.
+    If using_cuda is True, the training will be conducted on GPU.
+    learning_rate and momentum are for the SGD optimizer.
+    coef is the coefficient balancing the cross-entropy loss and the penalization term.
+    interval is the reporting frequency.
+
+    the result will be saved as "model_dict_" + current time, which can be used for further training
+    """
+
+    if using_cuda == True:
+        net = Net().cuda()
+    else:
+        net = Net()
+
+    if model_dict != None:
+        net.load_state_dict(torch.load(model_dict))
+
+    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
+    criterion = nn.CrossEntropyLoss()
+    dataset = dataloader.DataLoader("trainset.pkl", using_cuda=using_cuda)
+
+    #statistics
+    loss_count = 0
+    prepare_time = 0
+    run_time = 0
+    count = 0
+
+    for epoch in range(epochs):
+        for i, batch in enumerate(dataset):
+            t1 = time.time()
+            X = batch["feature"]
+            y = batch["class"]
+
+            t2 = time.time()
+            y_pred, y_penl = net(X)
+            loss = criterion(y_pred, y) + torch.sum(y_penl) / batch_size * coef
+            optimizer.zero_grad()
+            loss.backward()
+            nn.utils.clip_grad_norm(net.parameters(), 0.5)
+            optimizer.step()
+            t3 = time.time()
+
+            loss_count += torch.sum(y_penl).data[0]
+            prepare_time += (t2 - t1)
+            run_time += (t3 - t2)
+            p, idx = torch.max(y_pred, dim=1)
+            idx = idx.data
+            count += torch.sum(torch.eq(idx.cpu(), y))
+
+            if i % interval == 0:
+                print(i)
+                print("loss count:" + str(loss_count / batch_size))
+                print("accuracy:" + str(count / batch_size))
+                print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
+                print("prepare time:" + str(prepare_time / batch_size))
+                print("run time:" + str(run_time / batch_size))
+                prepare_time = 0
+                run_time = 0
+                loss_count = 0
+                count = 0
+    torch.save(net.state_dict(), "model_dict_%s.pkl"%(str(time.time())))
+
+if __name__ == "__main__":
+    train(using_cuda=torch.cuda.is_available())
+
diff --git a/fastNLP/modules/prototype/predict.py b/fastNLP/modules/prototype/predict.py
new file mode 100644
index 00000000..c8e72629
--- /dev/null
+++ b/fastNLP/modules/prototype/predict.py
@@ -0,0 +1,25 @@
+import torch
+import torch.nn as nn
+
+class MLP(nn.Module):
+    """
+    A two-layer perceptron for classification.
+
+    Output : Unnormalized probability distribution
+    Args:
+    input_size : the size of input
+    hidden_size : the size of hidden layer
+    output_size : the size of output
+    """
+    def __init__(self, input_size, hidden_size, output_size):
+        super(MLP,self).__init__()
+        self.L1 = nn.Linear(input_size, hidden_size)
+        self.L2 = nn.Linear(hidden_size, output_size)
+        self.softmax = nn.Softmax(dim=1)
+
+    def forward(self, x):
+        out = self.L2(F.relu(self.L1(x)))
+        return out
+
+if __name__ == "__main__":
+    MLP(20, 30, 20)
\ No newline at end of file
From 561305e03d51eb9209300fb21a32f7b5c0560ff8 Mon Sep 17 00:00:00 2001
From: HENRY L
Date: Mon, 2 Jul 2018 02:06:33 +0800
Subject: [PATCH 2/3] update and add readme

---
 fastNLP/modules/prototype/README.md      | 41 +++++++++++++++++++
 fastNLP/modules/prototype/Word2Idx.py    | 19 ++++-----
 fastNLP/modules/prototype/aggregation.py |  5 +--
 fastNLP/modules/prototype/dataloader.py  | 13 +++---
 fastNLP/modules/prototype/encoder.py     |  3 --
 fastNLP/modules/prototype/example.py     | 51 +++++++++++++++-------
 fastNLP/modules/prototype/predict.py     |  2 +-
 fastNLP/modules/prototype/prepare.py     | 50 +++++++++++++++++++++++
 8 files changed, 146 insertions(+), 38 deletions(-)
 create mode 100644 fastNLP/modules/prototype/README.md
 create mode 100644 fastNLP/modules/prototype/prepare.py

diff --git a/fastNLP/modules/prototype/README.md b/fastNLP/modules/prototype/README.md
new file mode 100644
index 00000000..2dff7caa
--- /dev/null
+++ b/fastNLP/modules/prototype/README.md
@@ -0,0 +1,41 @@
+# Prototype
+
+## Word2Idx.py
+A mapping model between words and indexes
+
+## embedding.py
+embedding modules
+
+Contains a simple encapsulation of torch.nn.Embedding
+
+## encoder.py
+encoder modules
+
+Contains a simple encapsulation of torch.nn.LSTM
+
+## aggregation.py
+aggregation modules
+
+Contains a self-attention model, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130
+
+## predict.py
+predict modules
+
+Contains a two-layer perceptron for classification
+
+## example.py
+An example showing how to use the above modules to build a model
+
+Contains a model for sentiment analysis on the Yelp dataset, and its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.
+
+## prepare.py
+An example of using Word2Idx to build the Yelp datasets
+
+## dataloader.py
+A dataloader for the Yelp dataset
+
+It is an iterable object, returning a zero-padded batch every iteration.
+
+
+
+
diff --git a/fastNLP/modules/prototype/Word2Idx.py b/fastNLP/modules/prototype/Word2Idx.py
index 544126be..2499aeae 100644
--- a/fastNLP/modules/prototype/Word2Idx.py
+++ b/fastNLP/modules/prototype/Word2Idx.py
@@ -4,15 +4,15 @@ import pickle
 class Word2Idx():
     """
     Build a word index according to word frequency.
+
     If "min_freq" is given, then only words with a frequency not less than min_freq will be kept.
     If "max_num" is given, then at most the most frequent $max_num words will be kept.
     "words" should be a list [ w_1,w_2,...,w_i,...,w_n ] where each w_i is a string representing a word.
 
-    num is the size of the lookup table.
     w2i is a lookup table assigning each word an index.
-    Note that index 0 will be returned for any unregistered words.
     i2w is a vector which serves as an inverse mapping of w2i.
-    Token "" will be returned for index 0
+    Note that index 0 is token "<pad>" for padding
+    index 1 is token "<unk>" for unregistered words
     e.g. i2w[w2i["word"]] == "word"
     """
     def __init__(self):
@@ -29,29 +29,30 @@ class Word2Idx():
         else:
             most_common = counter.most_common()
         self.__w2i = dict((w[0],i + 1) for i,w in enumerate(most_common) if w[1] >= min_freq)
-        self.__w2i[""] = 0
-        self.__i2w = [""] + [ w[0] for w in most_common if w[1] >= min_freq ]
+        self.__w2i["<pad>"] = 0
+        self.__w2i["<unk>"] = 1
+        self.__i2w = ["<pad>", "<unk>"] + [ w[0] for w in most_common if w[1] >= min_freq ]
         self.num = len(self.__i2w)
 
-    def w2i(self,word):
+    def w2i(self, word):
         """word to index"""
         if word in self.__w2i:
             return self.__w2i[word]
         return 0
 
-    def i2w(self,idx):
+    def i2w(self, idx):
         """index to word"""
         if idx >= self.num:
             raise Exception("out of range\n")
         return self.__i2w[idx]
 
-    def save(self,addr):
+    def save(self, addr):
         """save the model to a file with address "addr" """
         f = open(addr,"wb")
         pickle.dump([self.__i2w, self.__w2i, self.num], f)
         f.close()
 
-    def load(self,addr):
+    def load(self, addr):
         """load a model from a file with address "addr" """
         f = open(addr,"rb")
         paras = pickle.load(f)
diff --git a/fastNLP/modules/prototype/aggregation.py b/fastNLP/modules/prototype/aggregation.py
index e87862b8..59e50e99 100644
--- a/fastNLP/modules/prototype/aggregation.py
+++ b/fastNLP/modules/prototype/aggregation.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+from torch.autograd import Variable
 
 class Selfattention(nn.Module):
     """
@@ -32,10 +33,8 @@ class Selfattention(nn.Module):
     def forward(self, x):
         inter = self.tanh(torch.matmul(self.W_s1, torch.transpose(x, 1, 2)))
         A = self.softmax(torch.matmul(self.W_s2, inter))
-        out = torch.matmul(A, H)
+        out = torch.matmul(A, x)
         out = out.view(out.size(0), -1)
         penalty = self.penalization(A)
         return out, penalty
 
-if __name__ == "__main__":
-    model = Selfattention(100, 10, 20)
diff --git a/fastNLP/modules/prototype/dataloader.py b/fastNLP/modules/prototype/dataloader.py
index a7eafdc2..af5cd8b8 100644
--- a/fastNLP/modules/prototype/dataloader.py
+++ b/fastNLP/modules/prototype/dataloader.py
@@ -32,10 +32,10 @@ def pad(X, using_cuda):
         padlen = maxlen - x.size(0)
         if padlen > 0:
             if using_cuda:
-                paddings = torch.zeros(padlen).cuda()
+                paddings = Variable(torch.zeros(padlen).long()).cuda()
             else:
-                paddings = torch.zeros(padlen)
-            x_ = torch.cat(x, paddings)
+                paddings = Variable(torch.zeros(padlen).long())
+            x_ = torch.cat((x, paddings), 0)
             Y.append(x_)
         else:
             Y.append(x)
@@ -71,12 +71,11 @@ class DataLoader(object):
                 random.shuffle(self.data)
             raise StopIteration()
         else:
-            X = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
+            batch = self.data[self.count * self.batch_size : (self.count + 1) * self.batch_size]
             self.count += 1
-            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda) for x in X]
+            X = [long_wrapper(x["sent"], using_cuda=self.using_cuda, requires_grad=False) for x in batch]
             X = pad(X, self.using_cuda)
-            y = [long_wrapper(x["class"], using_cuda=self.using_cuda) for x in X]
-            y = torch.stack(y)
+            y = long_wrapper([x["class"] for x in batch], using_cuda=self.using_cuda, requires_grad=False)
             return {"feature" : X, "class" : y}
 
 
diff --git a/fastNLP/modules/prototype/encoder.py b/fastNLP/modules/prototype/encoder.py
index 249eaf8c..142496e1 100644
--- a/fastNLP/modules/prototype/encoder.py
+++ b/fastNLP/modules/prototype/encoder.py
@@ -20,6 +20,3 @@ class Lstm(nn.Module):
     def forward(self, x):
         x, _ = self.lstm(x)
         return x
-
-if __name__ == "__main__":
-    model = Lstm(20, 30, 1, 0.5, False)
diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
index 9dffc59a..782937fe 100644
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -8,13 +8,13 @@ import torch.optim as optim
 import time
 import dataloader
 
+WORD_NUM = 357361
 WORD_SIZE = 100
 HIDDEN_SIZE = 300
 D_A = 350
-R = 20
+R = 10
 MLP_HIDDEN = 2000
 CLASSES_NUM = 5
-WORD_NUM = 357361
 
 class Net(nn.Module):
     """
@@ -32,7 +32,7 @@ class Net(nn.Module):
         x = self.encoder(x)
         x, penalty = self.aggregation(x)
         x = self.predict(x)
-        return r, x
+        return x, penalty
 
 def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
           momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
@@ -50,7 +50,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     the result will be saved as "model_dict_" + current time, which can be used for further training
     """
 
-    if using_cuda == True:
+    if using_cuda:
         net = Net().cuda()
     else:
         net = Net()
@@ -60,7 +60,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
 
     optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
     criterion = nn.CrossEntropyLoss()
-    dataset = dataloader.DataLoader("trainset.pkl", using_cuda=using_cuda)
+    dataset = dataloader.DataLoader("test_set.pkl", batch_size, using_cuda=using_cuda)
 
     #statistics
     loss_count = 0
@@ -69,6 +69,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
     count = 0
 
     for epoch in range(epochs):
+        print("epoch: %d"%(epoch))
         for i, batch in enumerate(dataset):
             t1 = time.time()
             X = batch["feature"]
@@ -86,23 +87,43 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
             loss_count += torch.sum(y_penl).data[0]
             prepare_time += (t2 - t1)
             run_time += (t3 - t2)
-            p, idx = torch.max(y_pred, dim=1)
-            idx = idx.data
-            count += torch.sum(torch.eq(idx.cpu(), y))
+            p, idx = torch.max(y_pred.data, dim=1)
+            count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
 
-            if i % interval == 0:
-                print(i)
-                print("loss count:" + str(loss_count / batch_size))
-                print("accuracy:" + str(count / batch_size))
+            if (i + 1) % interval == 0:
+                print("epoch : %d, iters: %d"%(epoch, i + 1))
+                print("loss count:" + str(loss_count / (interval * batch_size)))
+                print("accuracy:" + str(count / (interval * batch_size)))
                 print("penalty:" + str(torch.sum(y_penl).data[0] / batch_size))
-                print("prepare time:" + str(prepare_time / batch_size))
-                print("run time:" + str(run_time / batch_size))
+                print("prepare time:" + str(prepare_time))
+                print("run time:" + str(run_time))
                 prepare_time = 0
                 run_time = 0
                 loss_count = 0
                 count = 0
-    torch.save(net.state_dict(), "model_dict_%s.pkl"%(str(time.time())))
+    string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
+    torch.save(net.state_dict(), "model_dict_%s.dict"%(string))
+
+def test(model_dict, using_cuda=True):
+    if using_cuda:
+        net = Net().cuda()
+    else:
+        net = Net()
+    net.load_state_dict(torch.load(model_dict))
+    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
+    count = 0
+    for i, batch in enumerate(dataset):
+        X = batch["feature"]
+        y = batch["class"]
+        y_pred, _ = net(X)
+        p, idx = torch.max(y_pred.data, dim=1)
+        count += torch.sum(torch.eq(idx.cpu(), y.data.cpu()))
+    print("accuracy: %f"%(count / dataset.num))
+
 
 if __name__ == "__main__":
     train(using_cuda=torch.cuda.is_available())
+
+
+
diff --git a/fastNLP/modules/prototype/predict.py b/fastNLP/modules/prototype/predict.py
index c8e72629..d5346c0e 100644
--- a/fastNLP/modules/prototype/predict.py
+++ b/fastNLP/modules/prototype/predict.py
@@ -1,5 +1,6 @@
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 
 class MLP(nn.Module):
     """
@@ -15,7 +16,6 @@ class MLP(nn.Module):
         super(MLP,self).__init__()
         self.L1 = nn.Linear(input_size, hidden_size)
         self.L2 = nn.Linear(hidden_size, output_size)
-        self.softmax = nn.Softmax(dim=1)
 
     def forward(self, x):
         out = self.L2(F.relu(self.L1(x)))
diff --git a/fastNLP/modules/prototype/prepare.py b/fastNLP/modules/prototype/prepare.py
new file mode 100644
index 00000000..02fd19c5
--- /dev/null
+++ b/fastNLP/modules/prototype/prepare.py
@@ -0,0 +1,50 @@
+import pickle
+import Word2Idx
+
+def get_sets(m, n):
+    """
+    get a train set containing m samples and a test set containing n samples
+    """
+    samples = pickle.load(open("tuples.pkl","rb"))
+    if m+n > len(samples):
+        print("asking for too many tuples\n")
+        return
+    train_samples = samples[ : m]
+    test_samples = samples[m: m+n]
+    return train_samples, test_samples
+
+def build_wordidx():
+    """
+    build wordidx using word2idx
+    """
+    train, test = get_sets(500000, 2000)
+    words = []
+    for x in train:
+        words += x[0]
+    wordidx = Word2Idx.Word2Idx()
+    wordidx.build(words)
+    print(wordidx.num)
+    print(wordidx.i2w(0))
+    wordidx.save("wordidx.pkl")
+
+def build_sets():
+    """
+    build train set and test set, transform word to index
+    """
+    train, test = get_sets(500000, 2000)
+    wordidx = Word2Idx.Word2Idx()
+    wordidx.load("wordidx.pkl")
+    train_set = []
+    for x in train:
+        sent = [wordidx.w2i(w) for w in x[0]]
+        train_set.append({"sent" : sent, "class" : x[1]})
+    test_set = []
+    for x in test:
+        sent = [wordidx.w2i(w) for w in x[0]]
+        test_set.append({"sent" : sent, "class" : x[1]})
+    pickle.dump(train_set, open("train_set.pkl", "wb"))
+    pickle.dump(test_set, open("test_set.pkl", "wb"))
+
+if __name__ == "__main__":
+    build_wordidx()
+    build_sets()
From f585a9aa7df9b73e757dd51526a45bf3380b2ead Mon Sep 17 00:00:00 2001
From: HENRY L
Date: Mon, 2 Jul 2018 02:49:55 +0800
Subject: [PATCH 3/3] update

---
 fastNLP/modules/prototype/example.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py
index 782937fe..a19898c6 100644
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -60,7 +60,7 @@ def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
 
     optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
     criterion = nn.CrossEntropyLoss()
-    dataset = dataloader.DataLoader("test_set.pkl", batch_size, using_cuda=using_cuda)
+    dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)
 
     #statistics
     loss_count = 0
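
For reference, a minimal sketch of how the prototype modules introduced by these patches fit together end to end. It assumes the pickled "tuples.pkl" file of (token list, label) pairs that prepare.py expects is already present, and it uses the file names and defaults from the code above; the saved model file name in the last line is a placeholder, not an actual output name.

    # Build the vocabulary and the train/test pickles, then train and evaluate.
    import torch

    import prepare
    import example

    prepare.build_wordidx()    # writes wordidx.pkl
    prepare.build_sets()       # writes train_set.pkl and test_set.pkl

    use_cuda = torch.cuda.is_available()
    example.train(using_cuda=use_cuda)    # saves model_dict_<timestamp>.dict
    # To evaluate, pass the file name actually produced by train():
    # example.test("model_dict_<timestamp>.dict", using_cuda=use_cuda)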