Merge pull request #6 from henryL7/master

prototype and self-attention model
7 years ago · 96c65993bd
--- a/fastNLP/modules/prototype/README.md
+++ b/fastNLP/modules/prototype/README.md
@@ -0,0 +1,41 @@
 # Prototype

 ## Word2Idx.py
 A mapping model between words and indexes

 ## embedding.py
 embedding modules

 Contains a simple encapsulation for torch.nn.Embedding

 ## encoder.py
 encoder modules

 Contains a simple encapsulation for torch.nn.LSTM

 ## aggregation.py
 aggregation modules

 Contains a self-attention model, following the paper "A Structured Self-attentive Sentence Embedding", https://arxiv.org/abs/1703.03130

 ## predict.py
 predict modules

 Contains a two-layer perceptron for classification

 ## example.py
 An example showing how to use the above modules to build a model

 Contains a model for sentiment analysis on Yelp dataset, and its training and testing procedures. See https://arxiv.org/abs/1703.03130 for more details.

 ## prepare.py
 A case of using Word2Idx to build Yelp datasets

 ## dataloader.py
 A dataloader for Yelp dataset

 It is an iterable object, returning a zero-padded batch every iteration.




--- a/fastNLP/modules/prototype/Word2Idx.py
+++ b/fastNLP/modules/prototype/Word2Idx.py
@@ -0,0 +1,63 @@
 import collections
 import pickle

 class Word2Idx():
    """
    Build a word index according to word frequency.

    Index 0 is reserved for the padding token "<PAD>" and index 1 for the
    token "<UNK>" that stands for unregistered words. Real words start at
    index 2 and are ordered from the most to the least frequent one.

    If "min_freq" is given, only words with a frequency not lesser than
    min_freq are kept.
    If "max_num" is given, at most the (max_num - 1) most frequent words
    are kept.
    "words" should be an iterable [ w_1,w_2,...,w_i,...,w_n ] where each w_i
    is a string representing a word.
    num is the size of the lookup table (the two reserved tokens included).
    w2i is a lookup table assigning each word an index.
    i2w is a vector which serves as the inverse mapping of w2i,
    e.g. i2w(w2i("word")) == "word" for every registered word.
    """
    def __init__(self):
        self.__w2i = dict()
        self.__i2w = []
        self.num = 0

    def build(self, words, min_freq=0, max_num=None):
        """
        build a model from words

        Args:
        words : iterable of word strings (it is consumed only once, so a
        generator is fine)
        min_freq : minimum frequency a word needs to be registered
        max_num : if not None, only the (max_num - 1) most frequent words
        are candidates for registration
        """
        counter = collections.Counter(words)
        # The reserved tokens own fixed indexes; if they also occur in the
        # corpus they must not be registered a second time.
        for token in ("<PAD>", "<UNK>"):
            counter.pop(token, None)
        if max_num is not None:
            most_common = counter.most_common(max(max_num - 1, 0))
        else:
            most_common = counter.most_common()
        kept = [w for w, freq in most_common if freq >= min_freq]
        self.__i2w = ["<PAD>", "<UNK>"] + kept
        # Derive w2i from i2w so that both tables always agree.
        self.__w2i = {w: i for i, w in enumerate(self.__i2w)}
        self.num = len(self.__i2w)

    def w2i(self, word):
        """word to index; unregistered words map to index 1 ("<UNK>")"""
        return self.__w2i.get(word, 1)

    def i2w(self, idx):
        """
        index to word

        Raises IndexError (a subclass of Exception) when idx is negative or
        not smaller than num.
        """
        if idx < 0 or idx >= self.num:
            raise IndexError("out of range\n")
        return self.__i2w[idx]

    def save(self, addr):
        """save the model to a file with address "addr" """
        with open(addr, "wb") as f:
            pickle.dump([self.__i2w, self.__w2i, self.num], f)

    def load(self, addr):
        """
        load a model from a file with address "addr"

        NOTE: pickle can execute arbitrary code while loading; only load
        files that come from a trusted source.
        """
        with open(addr, "rb") as f:
            paras = pickle.load(f)
        self.__i2w, self.__w2i, self.num = paras[0], paras[1], paras[2]

    

--- a/fastNLP/modules/prototype/aggregation.py
+++ b/fastNLP/modules/prototype/aggregation.py
@@ -0,0 +1,40 @@
 import torch
 import torch.nn as nn
 from torch.autograd import Variable

 class Selfattention(nn.Module):
    """
    Structured self-attention that turns a sequence into r weighted sums.

    Args:
    input_size : the size for the input vector
    d_a : the width of weight matrix
    r : the number of encoded vectors
    """
    def __init__(self, input_size, d_a, r):
        super(Selfattention, self).__init__()
        # Both projection matrices are drawn from a standard normal.
        self.W_s1 = nn.Parameter(torch.randn(d_a, input_size), requires_grad=True)
        self.W_s2 = nn.Parameter(torch.randn(r, d_a), requires_grad=True)
        self.softmax = nn.Softmax(dim=2)
        self.tanh = nn.Tanh()

    def penalization(self, A):
        """
        Return, per batch element, the sum of squared entries of A * A^T - I.

        A is treated as a 3-D tensor; the identity has A.size(1) rows.
        """
        identity = torch.eye(A.size(1))
        # Keep the identity on the same kind of device as the parameters.
        if self.W_s1.is_cuda:
            identity = identity.cuda()
        identity = Variable(identity, requires_grad=False)
        gram = torch.matmul(A, A.transpose(1, 2))
        residual = gram - identity
        flat = residual.view(residual.size(0), -1)
        return torch.sum(flat ** 2, dim=1)

    def forward(self, x):
        """
        Return (out, penalty) for a 3-D input x whose last dimension is
        input_size; out is flattened to one row per batch element.
        """
        hidden = self.tanh(torch.matmul(self.W_s1, x.transpose(1, 2)))
        scores = torch.matmul(self.W_s2, hidden)
        A = self.softmax(scores)
        weighted = torch.matmul(A, x)
        out = weighted.view(weighted.size(0), -1)
        return out, self.penalization(A)

--- a/fastNLP/modules/prototype/dataloader.py
+++ b/fastNLP/modules/prototype/dataloader.py
@@ -0,0 +1,81 @@
 import random
 import pickle
 import torch
 import numpy as np
 from torch.autograd import Variable

 def float_wrapper(x, requires_grad=True, using_cuda=True):
    """
    transform float type list to pytorch variable

    Args:
    x : data accepted by torch.FloatTensor (e.g. a list of floats)
    requires_grad : forwarded to Variable
    using_cuda : if true, the tensor is moved to the GPU before wrapping
    """
    tensor = torch.FloatTensor(x)
    # Plain truthiness test, the same way pad() interprets using_cuda.
    if using_cuda:
        tensor = tensor.cuda()
    return Variable(tensor, requires_grad=requires_grad)

 def long_wrapper(x, requires_grad=True, using_cuda=True):
    """
    transform long type list to pytorch variable

    Args:
    x : data accepted by torch.LongTensor (e.g. a list of ints)
    requires_grad : forwarded to Variable
    using_cuda : if true, the tensor is moved to the GPU before wrapping

    NOTE(review): recent PyTorch releases appear to reject
    requires_grad=True on integer tensors, so callers should pass
    requires_grad=False (as DataLoader does) -- confirm against the
    installed version.
    """
    tensor = torch.LongTensor(x)
    # Plain truthiness test, the same way pad() interprets using_cuda.
    if using_cuda:
        tensor = tensor.cuda()
    return Variable(tensor, requires_grad=requires_grad)
    
 def pad(X, using_cuda):
    """
    Zero-pad 1-D long sequences to a common length and stack them.

    Args:
    X : list of 1-D sequences (their length is read with size(0))
    using_cuda : if true, the padding zeros are created on the GPU

    Returns the stacked batch, one row per sequence.
    """
    longest = max([seq.size(0) for seq in X])
    padded = []
    for seq in X:
        missing = longest - seq.size(0)
        if missing <= 0:
            # Already as long as the longest sequence.
            padded.append(seq)
            continue
        zeros = Variable(torch.zeros(missing).long())
        if using_cuda:
            zeros = zeros.cuda()
        padded.append(torch.cat((seq, zeros), 0))
    return torch.stack(padded)

 class DataLoader(object):
    """
    Iterate over a pickled dataset in zero-padded batches.

    The pickle file must hold a list of samples, each with a "sent" entry
    (word indexes) and a "class" entry (label). Every iteration returns a
    dict {"feature", "class"}. Samples that do not fill a whole batch at the
    end of the data are never returned.

    Args:
    fdir : data file address
    batch_size : batch_size
    shuffle : if True, shuffle dataset every epoch
    using_cuda : if True, return tensors on GPU
    """
    def __init__(self, fdir, batch_size, shuffle=True, using_cuda=True):
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.using_cuda = using_cuda
        self.count = 0
        # NOTE: pickle can execute arbitrary code; only load trusted files.
        with open(fdir, "rb") as f:
            self.data = pickle.load(f)
        self.num = len(self.data)
        self.iters = int(self.num / batch_size)

    def __iter__(self):
        return self

    def __next__(self):
        if self.count != self.iters:
            start = self.count * self.batch_size
            batch = self.data[start : start + self.batch_size]
            self.count += 1
            sents = [
                long_wrapper(sample["sent"], using_cuda=self.using_cuda, requires_grad=False)
                for sample in batch
            ]
            X = pad(sents, self.using_cuda)
            labels = [sample["class"] for sample in batch]
            y = long_wrapper(labels, using_cuda=self.using_cuda, requires_grad=False)
            return {"feature" : X, "class" : y}
        # Epoch finished: rewind (and reshuffle) so the loader can be reused.
        self.count = 0
        if self.shuffle:
            random.shuffle(self.data)
        raise StopIteration()
            

--- a/fastNLP/modules/prototype/embedding.py
+++ b/fastNLP/modules/prototype/embedding.py
@@ -0,0 +1,23 @@
 import torch
 import torch.nn as nn

 class Lookuptable(nn.Module):
    """
    Thin wrapper around torch.nn.Embedding.

    Args:
    nums : the size of the lookup table
    dims : the size of each vector
    padding_idx : pads the tensor with zeros whenever it encounters this index
    sparse : If True, gradient matrix will be a sparse tensor. In this case,
    only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used
    """
    def __init__(self, nums, dims, padding_idx=0, sparse=False):
        super(Lookuptable, self).__init__()
        self.embed = nn.Embedding(
            num_embeddings=nums,
            embedding_dim=dims,
            padding_idx=padding_idx,
            sparse=sparse,
        )

    def forward(self, x):
        """Look up the vector of every index in x."""
        vectors = self.embed(x)
        return vectors

 if __name__ == "__main__":
    # Smoke test: the module can be constructed.
    model = Lookuptable(nums=10, dims=20)
--- a/fastNLP/modules/prototype/encoder.py
+++ b/fastNLP/modules/prototype/encoder.py
@@ -0,0 +1,22 @@
 import torch
 import torch.nn as nn

 class Lstm(nn.Module):
    """
    Thin wrapper around a batch-first torch.nn.LSTM that returns only the
    output sequence.

    Args:
    input_size : input size
    hidden_size : hidden size
    num_layers : number of hidden layers
    dropout : dropout rate
    bidirectional : If True, becomes a bidirectional RNN
    """
    def __init__(self, input_size, hidden_size, num_layers, dropout, bidirectional):
        super(Lstm, self).__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bias=True,
            batch_first=True,
            dropout=dropout,
            bidirectional=bidirectional,
        )

    def forward(self, x):
        """Run the LSTM and drop the final hidden/cell states."""
        return self.lstm(x)[0]
--- a/fastNLP/modules/prototype/example.py
+++ b/fastNLP/modules/prototype/example.py
@@ -0,0 +1,129 @@
 import torch
 import torch.nn as nn
 import encoder
 import aggregation
 import embedding
 import predict
 import torch.optim as optim
 import time
 import dataloader

 # Hyperparameters of the model assembled in Net below.
 # Size of the embedding lookup table.
 WORD_NUM = 357361
 # Size of each embedding vector, also the LSTM input size.
 WORD_SIZE = 100
 # LSTM hidden size (the LSTM is bidirectional, so Net uses 2 * HIDDEN_SIZE).
 HIDDEN_SIZE = 300
 # Width of the self-attention weight matrix (d_a).
 D_A = 350
 # Number of vectors the self-attention module encodes (r).
 R = 10
 # Hidden layer size of the classification MLP.
 MLP_HIDDEN = 2000 
 # Output size of the classification MLP.
 CLASSES_NUM = 5

 class Net(nn.Module):
    """
    Sentiment model: embedding -> bidirectional LSTM -> self-attention -> MLP.
    """
    def __init__(self):
        super(Net, self).__init__()
        # A bidirectional LSTM emits 2 * HIDDEN_SIZE features per step.
        encoded_size = 2 * HIDDEN_SIZE
        self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE)
        self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True)
        self.aggregation = aggregation.Selfattention(encoded_size, D_A, R)
        self.predict = predict.MLP(R * encoded_size, MLP_HIDDEN, CLASSES_NUM)

    def forward(self, x):
        """Return (class scores, attention penalty) for a batch of word indexes."""
        embedded = self.embedding(x)
        encoded = self.encoder(embedded)
        aggregated, penalty = self.aggregation(encoded)
        scores = self.predict(aggregated)
        return scores, penalty

 def train(model_dict=None, using_cuda=True, learning_rate=0.06,\
    momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10):
    """
    Train the sentiment model on "train_set.pkl".

    Args:
    model_dict : optional address of a saved state dict. If given, training
    continues from it; otherwise a new model is trained from scratch.
    using_cuda : if true, the training is conducted on GPU.
    learning_rate, momentum : settings of the SGD optimizer.
    batch_size : number of samples per batch.
    epochs : number of passes over the training set.
    coef : the coefficient of the penalization term added to the
    cross-entropy loss.
    interval : the number of iterations between two reports.

    After every epoch the weights are saved with a name of the form
    "model_dict_<current time>.dict", which could be used for further
    training.
    """

    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()

    if model_dict is not None:
        # Deserialize onto the CPU so that a checkpoint written on a GPU
        # machine can also be resumed without CUDA; load_state_dict then
        # copies the values into net's own parameters.
        state = torch.load(model_dict, map_location=lambda storage, loc: storage)
        net.load_state_dict(state)

    optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=momentum)
    criterion = nn.CrossEntropyLoss()
    dataset = dataloader.DataLoader("train_set.pkl", batch_size, using_cuda=using_cuda)

    #statistics
    loss_count = 0
    prepare_time = 0
    run_time = 0
    count = 0

    for epoch in range(epochs):
        print("epoch: %d"%(epoch))
        for i, batch in enumerate(dataset):
            # NOTE: the batch has already been built by the loader at this
            # point, so "prepare time" only covers the two lookups below.
            t1 = time.time()
            X = batch["feature"]
            y = batch["class"]

            t2 = time.time()
            y_pred, y_penl = net(X)
            penalty_sum = torch.sum(y_penl)
            loss = criterion(y_pred, y) + penalty_sum / batch_size * coef
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm(net.parameters(), 0.5)
            optimizer.step()
            t3 = time.time()

            # float()/int() turn the sums into plain Python numbers whether
            # torch.sum returns a number or a zero-dimensional tensor;
            # indexing the sum with [0] fails on zero-dimensional tensors.
            penalty_value = float(torch.sum(y_penl.data))
            loss_count += penalty_value
            prepare_time += (t2 - t1)
            run_time += (t3 - t2)
            p, idx = torch.max(y_pred.data, dim=1)
            count += int(torch.sum(torch.eq(idx.cpu(), y.data.cpu())))

            if (i + 1) % interval == 0:
                print("epoch : %d, iters: %d"%(epoch, i + 1))
                print("loss count:" + str(loss_count / (interval * batch_size)))
                print("acuracy:" + str(count / (interval * batch_size)))
                print("penalty:" + str(penalty_value / batch_size))
                print("prepare time:" + str(prepare_time))
                print("run time:" + str(run_time))
                prepare_time = 0
                run_time = 0
                loss_count = 0
                count = 0
        string = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
        torch.save(net.state_dict(), "model_dict_%s.dict"%(string))

 def test(model_dict, using_cuda=True):
    """
    Evaluate a saved model on "test_set.pkl" and print its accuracy.

    Args:
    model_dict : address of the saved state dict to evaluate.
    using_cuda : if true, the evaluation is conducted on GPU.
    """
    if using_cuda:
        net = Net().cuda()
    else:
        net = Net()
    # Deserialize onto the CPU so that a checkpoint written on a GPU machine
    # can also be evaluated without CUDA; load_state_dict then copies the
    # values into net's own parameters.
    net.load_state_dict(torch.load(model_dict, map_location=lambda storage, loc: storage))
    # The encoder is configured with dropout; evaluation mode switches
    # training-only behaviour off.
    net.eval()
    dataset = dataloader.DataLoader("test_set.pkl", batch_size=1, using_cuda=using_cuda)
    count = 0
    for i, batch in enumerate(dataset):
        X = batch["feature"]
        y = batch["class"]
        y_pred, _ = net(X)
        p, idx = torch.max(y_pred.data, dim=1)
        # int() yields a plain Python number whether torch.sum returns a
        # number or a zero-dimensional tensor.
        count += int(torch.sum(torch.eq(idx.cpu(), y.data.cpu())))
    print("accuracy: %f"%(float(count) / dataset.num))
        

 if __name__ == "__main__":
    # Train on the GPU whenever one is available.
    cuda_available = torch.cuda.is_available()
    train(using_cuda=cuda_available)
    
    
    

--- a/fastNLP/modules/prototype/predict.py
+++ b/fastNLP/modules/prototype/predict.py
@@ -0,0 +1,25 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class MLP(nn.Module):
    """
    Two linear layers with a ReLU in between, used for classification.

    Output : Unnormalized possibility distribution
    Args:
    input_size : the size of input
    hidden_size : the size of hidden layer
    output_size : the size of output
    """
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.L1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.L2 = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the unnormalized scores for x."""
        hidden = self.L1(x)
        activated = F.relu(hidden)
        return self.L2(activated)

 if __name__ == "__main__":
    # Smoke test: the module can be constructed.
    MLP(input_size=20, hidden_size=30, output_size=20)
--- a/fastNLP/modules/prototype/prepare.py
+++ b/fastNLP/modules/prototype/prepare.py
@@ -0,0 +1,50 @@
 import pickle
 import Word2Idx

 def get_sets(m, n):
    """
    get a train set containing m samples and a test set containing n samples

    The samples are read from "tuples.pkl" in the current directory; the
    first m become the train set and the following n the test set.

    Returns (train_samples, test_samples), or None (after printing a
    message) when the file holds fewer than m + n samples.
    """
    # NOTE: pickle can execute arbitrary code while loading; only use a
    # "tuples.pkl" that comes from a trusted source.
    with open("tuples.pkl", "rb") as f:
        samples = pickle.load(f)
    if m + n > len(samples):
        print("asking for too many tuples\n")
        return
    train_samples = samples[:m]
    test_samples = samples[m:m + n]
    return train_samples, test_samples

 def build_wordidx():
    """
    Build a word index from the train samples and save it to "wordidx.pkl".
    """
    train, test = get_sets(500000, 2000)
    # The first item of every sample is its list of words.
    words = [w for sample in train for w in sample[0]]
    wordidx = Word2Idx.Word2Idx()
    wordidx.build(words)
    print(wordidx.num)
    print(wordidx.i2w(0))
    wordidx.save("wordidx.pkl")

 def build_sets():
    """
    build train set and test set, transform word to index

    Reads the word index from "wordidx.pkl" and writes the converted
    samples to "train_set.pkl" and "test_set.pkl". Every converted sample
    is a dict with a "sent" entry (word indexes) and a "class" entry.
    """
    train, test = get_sets(500000, 2000)
    wordidx = Word2Idx.Word2Idx()
    wordidx.load("wordidx.pkl")

    def encode(samples):
        # x[0] holds the words of a sample, x[1] its class.
        return [
            {"sent" : [wordidx.w2i(w) for w in x[0]], "class" : x[1]}
            for x in samples
        ]

    train_set = encode(train)
    test_set = encode(test)
    # Context managers make sure both files are flushed and closed.
    with open("train_set.pkl", "wb") as f:
        pickle.dump(train_set, f)
    with open("test_set.pkl", "wb") as f:
        pickle.dump(test_set, f)

 if __name__ == "__main__":
    # Order matters: build_wordidx() writes "wordidx.pkl", which
    # build_sets() loads afterwards.
    build_wordidx()
    build_sets()