
Merge pull request #2 from FengZiYjun/master

character-aware neural language model
tags/v0.1.0
Coet (GitHub) · 6 years ago
parent
commit
cfc47392e8
9 changed files with 49885 additions and 0 deletions
  1. Char-aware_NLM/LICENSE (+21 -0)
  2. Char-aware_NLM/README.md (+40 -0)
  3. Char-aware_NLM/model.py (+148 -0)
  4. Char-aware_NLM/test.py (+123 -0)
  5. Char-aware_NLM/test.txt (+3761 -0)
  6. Char-aware_NLM/train.py (+268 -0)
  7. Char-aware_NLM/train.txt (+42068 -0)
  8. Char-aware_NLM/utilities.py (+86 -0)
  9. Char-aware_NLM/valid.txt (+3370 -0)

Char-aware_NLM/LICENSE (+21 -0)

@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2017

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Char-aware_NLM/README.md (+40 -0)

@@ -0,0 +1,40 @@
# PyTorch-Character-Aware-Neural-Language-Model
This is a PyTorch implementation of the character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim et al.
## Requirements
The code was run and tested with **Python 3.5.2** and **PyTorch 0.3.1**.
## Hyperparameters
| Hyperparameter | Value |
| ------ | :-------|
| LSTM batch size | 20 |
| LSTM sequence length | 35 |
| LSTM hidden units | 300 |
| epochs | 35 |
| initial learning rate | 1.0 |
| character embedding dimension | 15 |
## Demo
Train the model with split train/valid/test data.
`python train.py`
The trained model will be saved in `cache/net.pkl`.
Test the model.
`python test.py`
Best result on test set:
PPL=127.2163
cross-entropy loss=4.8459
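
A minimal sketch of reloading the cached artifacts that `train.py` writes, assuming `cache/prep.pt` and `cache/net.pkl` already exist (see `test.py` for the full evaluation loop):

```python
import torch

# Assumes train.py has already written the cache/ files.
prep = torch.load("cache/prep.pt")   # word/char dictionaries and max_word_len
net = torch.load("cache/net.pkl")    # best charLM model saved during training

print("vocabulary size:", len(prep["word_dict"]))
net.eval()                           # disable dropout before measuring perplexity
```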
## Acknowledgement
This implementation borrowed ideas from
https://github.com/jarfo/kchar
https://github.com/cronos123/Character-Aware-Neural-Language-Models

Char-aware_NLM/model.py (+148 -0)

@@ -0,0 +1,148 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    """Highway network"""
    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """
    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes
        self.convolutions = []

        # list of tuples: (number of filters, filter width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
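
For a quick shape check, a minimal sketch of one forward pass through `charLM` with random character indices. The toy vocabulary sizes, `max_word_len`, and `use_gpu=False` are illustrative assumptions; only `char_emb_dim=15` and `word_emb_dim=300` follow the README:

```python
# Illustrative forward pass; the sizes below are made-up toy values.
import torch
from torch.autograd import Variable
from model import charLM

num_char, vocab_size = 60, 1000          # toy character/word vocabulary sizes
char_emb_dim, word_emb_dim = 15, 300     # matches the README hyperparameters
batch, seq_len, max_word_len = 20, 35, 19

net = charLM(char_emb_dim, word_emb_dim, vocab_size, num_char, use_gpu=False)

# character indices: [num_seq, seq_len, max_word_len + 2] (the +2 is BOW/EOW)
x = Variable(torch.LongTensor(batch, seq_len, max_word_len + 2).random_(0, num_char))
hidden = (Variable(torch.zeros(2, batch, word_emb_dim)),
          Variable(torch.zeros(2, batch, word_emb_dim)))

out, hidden = net(x, hidden)
print(out.size())   # expected: [batch * seq_len, vocab_size]
```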

Char-aware_NLM/test.py (+123 -0)

@@ -0,0 +1,123 @@
import os
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from model import charLM
from utilities import *
from collections import namedtuple


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def test(net, data, opt):
    net.eval()

    test_input = torch.from_numpy(data.test_input)
    test_label = torch.from_numpy(data.test_label)

    num_seq = test_input.size()[0] // opt.lstm_seq_len
    test_input = test_input[:num_seq * opt.lstm_seq_len, :]
    # [num_seq, seq_len, max_word_len+2]
    test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    criterion = nn.CrossEntropyLoss()

    loss_list = []
    num_hits = 0
    total = 0
    iterations = test_input.size()[0] // opt.lstm_batch_size
    test_generator = batch_generator(test_input, opt.lstm_batch_size)
    label_generator = batch_generator(test_label, opt.lstm_batch_size * opt.lstm_seq_len)

    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    add_loss = 0.0
    for t in range(iterations):
        batch_input = test_generator.__next__()
        batch_label = label_generator.__next__()

        net.zero_grad()
        hidden = [state.detach() for state in hidden]
        test_output, hidden = net(to_var(batch_input), hidden)

        test_loss = criterion(test_output, to_var(batch_label)).data
        loss_list.append(test_loss)
        add_loss += test_loss

    print("Test Loss={0:.4f}".format(float(add_loss) / iterations))
    print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations))))


#############################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        print("Cannot find prep.pt")

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:
        test_text = read_data("./test.txt")
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are the next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        test_set = data_sets["test"]
        test_label = data_sets["tlabel"]
        train_set = data_sets["tdata"]
        train_label = data_sets["trlabel"]

    DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label")
    data = DataTuple(test_input=test_set,
                     test_label=test_label, train_label=train_label, train_input=train_set)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20

    net = torch.load("cache/net.pkl")

    Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len",
                                     "max_word_len", "lstm_batch_size", "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start testing.")
    test(net, data, opt)
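
The perplexity printed by `test()` is the exponential of the average cross-entropy loss, which is exactly how the two numbers reported in the README relate:

```python
# Perplexity = exp(mean cross-entropy), as computed at the end of test().
import numpy as np

cross_entropy = 4.8459        # README: best cross-entropy loss on the test set
print(np.exp(cross_entropy))  # ~127.22, matching the reported test PPL of 127.2163
```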

Char-aware_NLM/test.txt (+3761 -0): file diff suppressed because it is too large


Char-aware_NLM/train.py (+268 -0)

@@ -0,0 +1,268 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from model import charLM
from utilities import *
from collections import namedtuple
from test import test


def preprocess():
    word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
    num_words = len(word_dict)
    num_char = len(char_dict)
    char_dict["BOW"] = num_char + 1
    char_dict["EOW"] = num_char + 2
    char_dict["PAD"] = 0

    # dict of (int, string)
    reverse_word_dict = {value: key for key, value in word_dict.items()}
    max_word_len = max([len(word) for word in word_dict])

    objects = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "reverse_word_dict": reverse_word_dict,
        "max_word_len": max_word_len
    }
    torch.save(objects, "cache/prep.pt")
    print("Preprocess done.")


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def train(net, data, opt):
    torch.manual_seed(1024)

    train_input = torch.from_numpy(data.train_input)
    train_label = torch.from_numpy(data.train_label)
    valid_input = torch.from_numpy(data.valid_input)
    valid_label = torch.from_numpy(data.valid_label)

    # [num_seq, seq_len, max_word_len+2]
    num_seq = train_input.size()[0] // opt.lstm_seq_len
    train_input = train_input[:num_seq * opt.lstm_seq_len, :]
    train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    num_seq = valid_input.size()[0] // opt.lstm_seq_len
    valid_input = valid_input[:num_seq * opt.lstm_seq_len, :]
    valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2)

    num_epoch = opt.epochs
    num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size

    learning_rate = opt.init_lr
    old_PPL = 100000
    best_PPL = 100000

    # CrossEntropyLoss combines LogSoftmax and NLLLoss
    criterion = nn.CrossEntropyLoss()

    # word_emb_dim == hidden_size / num of hidden units
    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    for epoch in range(num_epoch):

        ################ Validation ####################
        net.eval()
        loss_batch = []
        PPL_batch = []
        iterations = valid_input.size()[0] // opt.lstm_batch_size

        valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
        vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size * opt.lstm_seq_len)

        for t in range(iterations):
            batch_input = valid_generator.__next__()
            batch_label = vlabel_generator.__next__()

            hidden = [state.detach() for state in hidden]
            valid_output, hidden = net(to_var(batch_input), hidden)

            length = valid_output.size()[0]

            # [num_sample-1, len(word_dict)] vs [num_sample-1]
            valid_loss = criterion(valid_output, to_var(batch_label))

            PPL = torch.exp(valid_loss.data)

            loss_batch.append(float(valid_loss))
            PPL_batch.append(float(PPL))

        PPL = np.mean(PPL_batch)
        print("[epoch {}] valid PPL={}".format(epoch, PPL))
        print("valid loss={}".format(np.mean(loss_batch)))
        print("PPL decrease={}".format(float(old_PPL - PPL)))

        # Preserve the best model
        if best_PPL > PPL:
            best_PPL = PPL
            torch.save(net.state_dict(), "cache/model.pt")
            torch.save(net, "cache/net.pkl")

        # Adjust the learning rate
        if float(old_PPL - PPL) <= 1.0:
            learning_rate /= 2
            print("halved lr:{}".format(learning_rate))

        old_PPL = PPL

        ##################################################
        #################### Training ####################
        net.train()
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              momentum=0.85)

        # split the first dim
        input_generator = batch_generator(train_input, opt.lstm_batch_size)
        label_generator = batch_generator(train_label, opt.lstm_batch_size * opt.lstm_seq_len)

        for t in range(num_iter_per_epoch):
            batch_input = input_generator.__next__()
            batch_label = label_generator.__next__()

            # detach hidden state of LSTM from last batch
            hidden = [state.detach() for state in hidden]

            output, hidden = net(to_var(batch_input), hidden)
            # [num_word, vocab_size]

            loss = criterion(output, to_var(batch_label))

            net.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2)
            optimizer.step()

            if (t + 1) % 100 == 0:
                print("[epoch {} step {}] train loss={}, Perplexity={}".format(
                    epoch + 1, t + 1, float(loss.data), float(np.exp(loss.data))))

    torch.save(net.state_dict(), "cache/model.pt")
    print("Training finished.")


################################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        preprocess()

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:

        train_text = read_data("./train.txt")
        valid_text = read_data("./valid.txt")
        test_text = read_data("./test.txt")

        train_set = np.array(text2vec(train_text, char_dict, max_word_len))
        valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are the next-word indices in word_dict, with the same length as the inputs
        train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
        valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"tdata": train_set, "vdata": valid_set, "test": test_set,
                    "trlabel": train_label, "vlabel": valid_label, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        train_set = data_sets["tdata"]
        valid_set = data_sets["vdata"]
        test_set = data_sets["test"]
        train_label = data_sets["trlabel"]
        valid_label = data_sets["vlabel"]
        test_label = data_sets["tlabel"]

    DataTuple = namedtuple("DataTuple",
                           "train_input train_label valid_input valid_label test_input test_label")
    data = DataTuple(train_input=train_set,
                     train_label=train_label,
                     valid_input=valid_set,
                     valid_label=valid_label,
                     test_input=test_set,
                     test_label=test_label)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20
    # cnn_batch_size == lstm_seq_len * lstm_batch_size

    net = charLM(char_embedding_dim,
                 word_embed_dim,
                 num_words,
                 len(char_dict),
                 use_gpu=USE_GPU)

    for param in net.parameters():
        nn.init.uniform(param.data, -0.05, 0.05)

    Options = namedtuple("Options", [
        "cnn_batch_size", "init_lr", "lstm_seq_len",
        "max_word_len", "lstm_batch_size", "epochs",
        "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size,
                  init_lr=1.0,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  epochs=35,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start training.")

    # You can stop training at any time with Ctrl+C
    try:
        train(net, data, opt)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(net, "cache/net.pkl")
    print("save net")

    test(net, data, opt)
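
The label arrays built above are simply the token stream shifted left by one position, with the final label repeated so the lengths match. A toy example of that convention (the sentence and indices are made up):

```python
# Toy illustration of the next-word label convention used in train.py / test.py.
import numpy as np

text = ["the", "cat", "sat", "."]                    # made-up token stream
word_dict = {"the": 0, "cat": 1, "sat": 2, ".": 3}   # made-up vocabulary

labels = np.array([word_dict[w] for w in text[1:]] + [word_dict[text[-1]]])
print(labels)   # [1 2 3 3]: each position predicts the next word,
                # and the last position just repeats the final token.
```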

Char-aware_NLM/train.txt (+42068 -0): file diff suppressed because it is too large


Char-aware_NLM/utilities.py (+86 -0)

@@ -0,0 +1,86 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
    """ Return list of list of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """ convert the input strings into character embeddings """
    # input_words == list of string
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []

    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert string to word embedding
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns
        word_encoding = F.pad(word_encoding, (0, max_word_len - word_encoding.size()[1] + 2)).data
        # create dimension
        word_encoding = word_encoding.unsqueeze(0)
        tensor_list.append(word_encoding)

    return torch.cat(tensor_list, 0)


def read_data(file_name):
    # Return: list of strings
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    import re
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    # vocabulary == dict of (word, int)
    # Return: dict of (char, int), starting from 1
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
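
Putting these helpers together, a minimal sketch of the preprocessing path from a raw text file to character-index batches (the file name `toy.txt` is a hypothetical stand-in for the repository's train/valid/test files):

```python
# Sketch of the preprocessing path: raw text file -> char-index tensor -> batches.
import torch
from utilities import read_data, create_word_char_dict, text2vec, batch_generator

# "toy.txt" is a hypothetical file; train.py uses train.txt / valid.txt / test.txt.
words = read_data("toy.txt")                              # lowercased whitespace tokens
word_dict, char_dict = create_word_char_dict("toy.txt")

# train.py adds the special character ids before vectorising:
num_char = len(char_dict)
char_dict["BOW"] = num_char + 1
char_dict["EOW"] = num_char + 2
char_dict["PAD"] = 0

max_word_len = max(len(w) for w in word_dict)
vec = torch.LongTensor(text2vec(words, char_dict, max_word_len))
# vec: [num_words, max_word_len + 2]

for batch in batch_generator(vec, batch_size=20):
    print(batch.size())                                   # [20, max_word_len + 2]
    break
```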

Char-aware_NLM/valid.txt (+3370 -0): file diff suppressed because it is too large

