
design intermediate controller between trainer and pytorch model

tags/v0.1.0
FengZiYjun · 6 years ago · commit 6b357bec40
7 changed files with 362 additions and 11 deletions:
  1. .idea/fastNLP.iml (+1, -1)
  2. .idea/misc.xml (+1, -1)
  3. action/trainer.py (+12, -4)
  4. model/__init__.py (+0, -0)
  5. model/base_model.py (+3, -2)
  6. model/char_language_model.py (+342, -0)
  7. reproduction/Char-aware_NLM/train.py (+3, -3)
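The commit slots a controller (a BaseModel subclass) between the generic Trainer and the raw PyTorch nn.Module: the trainer only ever calls prepare_input, mode, data_forward, loss, and grad_backward, while the controller hides tensors, autograd, and the optimizer behind those hooks. A minimal sketch of that contract; ToyController and every size in it are hypothetical illustrations, not code from this commit:

import torch
import torch.nn as nn
import torch.optim as optim


class ToyController:
    """Hypothetical stand-in for a BaseModel subclass: exposes the five
    hooks the Trainer drives (see action/trainer.py below)."""

    def __init__(self):
        self.model = nn.Linear(4, 2)            # the wrapped PyTorch model
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.1)
        self._loss = None                       # cached between loss() and grad_backward()

    def prepare_input(self, features, labels):  # raw data -> tensors
        return torch.tensor(features, dtype=torch.float32), torch.tensor(labels)

    def mode(self, test=False):                 # train/eval switch
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):                  # forward pass only
        return self.model(x)

    def loss(self, truth, predict):             # compute and cache the loss
        self._loss = self.criterion(predict, truth)
        return self._loss

    def grad_backward(self):                    # backward + parameter update
        self.optimizer.zero_grad()
        self._loss.backward()
        self.optimizer.step()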

.idea/fastNLP.iml (+1, -1)

@@ -2,7 +2,7 @@
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
-   <orderEntry type="inheritedJdk" />
+   <orderEntry type="jdk" jdkName="Remote Python 3.6.5 (ssh://zyfeng@10.141.208.102:22/home/zyfeng/anaconda2/envs/conda_env3/bin/python)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="TestRunnerService">


.idea/misc.xml (+1, -1)

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (PCA_emb)" project-jdk-type="Python SDK" />
+ <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.6.5 (ssh://zyfeng@10.141.208.102:22/home/zyfeng/anaconda2/envs/conda_env3/bin/python)" project-jdk-type="Python SDK" />
</project>

action/trainer.py (+12, -4)

@@ -19,9 +19,10 @@ class Trainer(Action):
        self.save_when_better = self.train_args.save_when_better

    def train(self, network, data, dev_data):
-       X, Y = network.prepare_input(data)
+       train_x, train_y = network.prepare_input(data.train_set, data.train_label)
+       valid_x, valid_y = network.prepare_input(dev_data.valid_set, dev_data.valid_label)

-       iterations, train_batch_generator = self.batchify(X, Y)
+       iterations, train_batch_generator = self.batchify(train_x, train_y)
        loss_history = list()
        network.mode(test=False)


@@ -33,15 +34,18 @@ class Trainer(Action):


        for step in range(iterations):
            batch_x, batch_y = train_batch_generator.__next__()

            prediction = network.data_forward(batch_x)

            loss = network.loss(batch_y, prediction)
            network.grad_backward()
            loss_history.append(loss)
            self.log(self.make_log(epoch, step, loss))

-       # evaluate over dev set
+       #################### evaluate over dev set ###################
        if self.validate:
-           evaluator.test(network, dev_data)
+           evaluator.test(network, [valid_x, valid_y])
            self.log(self.make_valid_log(epoch, evaluator.loss))
            if evaluator.loss < best_loss:
                best_loss = evaluator.loss
@@ -50,6 +54,10 @@ class Trainer(Action):


        # finish training

+   @staticmethod
+   def prepare_training(network, data):
+       return network.prepare_training(data)
+
    def make_log(self, *args):
        print("logged")




model/__init__.py (+0, -0)


model/base_model.py (+3, -2)

@@ -2,7 +2,7 @@ import numpy as np




class BaseModel(object):
-   """base model for all models"""
+   """PyTorch base model for all models"""

    def __init__(self):
        pass
@@ -17,7 +17,8 @@ class BaseModel(object):
    def mode(self, test=False):
        raise NotImplementedError

-   def data_forward(self, x):
+   def data_forward(self, *x):
+       # required by PyTorch nn
        raise NotImplementedError

    def grad_backward(self):


model/char_language_model.py (+342, -0)

@@ -0,0 +1,342 @@
import os
import re
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel


class CharLM(BaseModel):
    """
    Controller of the Character-level Neural Language Model
    """

    def __init__(self):
        super(CharLM, self).__init__()
        # settings
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = 700
        self.lstm_seq_len = 35
        self.lstm_batch_size = 20
        self.vocab_size = 100
        self.num_char = 150
        self.learning_rate = 0.1  # assumption: the original referenced an undefined `learning_rate`

        self.data = None  # named tuple to store all data sets
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None  # cached by loss(), consumed by grad_backward(); named so it cannot shadow the loss() method
        self.use_gpu = False
        # word_embed_dim == hidden_size / number of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        # built after self.model so that the wrapped model's parameters exist
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def prepare_input(self, raw_text):
        """
        Do some preparation jobs. Transform raw data into input vectors.
        """
        if not self.data_ready:
            # To do: These need to be dropped out from here. (below)
            if not os.path.exists("cache/prep.pt"):
                self.preprocess()
            objects = torch.load("cache/prep.pt")
            word_dict = objects["word_dict"]
            char_dict = objects["char_dict"]
            max_word_len = objects["max_word_len"]
            self.data_ready = True
            print("word/char dictionary built. Start making inputs.")

            if not os.path.exists("cache/data_sets.pt"):
                train_text = read_data("./train.txt")
                valid_text = read_data("./valid.txt")
                test_text = read_data("./tests.txt")

                # To do: These need to be dropped out from here. (above)

                input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))

                # Labels are next-word indices in word_dict, with the same length as the inputs
                input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])

                category = {"features": input_vec, "label": input_label}
                torch.save(category, "cache/data_sets.pt")
            else:
                data_sets = torch.load("cache/data_sets.pt")
                input_vec = data_sets["features"]
                input_label = data_sets["label"]

            # built inside the if-block: on later calls self.data is already populated
            DataTuple = namedtuple("DataTuple", ["feature", "label"])
            self.data = DataTuple(feature=input_vec, label=input_label)

        return self.data.feature, self.data.label

    def mode(self, test=False):
        # assumption: toggle the wrapped model's train/eval state
        # (the trainer calls network.mode(test=False) before training)
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        # detach hidden state of LSTM from last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def loss(self, truth, predict):
        # argument order matches the trainer's call: network.loss(batch_y, prediction)
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss

    @staticmethod
    def preprocess():
        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
        num_char = len(char_dict)
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        # dict of (int, string)
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
            "max_word_len": max_word_len
        }
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")


"""
Global Functions
"""


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches along the first dimension
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
    """Return a list of int lists, one per word, each of length max_word_len + 2 (BOW and EOW added)."""
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def read_data(file_name):
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict


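# to_var wraps a tensor in an autograd Variable (the pre-0.4 PyTorch API),
# moving it to the GPU first when one is available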
def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


class Highway(nn.Module):
    """Highway network"""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
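        # t = sigmoid(fc1(x)) is the transform gate: the output mixes the
        # transformed signal relu(fc2(x)) with the untouched input x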
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: number of unique words
        num_char: number of unique characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes;
        # nn.ModuleList registers them as sub-modules, so that parameters()
        # (and hence the optimizer) can see them, unlike a plain Python list
        self.convolutions = nn.ModuleList()

        # list of tuples: (number of filters, filter width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,  # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])
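        # 25 + 50 + 75 + 100 + 125 + 150 = 525 feature maps feed the highway layers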

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size] (batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
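As a sanity check on the shape annotations in charLM.forward, here is a small standalone run with toy dimensions; every size below is an arbitrary test value, not something fixed by the commit:

import torch
from torch.autograd import Variable
from model.char_language_model import charLM

# toy setup: 3 sequences of 5 words, each word padded to 10 char slots (max_word_len + BOW/EOW)
net = charLM(char_emb_dim=15, word_emb_dim=64, vocab_size=100, num_char=60, use_gpu=False)
x = Variable(torch.LongTensor(3, 5, 10).random_(0, 60))
hidden = (Variable(torch.zeros(2, 3, 64)),   # (num_layers, batch, hidden_size)
          Variable(torch.zeros(2, 3, 64)))

out, hidden = net(x, hidden)
print(out.size())  # expected: [num_seq*seq_len, vocab_size] = (15, 100)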

reproduction/Char-aware_NLM/train.py (+3, -3)

@@ -135,9 +135,9 @@ def train(net, data, opt):
##################################################
#################### Training ####################
net.train()
-optimizer = optim.SGD(net.parameters(),
-                      lr = learning_rate,
-                      momentum=0.85)
+optimizer = optim.SGD(net.parameters(),
+                      lr = learning_rate,
+                      momentum=0.85)

# split the first dim
input_generator = batch_generator(train_input, opt.lstm_batch_size)

