From 6b357bec40d7dc271da098f61ef4507fbecac516 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Wed, 23 May 2018 17:48:26 +0800 Subject: [PATCH] design intermediate controller between trainer and pytorch model --- .idea/fastNLP.iml | 2 +- .idea/misc.xml | 2 +- action/trainer.py | 16 +- model/__init__.py | 0 model/base_model.py | 5 +- model/char_language_model.py | 342 +++++++++++++++++++++++++++ reproduction/Char-aware_NLM/train.py | 6 +- 7 files changed, 362 insertions(+), 11 deletions(-) create mode 100644 model/__init__.py create mode 100644 model/char_language_model.py diff --git a/.idea/fastNLP.iml b/.idea/fastNLP.iml index 67116063..29a6ed9a 100644 --- a/.idea/fastNLP.iml +++ b/.idea/fastNLP.iml @@ -2,7 +2,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 470d1301..421ed102 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,4 @@ - + \ No newline at end of file diff --git a/action/trainer.py b/action/trainer.py index cd82e544..15f28583 100644 --- a/action/trainer.py +++ b/action/trainer.py @@ -19,9 +19,10 @@ class Trainer(Action): self.save_when_better = self.train_args.save_when_better def train(self, network, data, dev_data): - X, Y = network.prepare_input(data) + train_x, train_y = network.prepare_input(data.train_set, data.train_label) + valid_x, valid_y = network.prepare_input(dev_data.valid_set, dev_data.valid_label) - iterations, train_batch_generator = self.batchify(X, Y) + iterations, train_batch_generator = self.batchify(train_x, train_y) loss_history = list() network.mode(test=False) @@ -33,15 +34,18 @@ class Trainer(Action): for step in range(iterations): batch_x, batch_y = train_batch_generator.__next__() + prediction = network.data_forward(batch_x) + loss = network.loss(batch_y, prediction) network.grad_backward() loss_history.append(loss) self.log(self.make_log(epoch, step, loss)) - # evaluate over dev set + #################### evaluate over dev set ################### if self.validate: - evaluator.test(network, dev_data) + evaluator.test(network, [valid_x, valid_y]) + self.log(self.make_valid_log(epoch, evaluator.loss)) if evaluator.loss < best_loss: best_loss = evaluator.loss @@ -50,6 +54,10 @@ class Trainer(Action): # finish training + @staticmethod + def prepare_training(network, data): + return network.prepare_training(data) + def make_log(self, *args): print("logged") diff --git a/model/__init__.py b/model/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/model/base_model.py b/model/base_model.py index 28d1fe1e..92d4068a 100644 --- a/model/base_model.py +++ b/model/base_model.py @@ -2,7 +2,7 @@ import numpy as np class BaseModel(object): - """base model for all models""" + """PyTorch base model for all models""" def __init__(self): pass @@ -17,7 +17,8 @@ class BaseModel(object): def mode(self, test=False): raise NotImplementedError - def data_forward(self, x): + def data_forward(self, *x): + # required by PyTorch nn raise NotImplementedError def grad_backward(self): diff --git a/model/char_language_model.py b/model/char_language_model.py new file mode 100644 index 00000000..f5b5e09b --- /dev/null +++ b/model/char_language_model.py @@ -0,0 +1,342 @@ +import os +from collections import namedtuple + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.autograd import Variable + +from model.base_model import BaseModel + + +class CharLM(BaseModel): + """ + Controller of the Character-level Neural Language Model + """ + + def __init__(self): + 
super(CharLM, self).__init__()
+        """
+        Settings
+        """
+        self.word_embed_dim = 300
+        self.char_embedding_dim = 15
+        self.cnn_batch_size = 700
+        self.lstm_seq_len = 35
+        self.lstm_batch_size = 20
+        self.vocab_size = 100
+        self.num_char = 150
+        self.learning_rate = 0.1  # default learning rate for the SGD optimizer below
+
+        self.data = None  # named tuple to store all data set
+        self.data_ready = False
+        self.criterion = nn.CrossEntropyLoss()
+        self._loss = None
+        self.use_gpu = False
+
+        # word_emb_dim == hidden_size / num of hidden units
+        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
+                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))
+
+        self.model = charLM(self.char_embedding_dim,
+                            self.word_embed_dim,
+                            self.vocab_size,
+                            self.num_char,
+                            use_gpu=self.use_gpu)
+        # the optimizer is created after the model so that it sees the model's parameters
+        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)
+
+    def prepare_input(self, raw_text):
+        """
+        Do some preparation jobs. Transform raw data into input vectors.
+        """
+        if not self.data_ready:
+            # To do: These need to be dropped out from here. (below)
+            if os.path.exists("cache/prep.pt") is False:
+                self.preprocess()
+            objects = torch.load("cache/prep.pt")
+            word_dict = objects["word_dict"]
+            char_dict = objects["char_dict"]
+            max_word_len = objects["max_word_len"]
+            self.data_ready = True
+            print("word/char dictionary built. Start making inputs.")
+
+            if os.path.exists("cache/data_sets.pt") is False:
+                train_text = read_data("./train.txt")
+                valid_text = read_data("./valid.txt")
+                test_text = read_data("./tests.txt")
+
+                # To do: These need to be dropped out from here. (above)
+
+                input_vec = np.array(text2vec(raw_text, char_dict, max_word_len))
+
+                # Labels are next-word index in word_dict with the same length as inputs
+                input_label = np.array([word_dict[w] for w in raw_text[1:]] + [word_dict[raw_text[-1]]])
+
+                category = {"features": input_vec, "label": input_label}
+                torch.save(category, "cache/data_sets.pt")
+            else:
+                data_sets = torch.load("cache/data_sets.pt")
+                input_vec = data_sets["features"]
+                input_label = data_sets["label"]
+
+            DataTuple = namedtuple("DataTuple", ["feature", "label"])
+            self.data = DataTuple(feature=input_vec, label=input_label)
+
+        return self.data.feature, self.data.label
+
+    def mode(self, test=False):
+        # switch the inner PyTorch model between training and evaluation mode
+        if test:
+            self.model.eval()
+        else:
+            self.model.train()
+
+    def data_forward(self, x):
+        # detach hidden state of LSTM from last batch
+        hidden = [state.detach() for state in self.hidden]
+        output, self.hidden = self.model(to_var(x), hidden)
+        return output
+
+    def grad_backward(self):
+        self.model.zero_grad()
+        self._loss.backward()
+        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
+        self.optimizer.step()
+
+    def loss(self, truth, predict):
+        # keep the computed loss so that grad_backward() can back-propagate it;
+        # argument order (truth, predict) matches the call in Trainer.train
+        self._loss = self.criterion(predict, to_var(truth))
+        return self._loss
+
+    @staticmethod
+    def preprocess():
+        word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "tests.txt")
+        num_char = len(char_dict)
+        char_dict["BOW"] = num_char + 1
+        char_dict["EOW"] = num_char + 2
+        char_dict["PAD"] = 0
+        # dict of (int, string)
+        reverse_word_dict = {value: key for key, value in word_dict.items()}
+        max_word_len = max([len(word) for word in word_dict])
+        objects = {
+            "word_dict": word_dict,
+            "char_dict": char_dict,
+            "reverse_word_dict": reverse_word_dict,
+            "max_word_len": max_word_len
+        }
+        torch.save(objects, "cache/prep.pt")
+        print("Preprocess done.")
+
+    def forward(self, x, hidden):
+        # the controller itself does no computation; delegate to the wrapped network
+        return self.model(x, hidden)
+
+
+"""
+    Global Functions
+"""
+
+
+def batch_generator(x, batch_size):
+    # x: [num_words, in_channel, height, width]
+    # partitions x into batches
+    num_step = x.size()[0] // batch_size
+    for t in range(num_step):
+        yield x[t * batch_size:(t + 1) * batch_size]
+
+
+def text2vec(words, char_dict, max_word_len):
+    """ Return a list of lists of character indices, one list per word. """
+    word_vec = []
+    for word in words:
+        vec = [char_dict[ch] for ch in word]
+        if len(vec) < max_word_len:
+            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
+        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
+        word_vec.append(vec)
+    return word_vec
+
+
+def read_data(file_name):
+    with open(file_name, 'r') as f:
+        corpus = f.read().lower()
+    import re
+    corpus = re.sub(r"<unk>", "unk", corpus)
+    return corpus.split()
+
+
+def get_char_dict(vocabulary):
+    char_dict = dict()
+    count = 1
+    for word in vocabulary:
+        for ch in word:
+            if ch not in char_dict:
+                char_dict[ch] = count
+                count += 1
+    return char_dict
+
+
+def create_word_char_dict(*file_name):
+    text = []
+    for file in file_name:
+        text += read_data(file)
+    word_dict = {word: ix for ix, word in enumerate(set(text))}
+    char_dict = get_char_dict(word_dict)
+    return word_dict, char_dict
+
+
+def to_var(x):
+    if torch.cuda.is_available():
+        x = x.cuda()
+    return Variable(x)
+
+
+class Highway(nn.Module):
+    """Highway network"""
+
+    def __init__(self, input_size):
+        super(Highway, self).__init__()
+        self.fc1 = nn.Linear(input_size, input_size, bias=True)
+        self.fc2 = nn.Linear(input_size, input_size, bias=True)
+
+    def forward(self, x):
+        t = F.sigmoid(self.fc1(x))
+        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
+
+
+class charLM(nn.Module):
+    """Character-level Neural Language Model
+    CNN + highway network + LSTM
+    # Input:
+        4D tensor with shape [batch_size, in_channel, height, width]
+    # Output:
+        2D Tensor with shape [batch_size, vocab_size]
+    # Arguments:
+        char_emb_dim: the size of each character's embedding
+        word_emb_dim: the size of each word's embedding
+        vocab_size: num of unique words
+        num_char: num of characters
+        use_gpu: True or False
+    """
+
+    def __init__(self, char_emb_dim, word_emb_dim,
+                 vocab_size, num_char, use_gpu):
+        super(charLM, self).__init__()
+        self.char_emb_dim = char_emb_dim
+        self.word_emb_dim = word_emb_dim
+        self.vocab_size = vocab_size
+
+        # char embedding layer
+        self.char_embed = nn.Embedding(num_char, char_emb_dim)
+
+        # convolutions of filters with different sizes,
+        # registered as sub-modules so that parameters() picks up their weights
+        self.convolutions = nn.ModuleList()
+
+        # list of tuples: (the number of filter, width)
+        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
+
+        for out_channel, filter_width in self.filter_num_width:
+            self.convolutions.append(
+                nn.Conv2d(
+                    1,  # in_channel
+                    out_channel,  # out_channel
+                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
+                    bias=True
+                )
+            )
+
+        self.highway_input_dim = sum([x for x, y in self.filter_num_width])
+
+        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)
+
+        # highway net
+        self.highway1 = Highway(self.highway_input_dim)
+        self.highway2 = Highway(self.highway_input_dim)
+
+        # LSTM
+        self.lstm_num_layers = 2
+
+        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
+                            hidden_size=self.word_emb_dim,
+                            num_layers=self.lstm_num_layers,
+                            bias=True,
+                            dropout=0.5,
+                            batch_first=True)
+
+        # output layer
+        self.dropout = nn.Dropout(p=0.5)
+        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)
+
+        if use_gpu is True:
+            for x in range(len(self.convolutions)):
+                self.convolutions[x] = self.convolutions[x].cuda()
+            self.highway1 = self.highway1.cuda()
+            self.highway2 = self.highway2.cuda()
+            self.lstm = self.lstm.cuda()
+            self.dropout = self.dropout.cuda()
+            self.char_embed = self.char_embed.cuda()
+            self.linear = self.linear.cuda()
+            self.batch_norm = self.batch_norm.cuda()
+
+    def forward(self, x, hidden):
+        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
+        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
+        lstm_batch_size = x.size()[0]
+        lstm_seq_len = x.size()[1]
+
+        x = x.contiguous().view(-1, x.size()[2])
+        # [num_seq*seq_len, max_word_len+2]
+
+        x = self.char_embed(x)
+        # [num_seq*seq_len, max_word_len+2, char_emb_dim]
+
+        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
+        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]
+
+        x = self.conv_layers(x)
+        # [num_seq*seq_len, total_num_filters]
+
+        x = self.batch_norm(x)
+        # [num_seq*seq_len, total_num_filters]
+
+        x = self.highway1(x)
+        x = self.highway2(x)
+        # [num_seq*seq_len, total_num_filters]
+
+        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
+        # [num_seq, seq_len, total_num_filters]
+
+        x, hidden = self.lstm(x, hidden)
+        # [num_seq, seq_len, hidden_size] (batch_first=True)
+
+        x = self.dropout(x)
+        # [num_seq, seq_len, hidden_size]
+
+        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
+        # [num_seq*seq_len, hidden_size]
+
+        x = self.linear(x)
+        # [num_seq*seq_len, vocab_size]
+        return x, hidden
+
+    def conv_layers(self, x):
+        chosen_list = list()
+        for conv in self.convolutions:
+            feature_map = F.tanh(conv(x))
+            # (batch_size, out_channel, 1, max_word_len-width+1)
+            chosen = torch.max(feature_map, 3)[0]
+            # (batch_size, out_channel, 1)
+            chosen = chosen.squeeze()
+            # (batch_size, out_channel)
+            chosen_list.append(chosen)
+
+        # (batch_size, total_num_filters)
+        return torch.cat(chosen_list, 1)
diff --git a/reproduction/Char-aware_NLM/train.py b/reproduction/Char-aware_NLM/train.py
index 044786fe..3e6e253e 100644
--- a/reproduction/Char-aware_NLM/train.py
+++ b/reproduction/Char-aware_NLM/train.py
@@ -135,9 +135,9 @@ def train(net, data, opt):
     ##################################################
     #################### Training ####################
     net.train()
-    optimizer = optim.SGD(net.parameters(),
-                          lr = learning_rate,
-                          momentum=0.85)
+    optimizer = optim.SGD(net.parameters(),
+                          lr = learning_rate,
+                          momentum=0.85)
 
     # split the first dim
     input_generator = batch_generator(train_input, opt.lstm_batch_size)
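
Note on the intent of this controller design: as the trainer hunk above shows, Trainer only calls prepare_input / mode / data_forward / loss / grad_backward on CharLM, while CharLM owns the inner charLM network, its optimizer, and the LSTM hidden state. The snippet below is a minimal, self-contained shape check of the inner charLM network only; the dimensions and the dummy batch are illustrative and are not the defaults used by CharLM above.

    import torch
    from torch.autograd import Variable
    from model.char_language_model import charLM

    # tiny illustrative dimensions, not the CharLM defaults
    net = charLM(char_emb_dim=15, word_emb_dim=30, vocab_size=10, num_char=20, use_gpu=False)

    # dummy batch: 4 sequences of 5 words, each word padded to 8 characters (incl. BOW/EOW)
    x = Variable(torch.LongTensor(4, 5, 8).random_(0, 20))
    # hidden state: (num_layers, batch, hidden_size) for both h and c
    hidden = (Variable(torch.zeros(2, 4, 30)), Variable(torch.zeros(2, 4, 30)))

    out, hidden = net(x, hidden)
    print(out.size())  # torch.Size([20, 10]) == [num_seq * seq_len, vocab_size]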