From 58127d3c4ef652ffd066cd04a466c0a4b94dcdd2 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Wed, 30 May 2018 22:28:22 +0800
Subject: [PATCH] start building word seg (generally seq2seq) model

---
 action/action.py                         |   2 +-
 action/trainer.py                        |   8 +-
 loader/base_loader.py                    |   6 +
 model/base_model.py                      | 110 +++++++++++---
 model/word_seg_model.py                  | 135 ++++++++++++++++++
 .../CNN-sentence_classification/train.py |   9 +-
 tests/test_word_seg.py                   |  30 ++++
 7 files changed, 275 insertions(+), 25 deletions(-)
 create mode 100644 model/word_seg_model.py
 create mode 100644 tests/test_word_seg.py

diff --git a/action/action.py b/action/action.py
index 9c3f32f6..c85a74df 100644
--- a/action/action.py
+++ b/action/action.py
@@ -27,7 +27,7 @@ class Action(object):
         :return iteration:int, the number of step in each epoch
                 generator:generator, to generate batch inputs
         """
-        n_samples = X.size()[0]
+        n_samples = X.shape[0]
         num_iter = n_samples // batch_size
         if Y is None:
             generator = self._batch_generate(batch_size, num_iter, X)
diff --git a/action/trainer.py b/action/trainer.py
index 2584552b..6b0fc1f0 100644
--- a/action/trainer.py
+++ b/action/trainer.py
@@ -6,7 +6,7 @@ from .tester import Tester
 
 class Trainer(Action):
     """
-    Trainer for common training logic of all models
+    Trainer is a common training pipeline shared among all models.
     """
     TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
                                         "log_per_step", "log_validation", "batch_size"])
@@ -23,12 +23,12 @@ class Trainer(Action):
         self.log_validation = train_args.log_validation
         self.batch_size = train_args.batch_size
 
-    def train(self, network, train_data, dev_data):
+    def train(self, network, train_data, dev_data=None):
         """
         :param network: the model controller
         :param train_data: raw data for training
         :param dev_data: raw data for validation
-        :return:
+        This method calls all the base methods of the network (implemented in model.base_model).
         """
         train_x, train_y = network.prepare_input(train_data)
 
@@ -60,6 +60,8 @@ class Trainer(Action):
 
             #################### evaluate over dev set ###################
             if self.validate:
+                if dev_data is None:
+                    raise RuntimeError("No validation data provided.")
                 # give all controls to tester
                 evaluator.test(network, dev_data)
 
diff --git a/loader/base_loader.py b/loader/base_loader.py
index 9579a1e5..d087f9f2 100644
--- a/loader/base_loader.py
+++ b/loader/base_loader.py
@@ -14,6 +14,11 @@ class BaseLoader(object):
             text = f.read()
         return text
 
+    def load_lines(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            text = f.readlines()
+        return text
+
 
 class ToyLoader0(BaseLoader):
     """
@@ -29,3 +34,4 @@ class ToyLoader0(BaseLoader):
         import re
         corpus = re.sub(r"", "unk", corpus)
         return corpus.split()
+
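Note on loader/base_loader.py: load_lines() is a thin wrapper around readlines(), so every returned entry keeps its trailing newline. A minimal usage sketch (illustrative, not part of the patch), reusing the constructor arguments and data path that appear in tests/test_word_seg.py at the end of this patch:

from loader.base_loader import BaseLoader

# Same two-argument constructor call as in tests/test_word_seg.py (a name and a data path).
loader = BaseLoader("load_train", "./data_for_tests/cws_train")
lines = loader.load_lines()  # list of str, one entry per input line, newline characters kept
print(len(lines), repr(lines[0]))
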
diff --git a/model/base_model.py b/model/base_model.py
index facb82d9..07f3e6b2 100644
--- a/model/base_model.py
+++ b/model/base_model.py
@@ -2,29 +2,64 @@ import numpy as np
 
 
 class BaseModel(object):
-    """PyTorch base model for all models"""
+    """The base class of all models.
+    This class and its subclasses are actually "wrappers" of the PyTorch models.
+    They act as an interface between Trainer and the deep learning networks.
+    This interface provides the following methods to be called by Trainer.
+        - prepare_input
+        - mode
+        - define_optimizer
+        - data_forward
+        - grad_backward
+        - get_loss
+    """
 
     def __init__(self):
         pass
 
     def prepare_input(self, data):
         """
-        :param data: str, raw input vector(?)
+        Perform data transformation from raw input to vector/matrix inputs.
+        :param data: raw inputs
         :return (X, Y): tuple, input features and labels
         """
         raise NotImplementedError
 
     def mode(self, test=False):
+        """
+        Tell the underlying network whether it is in training or evaluation mode (required by PyTorch).
+        :param test: bool
+        """
+        raise NotImplementedError
+
+    def define_optimizer(self):
+        """
+        Define the PyTorch optimizer specified by the model.
+        """
         raise NotImplementedError
 
     def data_forward(self, *x):
+        """
+        Forward pass of the data.
+        :param x: input feature matrix (packed as variable arguments)
+        :return: output of the model
+        """
        # required by PyTorch nn
         raise NotImplementedError
 
     def grad_backward(self):
+        """
+        Run the backward pass and update the model parameters.
+        """
         raise NotImplementedError
 
     def get_loss(self, pred, truth):
+        """
+        Compute the loss between model prediction and ground truth, using the loss function specified by the model.
+        :param pred: prediction label vector
+        :param truth: ground truth label vector
+        :return: a scalar
+        """
         raise NotImplementedError
 
 
@@ -54,29 +89,70 @@ class ToyModel(BaseModel):
         self._loss = np.mean(np.square(pred - truth))
         return self._loss
 
+    def define_optimizer(self):
+        pass
+
 
 class Vocabulary(object):
-    """
-    A collection of lookup tables.
+    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
+    instance also provides access to the `StringStore`, and owns underlying
+    data that is shared between `Doc` objects.
     """
 
     def __init__(self):
-        self.word_set = None
-        self.word2idx = None
-        self.emb_matrix = None
-
-    def lookup(self, word):
-        if word in self.word_set:
-            return self.emb_matrix[self.word2idx[word]]
-        return LookupError("The key " + word + " does not exist.")
+        """Create the vocabulary.
+        RETURNS (Vocab): The newly constructed object.
+        """
+        self.data_frame = None
 
 
 class Document(object):
+    """A sequence of Token objects. Access sentences and named entities, export
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `Token` objects. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves. -- spacy
     """
-    contains a sequence of tokens
-    each token is a character with linguistic attributes
+
+    def __init__(self, vocab, words=None, spaces=None):
+        """Create a Doc object.
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
+        words (list or None): A list of unicode strings, to add to the document
+            as words. If `None`, defaults to empty list.
+        spaces (list or None): A list of boolean values, of the same length as
+            words. True means that the word is followed by a space, False means
+            it is not. If `None`, defaults to `[True]*len(words)`
+        user_data (dict or None): not accepted by this constructor (present in the spaCy API only).
+        RETURNS (Doc): The newly constructed object.
+        """
+        self.vocab = vocab
+        self.spaces = spaces
+        self.words = words
+        if spaces is None:
+            self.spaces = [True] * len(self.words)
+        elif len(spaces) != len(self.words):
+            raise ValueError("mismatch between spaces and words")
+
+    def get_chunker(self, vocab):
+        return None
+
+    def push_back(self, vocab):
+        pass
+
+
+class Token(object):
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc.
     """
-    def __init__(self):
-        # wrap pandas.dataframe
-        self.dataframe = None
+    def __init__(self, vocab, doc, offset):
+        """Construct a `Token` object.
+        vocab (Vocabulary): A storage container for lexical types.
+        doc (Document): The parent document.
+        offset (int): The index of the token within the document.
+        """
+        self.vocab = vocab
+        self.doc = doc
+        self.token = doc[offset]
+        self.i = offset
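To make the BaseModel contract above concrete, here is a rough sketch (illustrative only, not the actual Trainer code) of how a training pass is expected to drive these hooks; the batching mirrors the n_samples // batch_size logic in action/action.py, and the helper name is made up:

def run_one_epoch(network, train_data, batch_size):
    # raw data -> feature matrix X and label vector Y
    X, Y = network.prepare_input(train_data)
    network.mode(test=False)         # put the underlying PyTorch module in training mode
    network.define_optimizer()       # the model chooses its own optimizer
    n_batches = X.shape[0] // batch_size
    for i in range(n_batches):
        x = X[i * batch_size:(i + 1) * batch_size]
        y = Y[i * batch_size:(i + 1) * batch_size]
        pred = network.data_forward(x)    # forward pass
        loss = network.get_loss(pred, y)  # model-specific loss
        network.grad_backward()           # backward pass and parameter update
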
diff --git a/model/word_seg_model.py b/model/word_seg_model.py
new file mode 100644
index 00000000..38df7a52
--- /dev/null
+++ b/model/word_seg_model.py
@@ -0,0 +1,135 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.autograd import Variable
+
+from model.base_model import BaseModel
+
+USE_GPU = True
+
+
+def to_var(x):
+    if torch.cuda.is_available() and USE_GPU:
+        x = x.cuda()
+    return Variable(x)
+
+
+class WordSegModel(BaseModel):
+    """
+    Model controller for WordSeg
+    """
+
+    def __init__(self):
+        super(WordSegModel, self).__init__()
+        self.id2word = None
+        self.word2id = None
+        self.id2tag = None
+        self.tag2id = None
+
+        self.lstm_batch_size = 8
+        self.lstm_seq_len = 32  # Trainer batch_size == lstm_batch_size * lstm_seq_len
+        self.hidden_dim = 100
+        self.lstm_num_layers = 2
+        self.vocab_size = 100
+        self.word_emb_dim = 100
+
+        self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
+        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
+                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))
+
+        self.optimizer = None
+        self._loss = None
+
+    def prepare_input(self, data):
+        """
+        Perform a word-index lookup to convert strings into index arrays.
+        :param data: list of string; each line holds a character and its [B, M, E, S] tag in '#'-separated fields
+        :return (words, tags): numpy arrays of word indices and tag indices
+        """
+        word_list = []
+        tag_list = []
+        for line in data:
+            if len(line) > 2:
+                tokens = line.split("#")
+                word_list.append(tokens[0])
+                tag_list.append(tokens[2][0])
+        self.id2word = list(set(word_list))
+        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
+        self.id2tag = list(set(tag_list))
+        self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
+        words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
+        tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
+        return words, tags
+
+    def mode(self, test=False):
+        if test:
+            self.model.eval()
+        else:
+            self.model.train()
+
+    def data_forward(self, x):
+        """
+        :param x: a flat batch of word indices, reshaped to [lstm_batch_size, lstm_seq_len]
+        :return: output of shape [lstm_batch_size * lstm_seq_len, vocab_size]
+        """
+        x = to_var(torch.LongTensor(x.reshape(self.lstm_batch_size, self.lstm_seq_len)))
+        output, self.hidden = self.model(x, self.hidden)
+        return output
+
+    def define_optimizer(self):
+        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)
+
+    def get_loss(self, pred, truth):
+        # nn.CrossEntropyLoss must be instantiated before it is applied to (pred, truth)
+        self._loss = nn.CrossEntropyLoss()(pred, truth)
+        return self._loss
+
+    def grad_backward(self):
+        self.model.zero_grad()
+        self._loss.backward()
+        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
+        self.optimizer.step()
+
+
+class WordSeg(nn.Module):
+    """
+    PyTorch Network for word segmentation
+    """
+
+    def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100):
+        super(WordSeg, self).__init__()
+
+        self.vocab_size = vocab_size
+        self.word_emb_dim = word_emb_dim
+        self.lstm_num_layers = lstm_num_layers
+        self.hidden_dim = hidden_dim
+
+        self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim)
+
+        self.lstm = nn.LSTM(input_size=self.word_emb_dim,
+                            hidden_size=self.word_emb_dim,
+                            num_layers=self.lstm_num_layers,
+                            bias=True,
+                            dropout=0.5,
+                            batch_first=True)
+
+        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)
+
+    def forward(self, x, hidden):
+        """
+        :param x: tensor of shape [batch_size, seq_len], vocabulary indices
+        :param hidden: (hidden state, memory cell), each of shape [num_layers, batch_size, hidden_size]
+        :return x: unnormalized scores (logits) over the vocabulary
+                hidden: (hidden state, memory cell) returned by the LSTM
+        """
+        # [batch_size, seq_len]
+        x = self.word_emb(x)
+        # [batch_size, seq_len, word_emb_size]
+        x, hidden = self.lstm(x, hidden)
+        # [batch_size, seq_len, word_emb_size]
+        x = x.contiguous().view(x.shape[0] * x.shape[1], -1)
+        # [batch_size*seq_len, word_emb_size]
+        x = self.linear(x)
+        # [batch_size*seq_len, vocab_size]
+        return x, hidden
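For readers checking tensor shapes, the following standalone snippet (illustrative, not part of the patch) runs the WordSeg module with the same hyper-parameters that WordSegModel hard-codes; the random indices are placeholders for real word ids:

import torch
from torch.autograd import Variable

from model.word_seg_model import WordSeg

batch_size, seq_len, vocab_size, emb_dim, num_layers = 8, 32, 100, 100, 2
net = WordSeg(hidden_dim=100, lstm_num_layers=num_layers,
              vocab_size=vocab_size, word_emb_dim=emb_dim)
x = Variable(torch.LongTensor(batch_size, seq_len).random_(0, vocab_size))
hidden = (Variable(torch.zeros(num_layers, batch_size, emb_dim)),
          Variable(torch.zeros(num_layers, batch_size, emb_dim)))
out, hidden = net(x, hidden)
print(out.size())  # torch.Size([256, 100]) == (batch_size * seq_len, vocab_size)
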
diff --git a/reproduction/CNN-sentence_classification/train.py b/reproduction/CNN-sentence_classification/train.py
index ca4ea96e..763165d4 100644
--- a/reproduction/CNN-sentence_classification/train.py
+++ b/reproduction/CNN-sentence_classification/train.py
@@ -1,5 +1,6 @@
 import os
 
+import import
 import torch
 import torch.nn as nn
 
@@ -54,10 +55,10 @@ for epoch in range(num_epochs):
     cnn.train()
     for i, (sents,labels) in enumerate(train_loader):
         sents = Variable(sents)
-        labels = Variable(labels)
-        if cuda:
-            sents = sents.cuda()
-            labels = labels.cuda()
+        labels = Variable(labels)
+        if cuda:
+            sents = sents.cuda()
+            labels = labels.cuda()
         optimizer.zero_grad()
         outputs = cnn(sents)
         loss = criterion(outputs, labels)
diff --git a/tests/test_word_seg.py b/tests/test_word_seg.py
new file mode 100644
index 00000000..1c94327d
--- /dev/null
+++ b/tests/test_word_seg.py
@@ -0,0 +1,30 @@
+from action.tester import Tester
+from action.trainer import Trainer
+from loader.base_loader import BaseLoader
+from model.word_seg_model import WordSegModel
+
+
+def test_charlm():
+    train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
+                                       log_per_step=10, log_validation=False, batch_size=256)  # == lstm_batch_size * lstm_seq_len
+    trainer = Trainer(train_config)
+
+    model = WordSegModel()
+
+    train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines()
+
+    trainer.train(model, train_data)
+
+    trainer.save_model(model)
+
+    test_config = Tester.TestConfig(save_output=False, validate_in_training=False,
+                                    save_dev_input=False, save_loss=False, batch_size=256)
+    tester = Tester(test_config)
+
+    test_data = BaseLoader("load_test", "./data_for_tests/cws_test").load_lines()
+
+    tester.test(model, test_data)
+
+
+if __name__ == "__main__":
+    test_charlm()
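A note on the batch size used in the test configs above: WordSegModel.data_forward reshapes every flat batch of word indices into an (lstm_batch_size, lstm_seq_len) matrix before feeding the LSTM, so the Trainer and Tester batch size has to equal their product, as the comment in word_seg_model.py states. The snippet below (illustrative, not part of the patch) only restates that arithmetic:

# Constraint restated from word_seg_model.py: each flat batch must fill an 8 x 32 matrix.
lstm_batch_size, lstm_seq_len = 8, 32
required_batch_size = lstm_batch_size * lstm_seq_len
print(required_batch_size)  # 256; a 254-element batch could not be reshaped to (8, 32)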