restructure files & add "modules" directory & add CRF.py

7 years ago · b5dbd3cddc
--- a/fastNLP/action/README.md
+++ b/fastNLP/action/README.md
@@ -0,0 +1,8 @@
 SpaCy "Doc"
 https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80

 SpaCy "Vocab"
 https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25

 SpaCy "Token"
 https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27
--- a/fastNLP/action/init.py
+++ b/fastNLP/action/init.py
--- a/fastNLP/action/action.py
+++ b/fastNLP/action/action.py
@@ -0,0 +1,46 @@
 from saver.logger import Logger


 class Action(object):
    """
        Common parent of Trainer and Tester: holds a logger and the mini-batch helpers.
    """

    def __init__(self):
        super(Action, self).__init__()
        # every string given to log() is forwarded to this Logger instance
        self.logger = Logger("logger_output.txt")

    def load_config(self, args):
        """Hook for subclasses; the base class provides no implementation."""
        raise NotImplementedError

    def load_dataset(self, args):
        """Hook for subclasses; the base class provides no implementation."""
        raise NotImplementedError

    def log(self, string):
        """Forward ``string`` to the logger created in ``__init__``."""
        self.logger.log(string)

    def batchify(self, batch_size, X, Y=None):
        """
        Split the data into consecutive batches of ``batch_size`` rows.

        :param batch_size: int
        :param X: feature matrix; only ``X.shape[0]`` and slicing are used
        :param Y: label vector sliced in lock-step with X (optional)
        :return iteration:int, ``X.shape[0] // batch_size`` (a trailing partial batch is not produced)
                 generator:generator, yields a tuple ``(x_batch,)`` or ``(x_batch, y_batch)``
        """
        num_iter = X.shape[0] // batch_size
        arrays = (X,) if Y is None else (X, Y)
        return num_iter, self._batch_generate(batch_size, num_iter, *arrays)

    @staticmethod
    def _batch_generate(batch_size, num_iter, *data):
        """Yield ``num_iter`` tuples, each holding the same row window of every array in ``data``."""
        lower = 0
        for _ in range(num_iter):
            upper = lower + batch_size
            yield tuple(arr[lower:upper] for arr in data)
            lower = upper

    def make_log(self, *args):
        """Placeholder formatter: ignores its arguments and returns a constant string."""
        return "log"
--- a/fastNLP/action/tester.py
+++ b/fastNLP/action/tester.py
@@ -0,0 +1,87 @@
 from collections import namedtuple

 import numpy as np

 from fastNLP.action import Action


 class Tester(Action):
    """Runs a network over a data set in test mode, optionally recording predictions and the mean loss."""

    TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
                                       "save_loss", "batch_size"])

    def __init__(self, test_args):
        """
        :param test_args: named tuple carrying the fields listed in ``TestConfig``
        """
        super(Tester, self).__init__()
        # options copied from the configuration tuple
        self.validate_in_training = test_args.validate_in_training
        self.save_dev_input = test_args.save_dev_input
        self.save_output = test_args.save_output
        self.save_loss = test_args.save_loss
        self.batch_size = test_args.batch_size
        # results and cached inputs, filled in by test()
        self.valid_x = None
        self.valid_y = None
        self.output = None
        self.mean_loss = None

    def test(self, network, data):
        """
        Evaluate ``network`` on ``data``.

        :param network: object exposing mode / prepare_input / data_forward / get_loss
        :param data: raw data handed to ``network.prepare_input``
        Depending on the flags, the concatenated predictions are stored in ``self.output``
        and the mean of the per-batch losses in ``self.mean_loss``.
        """
        print("testing")
        network.mode(test=True)  # turn on the testing mode

        if self.save_dev_input and self.valid_x is not None:
            # prepared inputs were cached by an earlier call; reuse them
            valid_x, valid_y = self.valid_x, self.valid_y
        else:
            valid_x, valid_y = network.prepare_input(data)
            if self.save_dev_input:
                self.valid_x, self.valid_y = valid_x, valid_y

        # split into batches by self.batch_size
        iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)

        predictions = []
        losses = []
        # turn on the testing mode of the network
        network.mode(test=True)

        for step in range(iterations):
            batch_x, batch_y = next(test_batch_generator)

            # forward pass from test input to predicted output
            prediction = network.data_forward(batch_x)
            loss = network.get_loss(prediction, batch_y)

            if self.save_output:
                predictions.append(prediction.data)
            if self.save_loss:
                losses.append(loss)
                self.log(self.make_log(step, loss))

        if self.save_loss:
            self.mean_loss = np.mean(np.array(losses))
        if self.save_output:
            self.output = self.make_output(predictions)

    @property
    def loss(self):
        """Mean loss recorded by the last ``test`` call (``None`` until one is stored)."""
        return self.mean_loss

    @property
    def result(self):
        """Predictions recorded by the last ``test`` call (``None`` until they are stored)."""
        return self.output

    @staticmethod
    def make_output(batch_outputs):
        """Join the per-batch predictions along axis 0 into one array."""
        return np.concatenate(batch_outputs, axis=0)

    def load_config(self, args):
        raise NotImplementedError

    def load_dataset(self, args):
        raise NotImplementedError
--- a/fastNLP/action/trainer.py
+++ b/fastNLP/action/trainer.py
@@ -0,0 +1,93 @@
 from collections import namedtuple

 from .action import Action
 from .tester import Tester


 class Trainer(Action):
    """
        Trainer is a common training pipeline shared among all models.
    """
    TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
                                        "log_per_step", "log_validation", "batch_size"])

    def __init__(self, train_args):
        """
        :param train_args: namedtuple carrying the fields listed in ``TrainConfig``
        """
        super(Trainer, self).__init__()
        self.n_epochs = train_args.epochs
        self.validate = train_args.validate
        self.save_when_better = train_args.save_when_better
        self.log_per_step = train_args.log_per_step
        self.log_validation = train_args.log_validation
        self.batch_size = train_args.batch_size

    def train(self, network, train_data, dev_data=None):
        """
        Run the full training loop.

        :param network: the models controller
        :param train_data: raw data for training
        :param dev_data: raw data for validation
        :raises RuntimeError: if validation is enabled but ``dev_data`` is None
        This method will call all the base methods of network (implemented in models.base_model).
        """
        train_x, train_y = network.prepare_input(train_data)

        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
        evaluator = Tester(test_args)

        best_loss = 1e10
        loss_history = list()

        for epoch in range(self.n_epochs):
            network.mode(test=False)  # turn on the train mode

            network.define_optimizer()

            # A generator can be consumed only once, so a fresh one is built for
            # every epoch; reusing a single generator raises StopIteration as soon
            # as the second epoch asks for its first batch.
            iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)

            # zip() caps the loop at `iterations` steps and also stops cleanly
            # if the generator yields fewer batches.
            for step, (batch_x, batch_y) in zip(range(iterations), train_batch_generator):
                prediction = network.data_forward(batch_x)

                loss = network.get_loss(prediction, batch_y)
                network.grad_backward()

                if step % self.log_per_step == 0:
                    print("step ", step)
                    loss_history.append(loss)
                    self.log(self.make_log(epoch, step, loss))

            #################### evaluate over dev set  ###################
            if self.validate:
                if dev_data is None:
                    raise RuntimeError("No validation data provided.")
                # give all controls to tester
                evaluator.test(network, dev_data)

                if self.log_validation:
                    self.log(self.make_valid_log(epoch, evaluator.loss))
                if evaluator.loss < best_loss:
                    best_loss = evaluator.loss
                    if self.save_when_better:
                        self.save_model(network)

        # finish training

    def make_log(self, *args):
        """Placeholder formatter for training steps: ignores its arguments."""
        return "make a log"

    def make_valid_log(self, *args):
        """Placeholder formatter for validation results: ignores its arguments."""
        return "make a valid log"

    def save_model(self, model):
        """Delegate persistence to the model's own ``save`` method."""
        model.save()

    def load_data(self, data_name):
        """Placeholder: only prints a message."""
        print("load data")

    def load_config(self, args):
        raise NotImplementedError

    def load_dataset(self, args):
        raise NotImplementedError
--- a/fastNLP/loader/init.py
+++ b/fastNLP/loader/init.py
--- a/fastNLP/loader/base_loader.py
+++ b/fastNLP/loader/base_loader.py
@@ -0,0 +1,36 @@
 class BaseLoader(object):
    """Base class of all loaders: remembers a data name and path and reads the file as UTF-8 text."""

    def __init__(self, data_name, data_path):
        """
        :param data_name: label stored on the loader as ``self.data_name``
        :param data_path: path of the file that ``load`` / ``load_lines`` read
        """
        super(BaseLoader, self).__init__()
        self.data_name = data_name
        self.data_path = data_path

    def load(self):
        """
        :return: string, the whole content of the file
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            text = f.read()
        return text

    def load_lines(self):
        """
        :return: list of strings, one per line of the file (line endings kept)
        """
        # the encoding name must be "utf-8"; the former "utf=8" is not a known
        # codec and made open() raise LookupError on every call
        with open(self.data_path, "r", encoding="utf-8") as f:
            text = f.readlines()
        return text


 class ToyLoader0(BaseLoader):
    """
        Corpus loader for charLM: returns the file as a list of lower-cased words.
    """

    def __init__(self, name, path):
        super().__init__(name, path)

    def load(self):
        """
        :return: list of strings, the lower-cased text split on whitespace,
                 with every "<unk>" rewritten to "unk"
        """
        with open(self.data_path, 'r') as f:
            lowered = f.read().lower()
        # the pattern has no regex metacharacters, so a plain replace is equivalent
        return lowered.replace("<unk>", "unk").split()
--- a/fastNLP/loader/config_loader.py
+++ b/fastNLP/loader/config_loader.py
@@ -0,0 +1,13 @@
 from loader.base_loader import BaseLoader


 class ConfigLoader(BaseLoader):
    """loader for configuration files"""

    # The constructor used to be spelled ``__int__``, so it was never invoked on
    # instantiation and ``self.config`` was never populated.
    def __init__(self, data_name, data_path):
        """
        Read the file at ``data_path`` and store the parsed result in ``self.config``.

        :param data_name: label stored on the loader
        :param data_path: path of the configuration file
        :raises NotImplementedError: unless a subclass overrides ``parse``
        """
        super(ConfigLoader, self).__init__(data_name, data_path)
        self.config = self.parse(super(ConfigLoader, self).load())

    @staticmethod
    def parse(string):
        """
        Turn the raw file content into a configuration object; must be overridden.

        :param string: the whole content of the configuration file
        """
        raise NotImplementedError
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -0,0 +1,47 @@
 from loader.base_loader import BaseLoader


 class DatasetLoader(BaseLoader):
    """"loader for data sets"""

    def __init__(self, data_name, data_path):
        # nothing is added here; construction is delegated to BaseLoader
        super().__init__(data_name, data_path)


 class ConllLoader(DatasetLoader):
    """loader for conll format files"""

    # The constructor used to be spelled ``__int__`` and therefore never ran,
    # leaving ``self.data_set`` undefined.
    def __init__(self, data_name, data_path):
        """
        :param  str data_name: the name of the conll data set
        :param str data_path: the path to the conll data set
        """
        super(ConllLoader, self).__init__(data_name, data_path)
        self.data_set = self.parse(self.load())

    def load(self):
        """
        :return: list lines: all lines in a conll file
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        return lines

    @staticmethod
    def parse(lines):
        """
        Group token lines into sentences.

        Lines starting with "#" are comments and are skipped. A blank
        (whitespace-only or empty) line ends the current sentence. Empty
        sentences are never emitted, and a final sentence that is not followed
        by a blank line is kept.

        :param list lines:a list containing all lines in a conll file.
        :return: a 3D list: sentences -> tokens -> whitespace-separated fields
        """
        sentences = []
        tokens = []
        for line in lines:
            # startswith() is safe on an empty string, unlike line[0]
            if line.startswith("#"):
                continue
            fields = line.split()
            if not fields:
                # sentence boundary; ignore repeated blank lines
                if tokens:
                    sentences.append(tokens)
                    tokens = []
                continue
            tokens.append(fields)
        # flush the last sentence when the input has no trailing blank line
        if tokens:
            sentences.append(tokens)
        return sentences
--- a/fastNLP/loader/embed_loader.py
+++ b/fastNLP/loader/embed_loader.py
@@ -0,0 +1,8 @@
 from loader.base_loader import BaseLoader


 class EmbedLoader(BaseLoader):
    """Loader placeholder for embedding files; adds nothing on top of BaseLoader."""

    def __init__(self, data_name, data_path):
        # construction is delegated unchanged to BaseLoader
        super().__init__(data_name, data_path)
--- a/fastNLP/models/init.py
+++ b/fastNLP/models/init.py
--- a/fastNLP/models/base_model.py
+++ b/fastNLP/models/base_model.py
@@ -0,0 +1,158 @@
 import numpy as np


 class BaseModel(object):
    """Abstract controller that every model wrapper derives from.

        Subclasses wrap a PyTorch network and expose the hooks that Trainer
        and Tester call:
        - prepare_input
        - mode
        - define_optimizer
        - data_forward
        - grad_backward
        - get_loss
        Each hook raises NotImplementedError here.
    """

    def __init__(self):
        super().__init__()

    def prepare_input(self, data):
        """
        Convert raw inputs into vector/matrix form.
        :param data: raw inputs
        :return (X, Y): tuple, input features and labels
        """
        raise NotImplementedError

    def mode(self, test=False):
        """
        Switch the wrapped network between training and testing behaviour.
        :param test: bool, True selects testing
        """
        raise NotImplementedError

    def define_optimizer(self):
        """
        Create the optimizer chosen by the concrete model.
        """
        raise NotImplementedError

    def data_forward(self, *x):
        """
        Run the forward pass.
        :param x: input feature matrix and label vector
        :return: output by the models
        """
        raise NotImplementedError

    def grad_backward(self):
        """
        Back-propagate the last loss and update the model parameters.
        """
        raise NotImplementedError

    def get_loss(self, pred, truth):
        """
        Evaluate the model-specific loss function.
        :param pred: prediction label vector
        :param truth: ground truth label vector
        :return: a scalar
        """
        raise NotImplementedError


 class ToyModel(BaseModel):
    """Minimal numpy linear model used to exercise the training pipeline."""

    def __init__(self):
        super().__init__()
        self.test_mode = False
        # random parameters of the linear map: 5 input features -> 1 output
        self.weight = np.random.rand(5, 1)
        self.bias = np.random.rand()
        self._loss = 0

    def prepare_input(self, data):
        """Treat the last column of ``data`` as labels and the rest as features."""
        features = data[:, :-1]
        labels = data[:, -1]
        return features, labels

    def mode(self, test=False):
        """Remember whether the model is in test mode."""
        self.test_mode = test

    def data_forward(self, x):
        """Return ``x @ weight + bias`` computed with numpy."""
        projected = np.matmul(x, self.weight)
        return projected + self.bias

    def grad_backward(self):
        """No real gradient step; only prints a message."""
        print("loss gradient backward")

    def get_loss(self, pred, truth):
        """Store and return the mean squared difference between ``pred`` and ``truth``."""
        residual = pred - truth
        self._loss = np.mean(np.square(residual))
        return self._loss

    def define_optimizer(self):
        """Nothing to set up for the toy model."""
        pass


 class Vocabulary(object):
    """Placeholder vocabulary container.

    The design follows spaCy's `Vocab` (a look-up table giving access to
    `Lexeme` objects and the `StringStore`); this class currently only holds
    an empty ``data_frame`` slot.
    """

    def __init__(self):
        """Create the vocabulary with no backing data yet."""
        super().__init__()
        self.data_frame = None


 class Document(object):
    """A sequence of words with per-word trailing-space flags, modelled on spaCy's `Doc`.

    The instance stores the vocabulary, the word list and the matching list of
    booleans telling whether each word is followed by a space.
    """

    def __init__(self, vocab, words=None, spaces=None):
        """Create a Doc object.
        vocab (Vocab): A vocabulary object, stored as ``self.vocab``.
        words (list or None): A list of unicode strings, to add to the document
            as words. If `None`, defaults to empty list.
        spaces (list or None): A list of boolean values, of the same length as
            words. True means that the word is followed by a space, False means
            it is not. If `None`, defaults to `[True]*len(words)`
        RAISES (ValueError): if `spaces` is given with a length different from `words`.
        RETURNS (Doc): The newly constructed object.
        """
        self.vocab = vocab
        # honour the documented default; len(None) used to raise TypeError here
        self.words = [] if words is None else words
        if spaces is None:
            spaces = [True] * len(self.words)
        elif len(spaces) != len(self.words):
            raise ValueError("dismatch spaces and words")
        self.spaces = spaces

    def get_chunker(self, vocab):
        """Placeholder: always returns None."""
        return None

    def push_back(self, vocab):
        """Placeholder: does nothing."""
        pass


 class Token(object):
    """A single token (word, punctuation symbol, whitespace, ...) viewed
    through its position in a parent document.
    """

    def __init__(self, vocab, doc, offset):
        """Construct a `Token` object.
            vocab (Vocabulary): A storage container for lexical types.
            doc (Document): The parent document; it is indexed with ``doc[offset]``.
            offset (int): The index of the token within the document.
        """
        # position first, then the element found at that position
        self.i = offset
        self.token = doc[offset]
        self.doc = doc
        self.vocab = vocab
--- a/fastNLP/models/char_language_model.py
+++ b/fastNLP/models/char_language_model.py
@@ -0,0 +1,354 @@
 import os
 from collections import namedtuple

 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.optim as optim
 from model.base_model import BaseModel
 from torch.autograd import Variable

 # Module-wide GPU switch: to_var() moves a tensor to CUDA only when this flag is
 # True and torch.cuda.is_available(); CharLM also copies it into self.use_gpu.
 USE_GPU = True


 class CharLM(BaseModel):
    """
        Controller of the Character-level Neural Language Model.

        Wraps the ``charLM`` network and implements the BaseModel hooks used by
        Trainer/Tester. Vocabulary information is produced by ``preprocess`` and
        cached in "cache/prep.pt".
        To do:
        - where the data goes, call data savers.
    """
    DataTuple = namedtuple("DataTuple", ["feature", "label"])

    def __init__(self, lstm_batch_size, lstm_seq_len):
        """
        :param lstm_batch_size: int, number of sequences fed to the LSTM per batch
        :param lstm_seq_len: int, number of words per sequence
        """
        super(CharLM, self).__init__()
        # Settings: should come from config loader or pre-processing
        self.word_embed_dim = 300
        self.char_embedding_dim = 15
        self.cnn_batch_size = lstm_batch_size * lstm_seq_len
        self.lstm_seq_len = lstm_seq_len
        self.lstm_batch_size = lstm_batch_size
        self.num_epoch = 10
        self.old_PPL = 100000
        self.best_PPL = 100000

        # These parameters are set by pre-processing.
        self.max_word_len = None
        self.num_char = None
        self.vocab_size = None
        self.preprocess("./data_for_tests/charlm.txt")

        self.data = None  # named tuple to store all data set
        self.data_ready = False
        self.criterion = nn.CrossEntropyLoss()
        self._loss = None
        # Only request GPU placement when CUDA is really present, mirroring the
        # check in to_var(); otherwise the network would be built with
        # use_gpu=True on a CPU-only host while its inputs stay on the CPU.
        self.use_gpu = USE_GPU and torch.cuda.is_available()

        # word_emb_dim == hidden_size / num of hidden units
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

        self.model = charLM(self.char_embedding_dim,
                            self.word_embed_dim,
                            self.vocab_size,
                            self.num_char,
                            use_gpu=self.use_gpu)
        for param in self.model.parameters():
            nn.init.uniform(param.data, -0.05, 0.05)

        self.learning_rate = 0.1
        self.optimizer = None

    def prepare_input(self, raw_text):
        """
        :param raw_text: raw input text consisting of words
        :return: torch.Tensor, torch.Tensor
        feature matrix, label vector
        This function is only called once in Trainer.train, but may called multiple times in Tester.test
        So Tester will save  test input for frequent calls.
        """
        if not os.path.exists("cache/prep.pt"):
            self.preprocess("./data_for_tests/charlm.txt")  # To do: This is not good. Need to fix..
        objects = torch.load("cache/prep.pt")
        word_dict = objects["word_dict"]
        char_dict = objects["char_dict"]
        max_word_len = self.max_word_len
        print("word/char dictionary built. Start making inputs.")

        words = raw_text
        input_vec = np.array(text2vec(words, char_dict, max_word_len))
        # Labels are next-word index in word_dict with the same length as inputs;
        # the last word has no successor, so its own index is used as its label.
        input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]])
        feature_input = torch.from_numpy(input_vec)
        label_input = torch.from_numpy(input_label)
        return feature_input, label_input

    def mode(self, test=False):
        """Put the wrapped network in eval mode when ``test`` is true, train mode otherwise."""
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        """
        :param x: Tensor of word rows; it is truncated to a multiple of lstm_seq_len
                  and viewed as [num_seq, lstm_seq_len, max_word_len+2]
        :return: output of the network for the batch
        """
        # additional processing of inputs after batching
        num_seq = x.size()[0] // self.lstm_seq_len
        x = x[:num_seq * self.lstm_seq_len, :]
        x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2)

        # detach hidden state of LSTM from last batch
        hidden = [state.detach() for state in self.hidden]
        output, self.hidden = self.model(to_var(x), hidden)
        return output

    def grad_backward(self):
        """Back-propagate the stored loss, clip the gradient norm to 5 and take an optimizer step."""
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()

    def get_loss(self, predict, truth):
        """Compute the cross-entropy loss, keep it for grad_backward and return its ``.data``."""
        self._loss = self.criterion(predict, to_var(truth))
        return self._loss.data  # No pytorch data structure exposed outsides

    def define_optimizer(self):
        """Create an SGD optimizer over the network parameters."""
        # redefine optimizer for every new epoch
        self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

    def save(self):
        """Placeholder: only prints a message."""
        print("network saved")
        # torch.save(self.models, "cache/models.pkl")

    def preprocess(self, all_text_files):
        """
        Build the word/char dictionaries from a text file and cache them in "cache/prep.pt".

        :param all_text_files: path passed on to create_word_char_dict
        Sets ``vocab_size``, ``num_char`` and ``max_word_len`` on the instance.
        """
        word_dict, char_dict = create_word_char_dict(all_text_files)
        num_char = len(char_dict)
        self.vocab_size = len(word_dict)
        # three special symbols are added next to the characters found in the text
        char_dict["BOW"] = num_char + 1
        char_dict["EOW"] = num_char + 2
        char_dict["PAD"] = 0
        self.num_char = num_char + 3
        reverse_word_dict = {value: key for key, value in word_dict.items()}
        self.max_word_len = max([len(word) for word in word_dict])
        objects = {
            "word_dict": word_dict,
            "char_dict": char_dict,
            "reverse_word_dict": reverse_word_dict,
        }
        # torch.save does not create missing directories; make sure the cache
        # folder exists so the first run does not fail.
        os.makedirs("cache", exist_ok=True)
        torch.save(objects, "cache/prep.pt")
        print("Preprocess done.")


 """
    Global Functions
 """


 def batch_generator(x, batch_size):
    """Yield consecutive slices of ``batch_size`` rows of ``x``; a trailing partial slice is not produced."""
    # x: [num_words, in_channel, height, width]
    full_batches = x.size()[0] // batch_size
    for begin in range(0, full_batches * batch_size, batch_size):
        yield x[begin:begin + batch_size]


 def text2vec(words, char_dict, max_word_len):
    """Encode every word as ``[BOW] + char ids (+ PAD up to max_word_len) + [EOW]``.

    Return list of list of int
    """

    def encode(word):
        ids = [char_dict[ch] for ch in word]
        shortfall = max_word_len - len(ids)
        if shortfall > 0:
            # right-pad short words; longer words are left untouched
            ids.extend([char_dict["PAD"]] * shortfall)
        return [char_dict["BOW"], *ids, char_dict["EOW"]]

    return [encode(word) for word in words]


 def read_data(file_name):
    """Read ``file_name``, lower-case it, rewrite "<unk>" to "unk" and return the whitespace-split words."""
    with open(file_name, 'r') as f:
        lowered = f.read().lower()
    # the pattern contains no regex metacharacters, so a plain replace is equivalent
    return lowered.replace("<unk>", "unk").split()


 def get_char_dict(vocabulary):
    """Map every distinct character of the given words to an id, numbered from 1 in first-seen order."""
    char_dict = {}
    for word in vocabulary:
        for ch in word:
            # the next free id is always one past the number of characters seen so far
            char_dict.setdefault(ch, len(char_dict) + 1)
    return char_dict


 def create_word_char_dict(*file_name):
    """Build the word and character dictionaries from one or more text files.

    :param file_name: paths of the text files, each read with read_data
    :return: (word_dict, char_dict); word_dict maps each distinct word to an
             index, char_dict is produced by get_char_dict over those words
    """
    text = []
    for file in file_name:
        text += read_data(file)
    # Enumerate the words in sorted order: iterating a bare set of strings
    # depends on per-process hash randomisation, which made the word (and hence
    # character) ids differ from one run to the next.
    word_dict = {word: ix for ix, word in enumerate(sorted(set(text)))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict


 def to_var(x):
    """Wrap ``x`` in a Variable, moving it to CUDA first when CUDA is available and USE_GPU is set."""
    move_to_gpu = torch.cuda.is_available() and USE_GPU
    return Variable(x.cuda() if move_to_gpu else x)


 """
    Neural Network
 """


 class Highway(nn.Module):
    """Highway layer: a sigmoid gate blends a ReLU-transformed input with the untouched input."""

    def __init__(self, input_size):
        super().__init__()
        # fc1 feeds the gate, fc2 feeds the transformed branch; both keep the input width
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        gate = F.sigmoid(self.fc1(x))
        transformed = F.relu(self.fc2(x))
        carried = torch.mul(1 - gate, x)
        return torch.mul(gate, transformed) + carried


 class charLM(nn.Module):
    """Character-level Neural Language Model
    CNN + highway network + LSTM
    # Input:
        3D tensor of character indices with shape [num_seq, seq_len, max_word_len+2]
    # Output:
        2D Tensor with shape [num_seq*seq_len, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the hidden size of the LSTM (size of each word's representation)
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False; the model is moved to CUDA only if CUDA is also available
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # list of tuples: (the number of filter, width)
        # self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
        self.filter_num_width = [(25, 1), (50, 2), (75, 3)]

        # Convolutions of filters with different sizes. They are kept in an
        # nn.ModuleList rather than a plain list so that their weights are
        # registered: a plain list hides them from parameters(), which means
        # the optimizer never updates them and .cuda()/state_dict() skip them.
        self.convolutions = nn.ModuleList(
            nn.Conv2d(
                1,  # in_channel
                out_channel,  # out_channel
                kernel_size=(char_emb_dim, filter_width),  # (height, width)
                bias=True
            )
            for out_channel, filter_width in self.filter_num_width
        )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2

        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        # Every sub-module is registered, so one call moves the whole network.
        # Guarded by is_available() so a CPU-only host does not crash.
        if use_gpu and torch.cuda.is_available():
            self.cuda()

    def forward(self, x, hidden):
        """
        :param x: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        :param hidden: (h_0, c_0) state passed to the LSTM
        :return: (logits, hidden); logits has shape [num_seq*seq_len, vocab_size]
        """
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]  (the LSTM is built with batch_first=True)

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        """Apply every convolution, max-pool over the width axis and concatenate the results.

        :param x: tensor of shape [batch_size, 1, char_emb_dim, max_word_len+2]
        :return: tensor of shape [batch_size, total_num_filters]
        """
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            # Drop only the height axis: a bare squeeze() would also remove the
            # batch axis when batch_size == 1 and break the concatenation below.
            chosen = chosen.squeeze(2)
            # (batch_size, out_channel)
            chosen_list.append(chosen)

        # (batch_size, total_num_filers)
        return torch.cat(chosen_list, 1)
--- a/fastNLP/models/word_seg_model.py
+++ b/fastNLP/models/word_seg_model.py
@@ -0,0 +1,134 @@
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.optim as optim
 from model.base_model import BaseModel
 from torch.autograd import Variable

 # Module-wide GPU switch: to_var() moves a tensor to CUDA only when this flag is
 # True and torch.cuda.is_available().
 USE_GPU = True


 def to_var(x):
    """Return ``x`` as a Variable, placed on CUDA first when CUDA is available and USE_GPU is set."""
    wants_cuda = torch.cuda.is_available() and USE_GPU
    if wants_cuda:
        return Variable(x.cuda())
    return Variable(x)


 class WordSegModel(BaseModel):
    """
        Model controller for WordSeg: implements the BaseModel hooks around the WordSeg network.
    """

    def __init__(self):
        super(WordSegModel, self).__init__()
        # lookup tables, filled in by prepare_input
        self.id2word = None
        self.word2id = None
        self.id2tag = None
        self.tag2id = None

        self.lstm_batch_size = 8
        self.lstm_seq_len = 32  # Trainer batch_size == lstm_batch_size * lstm_seq_len
        self.hidden_dim = 100
        self.lstm_num_layers = 2
        self.vocab_size = 100
        self.word_emb_dim = 100

        self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
        # to_var() puts inputs and the hidden state on CUDA when it is enabled
        # and available; the network has to live on the same device.
        if USE_GPU and torch.cuda.is_available():
            self.model = self.model.cuda()
        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

        # loss module, created once and applied in get_loss
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = None
        self._loss = None

    def prepare_input(self, data):
        """
            perform word indices lookup to convert strings into indices
            :param data: list of string; lines longer than 2 characters are split on "#",
                         field 0 is taken as the word and the first character of field 2 as the tag
                         (NOTE(review): the exact input format is not visible here - confirm)
            :return: (words, tags), two integer numpy arrays of shape [n, 1]
        """
        word_list = []
        tag_list = []
        for line in data:
            if len(line) > 2:
                tokens = line.split("#")
                word_list.append(tokens[0])
                tag_list.append(tokens[2][0])
        self.id2word = list(set(word_list))
        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
        self.id2tag = list(set(tag_list))
        self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
        words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
        tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
        return words, tags

    def mode(self, test=False):
        """Put the wrapped network in eval mode when ``test`` is true, train mode otherwise."""
        if test:
            self.model.eval()
        else:
            self.model.train()

    def data_forward(self, x):
        """
        :param x: word indices for one batch (numpy array or tensor) holding
                  lstm_batch_size * lstm_seq_len entries
        :return: output of the network, one row per word
        """
        # prepare_input returns numpy arrays, but the embedding layer needs a
        # LongTensor, so convert before reshaping
        if isinstance(x, np.ndarray):
            x = torch.from_numpy(x)
        x = to_var(x.long()).contiguous().view(self.lstm_batch_size, self.lstm_seq_len)

        # detach the hidden state from the previous batch so that backward()
        # does not try to walk through an already-freed graph
        hidden = tuple(state.detach() for state in self.hidden)
        output, self.hidden = self.model(x, hidden)
        return output

    def define_optimizer(self):
        """Create an SGD optimizer over the network parameters."""
        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

    def get_loss(self, pred, truth):
        """
        Compute the cross-entropy between the network output and the gold tags.

        :param pred: network output, one row of scores per word
        :param truth: gold tag indices (numpy array or tensor), flattened to 1D here
        :return: the loss value (``.data`` of the loss kept for grad_backward)
        """
        if isinstance(truth, np.ndarray):
            truth = torch.from_numpy(truth)
        target = to_var(truth.long()).contiguous().view(-1)
        # nn.CrossEntropyLoss(pred, truth) only constructed a loss module; the
        # criterion has to be *called* on (pred, target) to obtain a loss
        self._loss = self.criterion(pred, target)
        return self._loss.data

    def grad_backward(self):
        """Back-propagate the stored loss, clip the gradient norm to 5 and take an optimizer step."""
        self.model.zero_grad()
        self._loss.backward()
        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
        self.optimizer.step()


 class WordSeg(nn.Module):
    """
        PyTorch network for word segmentation: embedding -> multi-layer LSTM -> linear projection.
    """

    def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100):
        super().__init__()

        # hidden_dim is stored only; the LSTM below is sized by word_emb_dim
        self.hidden_dim = hidden_dim
        self.lstm_num_layers = lstm_num_layers
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim)

        lstm_options = dict(input_size=self.word_emb_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)
        self.lstm = nn.LSTM(**lstm_options)

        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

    def forward(self, x, hidden):
        """
        :param x: tensor of shape [batch_size, seq_len], vocabulary index
        :param hidden: state tuple handed to the LSTM
        :return x: scores over vocabulary entries, shape [batch_size*seq_len, vocab_size]
                hidden: state tuple returned by the LSTM
        """
        embedded = self.word_emb(x)
        # [batch_size, seq_len, word_emb_size]
        encoded, hidden = self.lstm(embedded, hidden)
        # [batch_size, seq_len, word_emb_size]
        n_rows = encoded.shape[0] * encoded.shape[1]
        flat = encoded.contiguous().view(n_rows, -1)
        # [batch_size*seq_len, word_emb_size]
        scores = self.linear(flat)
        # [batch_size*seq_len, vocab_size]
        return scores, hidden