@@ -0,0 +1,201 @@ | |||
Apache License | |||
Version 2.0, January 2004 | |||
http://www.apache.org/licenses/ | |||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION | |||
1. Definitions. | |||
"License" shall mean the terms and conditions for use, reproduction, | |||
and distribution as defined by Sections 1 through 9 of this document. | |||
"Licensor" shall mean the copyright owner or entity authorized by | |||
the copyright owner that is granting the License. | |||
"Legal Entity" shall mean the union of the acting entity and all | |||
other entities that control, are controlled by, or are under common | |||
control with that entity. For the purposes of this definition, | |||
"control" means (i) the power, direct or indirect, to cause the | |||
direction or management of such entity, whether by contract or | |||
otherwise, or (ii) ownership of fifty percent (50%) or more of the | |||
outstanding shares, or (iii) beneficial ownership of such entity. | |||
"You" (or "Your") shall mean an individual or Legal Entity | |||
exercising permissions granted by this License. | |||
"Source" form shall mean the preferred form for making modifications, | |||
including but not limited to software source code, documentation | |||
source, and configuration files. | |||
"Object" form shall mean any form resulting from mechanical | |||
transformation or translation of a Source form, including but | |||
not limited to compiled object code, generated documentation, | |||
and conversions to other media types. | |||
"Work" shall mean the work of authorship, whether in Source or | |||
Object form, made available under the License, as indicated by a | |||
copyright notice that is included in or attached to the work | |||
(an example is provided in the Appendix below). | |||
"Derivative Works" shall mean any work, whether in Source or Object | |||
form, that is based on (or derived from) the Work and for which the | |||
editorial revisions, annotations, elaborations, or other modifications | |||
represent, as a whole, an original work of authorship. For the purposes | |||
of this License, Derivative Works shall not include works that remain | |||
separable from, or merely link (or bind by name) to the interfaces of, | |||
the Work and Derivative Works thereof. | |||
"Contribution" shall mean any work of authorship, including | |||
the original version of the Work and any modifications or additions | |||
to that Work or Derivative Works thereof, that is intentionally | |||
submitted to Licensor for inclusion in the Work by the copyright owner | |||
or by an individual or Legal Entity authorized to submit on behalf of | |||
the copyright owner. For the purposes of this definition, "submitted" | |||
means any form of electronic, verbal, or written communication sent | |||
to the Licensor or its representatives, including but not limited to | |||
communication on electronic mailing lists, source code control systems, | |||
and issue tracking systems that are managed by, or on behalf of, the | |||
Licensor for the purpose of discussing and improving the Work, but | |||
excluding communication that is conspicuously marked or otherwise | |||
designated in writing by the copyright owner as "Not a Contribution." | |||
"Contributor" shall mean Licensor and any individual or Legal Entity | |||
on behalf of whom a Contribution has been received by Licensor and | |||
subsequently incorporated within the Work. | |||
2. Grant of Copyright License. Subject to the terms and conditions of | |||
this License, each Contributor hereby grants to You a perpetual, | |||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
copyright license to reproduce, prepare Derivative Works of, | |||
publicly display, publicly perform, sublicense, and distribute the | |||
Work and such Derivative Works in Source or Object form. | |||
3. Grant of Patent License. Subject to the terms and conditions of | |||
this License, each Contributor hereby grants to You a perpetual, | |||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable | |||
(except as stated in this section) patent license to make, have made, | |||
use, offer to sell, sell, import, and otherwise transfer the Work, | |||
where such license applies only to those patent claims licensable | |||
by such Contributor that are necessarily infringed by their | |||
Contribution(s) alone or by combination of their Contribution(s) | |||
with the Work to which such Contribution(s) was submitted. If You | |||
institute patent litigation against any entity (including a | |||
cross-claim or counterclaim in a lawsuit) alleging that the Work | |||
or a Contribution incorporated within the Work constitutes direct | |||
or contributory patent infringement, then any patent licenses | |||
granted to You under this License for that Work shall terminate | |||
as of the date such litigation is filed. | |||
4. Redistribution. You may reproduce and distribute copies of the | |||
Work or Derivative Works thereof in any medium, with or without | |||
modifications, and in Source or Object form, provided that You | |||
meet the following conditions: | |||
(a) You must give any other recipients of the Work or | |||
Derivative Works a copy of this License; and | |||
(b) You must cause any modified files to carry prominent notices | |||
stating that You changed the files; and | |||
(c) You must retain, in the Source form of any Derivative Works | |||
that You distribute, all copyright, patent, trademark, and | |||
attribution notices from the Source form of the Work, | |||
excluding those notices that do not pertain to any part of | |||
the Derivative Works; and | |||
(d) If the Work includes a "NOTICE" text file as part of its | |||
distribution, then any Derivative Works that You distribute must | |||
include a readable copy of the attribution notices contained | |||
within such NOTICE file, excluding those notices that do not | |||
pertain to any part of the Derivative Works, in at least one | |||
of the following places: within a NOTICE text file distributed | |||
as part of the Derivative Works; within the Source form or | |||
documentation, if provided along with the Derivative Works; or, | |||
within a display generated by the Derivative Works, if and | |||
wherever such third-party notices normally appear. The contents | |||
of the NOTICE file are for informational purposes only and | |||
do not modify the License. You may add Your own attribution | |||
notices within Derivative Works that You distribute, alongside | |||
or as an addendum to the NOTICE text from the Work, provided | |||
that such additional attribution notices cannot be construed | |||
as modifying the License. | |||
You may add Your own copyright statement to Your modifications and | |||
may provide additional or different license terms and conditions | |||
for use, reproduction, or distribution of Your modifications, or | |||
for any such Derivative Works as a whole, provided Your use, | |||
reproduction, and distribution of the Work otherwise complies with | |||
the conditions stated in this License. | |||
5. Submission of Contributions. Unless You explicitly state otherwise, | |||
any Contribution intentionally submitted for inclusion in the Work | |||
by You to the Licensor shall be under the terms and conditions of | |||
this License, without any additional terms or conditions. | |||
Notwithstanding the above, nothing herein shall supersede or modify | |||
the terms of any separate license agreement you may have executed | |||
with Licensor regarding such Contributions. | |||
6. Trademarks. This License does not grant permission to use the trade | |||
names, trademarks, service marks, or product names of the Licensor, | |||
except as required for reasonable and customary use in describing the | |||
origin of the Work and reproducing the content of the NOTICE file. | |||
7. Disclaimer of Warranty. Unless required by applicable law or | |||
agreed to in writing, Licensor provides the Work (and each | |||
Contributor provides its Contributions) on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
implied, including, without limitation, any warranties or conditions | |||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A | |||
PARTICULAR PURPOSE. You are solely responsible for determining the | |||
appropriateness of using or redistributing the Work and assume any | |||
risks associated with Your exercise of permissions under this License. | |||
8. Limitation of Liability. In no event and under no legal theory, | |||
whether in tort (including negligence), contract, or otherwise, | |||
unless required by applicable law (such as deliberate and grossly | |||
negligent acts) or agreed to in writing, shall any Contributor be | |||
liable to You for damages, including any direct, indirect, special, | |||
incidental, or consequential damages of any character arising as a | |||
result of this License or out of the use or inability to use the | |||
Work (including but not limited to damages for loss of goodwill, | |||
work stoppage, computer failure or malfunction, or any and all | |||
other commercial damages or losses), even if such Contributor | |||
has been advised of the possibility of such damages. | |||
9. Accepting Warranty or Additional Liability. While redistributing | |||
the Work or Derivative Works thereof, You may choose to offer, | |||
and charge a fee for, acceptance of support, warranty, indemnity, | |||
or other liability obligations and/or rights consistent with this | |||
License. However, in accepting such obligations, You may act only | |||
on Your own behalf and on Your sole responsibility, not on behalf | |||
of any other Contributor, and only if You agree to indemnify, | |||
defend, and hold each Contributor harmless for any liability | |||
incurred by, or claims asserted against, such Contributor by reason | |||
of your accepting any such warranty or additional liability. | |||
END OF TERMS AND CONDITIONS | |||
APPENDIX: How to apply the Apache License to your work. | |||
To apply the Apache License to your work, attach the following | |||
boilerplate notice, with the fields enclosed by brackets "[]" | |||
replaced with your own identifying information. (Don't include | |||
the brackets!) The text should be enclosed in the appropriate | |||
comment syntax for the file format. We also recommend that a | |||
file or class name and description of purpose be included on the | |||
same "printed page" as the copyright notice for easier | |||
identification within third-party archives. | |||
Copyright [yyyy] [name of copyright owner] | |||
Licensed under the Apache License, Version 2.0 (the "License"); | |||
you may not use this file except in compliance with the License. | |||
You may obtain a copy of the License at | |||
http://www.apache.org/licenses/LICENSE-2.0 | |||
Unless required by applicable law or agreed to in writing, software | |||
distributed under the License is distributed on an "AS IS" BASIS, | |||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
See the License for the specific language governing permissions and | |||
limitations under the License. |
@@ -1,2 +1,58 @@ | |||
# FastNLP | |||
``` | |||
FastNLP | |||
│ LICENSE | |||
│ README.md | |||
│ requirements.txt | |||
│ setup.py | |||
| | |||
├─docs (documentation) | |||
| | |||
└─tests (unit tests, integration tests, system tests)
| │ test_charlm.py | |||
| │ test_loader.py | |||
| │ test_trainer.py | |||
| │ test_word_seg.py | |||
| │ | |||
| └─data_for_tests (test data used by models) | |||
| charlm.txt | |||
| cws_test | |||
| cws_train | |||
| | |||
└─fastNLP | |||
├─action (model independent process) | |||
│ │ action.py (base class) | |||
│ │ README.md | |||
│ │ tester.py (model testing, for deployment and validation) | |||
│ │ trainer.py (main logic for model training) | |||
│ │ __init__.py | |||
│ │ | |||
| | |||
│ | |||
├─loader (file loader for all loading operations) | |||
│ | base_loader.py (base class) | |||
│ | config_loader.py (model-specific configuration/parameter loader) | |||
│ | dataset_loader.py (data set loader, base class) | |||
│ | embed_loader.py (embedding loader, base class) | |||
│ | __init__.py | |||
│ | |||
├─model (definitions of PyTorch models) | |||
│ │ base_model.py (base class, abstract) | |||
│ │ char_language_model.py (derived class, to implement abstract methods) | |||
│ │ word_seg_model.py | |||
│ │ __init__.py | |||
│ │ | |||
│ | |||
├─reproduction (code library for paper reproduction) | |||
│ ├─Char-aware_NLM | |||
│ │ | |||
│ ├─CNN-sentence_classification | |||
│ │ | |||
│ └─HAN-document_classification | |||
│ | |||
├─saver (file saver for all saving operations) | |||
│ base_saver.py | |||
│ logger.py | |||
│ model_saver.py | |||
│ | |||
``` |
@@ -0,0 +1 @@ | |||
# FastNLP Quick Tutorial |
@@ -0,0 +1,8 @@ | |||
SpaCy "Doc" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80 | |||
SpaCy "Vocab" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25 | |||
SpaCy "Token" | |||
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27 |
@@ -0,0 +1,35 @@ | |||
class Action(object): | |||
""" | |||
base class for Trainer and Tester | |||
""" | |||
def __init__(self): | |||
super(Action, self).__init__() | |||
def batchify(self, batch_size, X, Y=None): | |||
""" | |||
:param batch_size: int | |||
:param X: feature matrix of size [n_sample, m_feature] | |||
:param Y: label vector of size [n_sample, 1] (optional) | |||
        :return num_iter: int, the number of steps (batches) in each epoch
                generator: a generator that yields batch inputs
""" | |||
n_samples = X.shape[0] | |||
num_iter = n_samples // batch_size | |||
if Y is None: | |||
generator = self._batch_generate(batch_size, num_iter, X) | |||
else: | |||
generator = self._batch_generate(batch_size, num_iter, X, Y) | |||
return num_iter, generator | |||
@staticmethod | |||
def _batch_generate(batch_size, num_iter, *data): | |||
for step in range(num_iter): | |||
start = batch_size * step | |||
end = batch_size * (step + 1) | |||
yield tuple([x[start:end] for x in data]) | |||
def make_log(self, *args): | |||
return "log" |
@@ -0,0 +1,87 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
from fastNLP.action.action import Action | |||
class Tester(Action): | |||
"""docstring for Tester""" | |||
TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output", | |||
"save_loss", "batch_size"]) | |||
def __init__(self, test_args): | |||
""" | |||
:param test_args: named tuple | |||
""" | |||
super(Tester, self).__init__() | |||
self.validate_in_training = test_args.validate_in_training | |||
self.save_dev_input = test_args.save_dev_input | |||
self.valid_x = None | |||
self.valid_y = None | |||
self.save_output = test_args.save_output | |||
self.output = None | |||
self.save_loss = test_args.save_loss | |||
self.mean_loss = None | |||
self.batch_size = test_args.batch_size | |||
def test(self, network, data): | |||
print("testing") | |||
network.mode(test=True) # turn on the testing mode | |||
if self.save_dev_input: | |||
if self.valid_x is None: | |||
valid_x, valid_y = network.prepare_input(data) | |||
self.valid_x = valid_x | |||
self.valid_y = valid_y | |||
else: | |||
valid_x = self.valid_x | |||
valid_y = self.valid_y | |||
else: | |||
valid_x, valid_y = network.prepare_input(data) | |||
# split into batches by self.batch_size | |||
iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y) | |||
batch_output = list() | |||
loss_history = list() | |||
for step in range(iterations): | |||
batch_x, batch_y = test_batch_generator.__next__() | |||
# forward pass from test input to predicted output | |||
prediction = network.data_forward(batch_x) | |||
loss = network.get_loss(prediction, batch_y) | |||
if self.save_output: | |||
batch_output.append(prediction.data) | |||
if self.save_loss: | |||
loss_history.append(loss) | |||
            print(self.make_log(step, loss))  # Action defines make_log() but no log(); print the log string
if self.save_loss: | |||
self.mean_loss = np.mean(np.array(loss_history)) | |||
if self.save_output: | |||
self.output = self.make_output(batch_output) | |||
@property | |||
def loss(self): | |||
return self.mean_loss | |||
@property | |||
def result(self): | |||
return self.output | |||
@staticmethod | |||
def make_output(batch_outputs): | |||
# construct full prediction with batch outputs | |||
return np.concatenate(batch_outputs, axis=0) | |||
def load_config(self, args): | |||
raise NotImplementedError | |||
def load_dataset(self, args): | |||
raise NotImplementedError |
@@ -0,0 +1,268 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
from fastNLP.action.action import Action | |||
from fastNLP.action.tester import Tester | |||
class BaseTrainer(Action): | |||
"""Base trainer for all trainers. | |||
Trainer receives a model and data, and then performs training. | |||
Subclasses must implement the following abstract methods: | |||
- prepare_input | |||
- mode | |||
- define_optimizer | |||
- data_forward | |||
- grad_backward | |||
- get_loss | |||
""" | |||
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better", | |||
"log_per_step", "log_validation", "batch_size"]) | |||
def __init__(self, train_args): | |||
""" | |||
        :param train_args: named tuple of training parameters (epochs, validate, batch_size, ...)
""" | |||
super(BaseTrainer, self).__init__() | |||
self.n_epochs = train_args.epochs | |||
self.validate = train_args.validate | |||
self.batch_size = train_args.batch_size | |||
self.model = None | |||
def train(self, network, train_data, dev_data=None): | |||
"""General training loop. | |||
:param network: a model | |||
:param train_data: raw data for training | |||
:param dev_data: raw data for validation | |||
        This method is framework independent.
        It works by calling the following methods:
- prepare_input | |||
- mode | |||
- define_optimizer | |||
- data_forward | |||
- get_loss | |||
- grad_backward | |||
- update | |||
Subclasses must implement these methods with a specific framework. | |||
""" | |||
self.model = network | |||
train_x, train_y = self.prepare_input(train_data) | |||
        test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
                                      save_dev_input=True, save_loss=True, batch_size=self.batch_size)
        evaluator = Tester(test_args)
        best_loss = 1e10
        for epoch in range(self.n_epochs):
            self.mode(test=False)  # turn on the train mode
            self.define_optimizer()
            # re-create the batch generator every epoch; otherwise it is exhausted after the first epoch
            iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)
            for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__() | |||
prediction = self.data_forward(network, batch_x) | |||
loss = self.get_loss(prediction, batch_y) | |||
self.grad_backward(loss) | |||
self.update() | |||
if self.validate: | |||
if dev_data is None: | |||
raise RuntimeError("No validation data provided.") | |||
evaluator.test(network, dev_data) | |||
if evaluator.loss < best_loss: | |||
best_loss = evaluator.loss | |||
# finish training | |||
def prepare_input(self, data): | |||
""" | |||
Perform data transformation from raw input to vector/matrix inputs. | |||
:param data: raw inputs | |||
:return (X, Y): tuple, input features and labels | |||
""" | |||
raise NotImplementedError | |||
def mode(self, test=False): | |||
""" | |||
        Switch the network between training and test (evaluation) mode.
:param test: bool | |||
""" | |||
raise NotImplementedError | |||
def define_optimizer(self): | |||
""" | |||
Define framework-specific optimizer specified by the models. | |||
""" | |||
raise NotImplementedError | |||
def update(self): | |||
""" | |||
Perform weight update on a model. | |||
For PyTorch, just call optimizer to update. | |||
""" | |||
raise NotImplementedError | |||
def data_forward(self, network, x): | |||
""" | |||
Forward pass of the data. | |||
:param network: a model | |||
:param x: input feature matrix and label vector | |||
:return: output by the models | |||
For PyTorch, just do "network(*x)" | |||
""" | |||
raise NotImplementedError | |||
def grad_backward(self, loss): | |||
""" | |||
        Compute gradients via back-propagation (chain rule).
:param loss: a scalar where back-prop starts | |||
For PyTorch, just do "loss.backward()" | |||
""" | |||
raise NotImplementedError | |||
def get_loss(self, predict, truth): | |||
""" | |||
Compute loss given prediction and ground truth. | |||
:param predict: prediction label vector | |||
:param truth: ground truth label vector | |||
:return: a scalar | |||
""" | |||
raise NotImplementedError | |||
class ToyTrainer(BaseTrainer): | |||
"""A simple trainer for a PyTorch model.""" | |||
def __init__(self, train_args): | |||
super(ToyTrainer, self).__init__(train_args) | |||
self.test_mode = False | |||
self.weight = np.random.rand(5, 1) | |||
self.bias = np.random.rand() | |||
self._loss = 0 | |||
self._optimizer = None | |||
def prepare_input(self, data): | |||
return data[:, :-1], data[:, -1] | |||
def mode(self, test=False): | |||
self.model.mode(test) | |||
def data_forward(self, network, x): | |||
return np.matmul(x, self.weight) + self.bias | |||
def grad_backward(self, loss): | |||
loss.backward() | |||
def get_loss(self, pred, truth): | |||
self._loss = np.mean(np.square(pred - truth)) | |||
return self._loss | |||
def define_optimizer(self): | |||
self._optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01) | |||
def update(self): | |||
self._optimizer.step() | |||
class WordSegTrainer(BaseTrainer): | |||
""" | |||
    Reserved for future changes.
""" | |||
def __init__(self, train_args): | |||
super(WordSegTrainer, self).__init__(train_args) | |||
self.id2word = None | |||
self.word2id = None | |||
self.id2tag = None | |||
self.tag2id = None | |||
self.lstm_batch_size = 8 | |||
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len | |||
self.hidden_dim = 100 | |||
self.lstm_num_layers = 2 | |||
self.vocab_size = 100 | |||
self.word_emb_dim = 100 | |||
self.hidden = (self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)), | |||
self.to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim))) | |||
self.optimizer = None | |||
self._loss = None | |||
self.USE_GPU = False | |||
def to_var(self, x): | |||
if torch.cuda.is_available() and self.USE_GPU: | |||
x = x.cuda() | |||
return torch.autograd.Variable(x) | |||
def prepare_input(self, data): | |||
""" | |||
perform word indices lookup to convert strings into indices | |||
        :param data: list of strings; each line is "#"-separated, with the word in field 0 and a [B, M, E, S] tag at the start of field 2
:return | |||
""" | |||
word_list = [] | |||
tag_list = [] | |||
for line in data: | |||
if len(line) > 2: | |||
tokens = line.split("#") | |||
word_list.append(tokens[0]) | |||
tag_list.append(tokens[2][0]) | |||
self.id2word = list(set(word_list)) | |||
self.word2id = {word: idx for idx, word in enumerate(self.id2word)} | |||
self.id2tag = list(set(tag_list)) | |||
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)} | |||
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1) | |||
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1) | |||
return words, tags | |||
def mode(self, test=False): | |||
if test: | |||
self.model.eval() | |||
else: | |||
self.model.train() | |||
def data_forward(self, network, x): | |||
""" | |||
:param network: a PyTorch model | |||
:param x: sequence of length [batch_size], word indices | |||
:return: | |||
""" | |||
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len) | |||
output, self.hidden = network(x, self.hidden) | |||
return output | |||
def define_optimizer(self): | |||
self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85) | |||
def get_loss(self, predict, truth): | |||
        self._loss = torch.nn.functional.cross_entropy(predict, truth)
return self._loss | |||
    def grad_backward(self, loss):
self.model.zero_grad() | |||
self._loss.backward() | |||
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||
def update(self): | |||
self.optimizer.step() | |||
if __name__ == "__name__": | |||
Config = namedtuple("config", ["epochs", "validate", "save_when_better", "log_per_step", | |||
"log_validation", "batch_size"]) | |||
train_config = Config(epochs=5, validate=True, save_when_better=True, log_per_step=10, log_validation=True, | |||
batch_size=32) | |||
trainer = ToyTrainer(train_config) |
@@ -0,0 +1,36 @@ | |||
class BaseLoader(object): | |||
"""docstring for BaseLoader""" | |||
def __init__(self, data_name, data_path): | |||
super(BaseLoader, self).__init__() | |||
self.data_name = data_name | |||
self.data_path = data_path | |||
def load(self): | |||
""" | |||
:return: string | |||
""" | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
text = f.read() | |||
return text | |||
def load_lines(self): | |||
with open(self.data_path, "r", encoding="utf=8") as f: | |||
text = f.readlines() | |||
return text | |||
class ToyLoader0(BaseLoader): | |||
""" | |||
For charLM | |||
""" | |||
def __init__(self, name, path): | |||
super(ToyLoader0, self).__init__(name, path) | |||
def load(self): | |||
with open(self.data_path, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() |
@@ -0,0 +1,42 @@ | |||
from fastNLP.loader.base_loader import BaseLoader | |||
import configparser | |||
import traceback | |||
import json | |||
class ConfigLoader(BaseLoader): | |||
"""loader for configuration files""" | |||
    def __init__(self, data_name, data_path):
super(ConfigLoader, self).__init__(data_name, data_path) | |||
self.config = self.parse(super(ConfigLoader, self).load()) | |||
@staticmethod | |||
def parse(string): | |||
raise NotImplementedError | |||
@staticmethod | |||
def loadConfig(filePath, sections): | |||
""" | |||
:param filePath: the path of config file | |||
:param sections: the dict of sections | |||
:return: | |||
""" | |||
cfg = configparser.ConfigParser() | |||
cfg.read(filePath) | |||
for s in sections: | |||
attr_list = [i for i in type(sections[s]).__dict__.keys() if | |||
not callable(getattr(sections[s], i)) and not i.startswith("__")] | |||
gen_sec = cfg[s] | |||
for attr in attr_list: | |||
try: | |||
val = json.loads(gen_sec[attr]) | |||
print(s, attr, val, type(val)) | |||
assert type(val) == type(getattr(sections[s], attr)), \ | |||
                        'type does not match, expected %s but got %s' % \
(type(getattr(sections[s], attr)), type(val)) | |||
setattr(sections[s], attr, val) | |||
                except Exception:
                    # attribute attr in section s has not been set; the default value will be used
                    pass
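# Illustrative usage sketch (assumption, not part of the original file): the config file name,
# section name, and attributes below are hypothetical.
if __name__ == "__main__":
    # write a tiny config file so the sketch is self-contained
    with open("example.cfg", "w", encoding="utf-8") as f:
        f.write("[model]\nbatch_size = 64\nlearning_rate = 0.001\n")

    class ModelSection(object):
        batch_size = 32        # defaults, overridden by the values parsed from the file
        learning_rate = 0.01

    section = ModelSection()
    ConfigLoader.loadConfig("example.cfg", {"model": section})
    print(section.batch_size, section.learning_rate)  # 64 0.001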
@@ -0,0 +1,111 @@ | |||
import os | |||
from fastNLP.loader.base_loader import BaseLoader | |||
class DatasetLoader(BaseLoader): | |||
""""loader for data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(DatasetLoader, self).__init__(data_name, data_path) | |||
class POSDatasetLoader(DatasetLoader): | |||
"""loader for pos data sets""" | |||
def __init__(self, data_name, data_path): | |||
super(POSDatasetLoader, self).__init__(data_name, data_path) | |||
#self.data_set = self.load() | |||
def load(self): | |||
assert os.path.exists(self.data_path) | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return self.parse(lines) | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param lines: lines from dataset | |||
        :return: list(list(list())): the three levels of lists are
token, sentence, and dataset | |||
""" | |||
dataset = list() | |||
for line in lines: | |||
sentence = list() | |||
words = line.split(" ") | |||
for w in words: | |||
tokens = list() | |||
tokens.append(w.split('/')[0]) | |||
tokens.append(w.split('/')[1]) | |||
sentence.append(tokens) | |||
dataset.append(sentence) | |||
return dataset | |||
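# Illustrative example (assumption, not part of the original file): one line in the
# "word/tag" format is parsed into a sentence of [word, tag] pairs, e.g.
#   POSDatasetLoader.parse(["I/PRP love/VBP it/PRP"])
#   -> [[["I", "PRP"], ["love", "VBP"], ["it", "PRP"]]]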
class ClassificationDatasetLoader(DatasetLoader): | |||
"""loader for classfication data sets""" | |||
def __init__(self, data_name, data_path): | |||
        super(ClassificationDatasetLoader, self).__init__(data_name, data_path)
def load(self): | |||
assert os.path.exists(self.data_path) | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return self.parse(lines) | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param lines: lines from dataset | |||
        :return: list(list(list())): the three levels of lists are
words, sentence, and dataset | |||
""" | |||
dataset = list() | |||
for line in lines: | |||
label = line.split(" ")[0] | |||
words = line.split(" ")[1:] | |||
word = list([w for w in words]) | |||
sentence = list([word, label]) | |||
dataset.append(sentence) | |||
return dataset | |||
class ConllLoader(DatasetLoader): | |||
"""loader for conll format files""" | |||
    def __init__(self, data_name, data_path):
""" | |||
:param str data_name: the name of the conll data set | |||
:param str data_path: the path to the conll data set | |||
""" | |||
super(ConllLoader, self).__init__(data_name, data_path) | |||
self.data_set = self.parse(self.load()) | |||
def load(self): | |||
""" | |||
:return: list lines: all lines in a conll file | |||
""" | |||
with open(self.data_path, "r", encoding="utf-8") as f: | |||
lines = f.readlines() | |||
return lines | |||
@staticmethod | |||
def parse(lines): | |||
""" | |||
:param list lines:a list containing all lines in a conll file. | |||
:return: a 3D list | |||
""" | |||
sentences = list() | |||
tokens = list() | |||
for line in lines: | |||
if line[0] == "#": | |||
# skip the comments | |||
continue | |||
if line == "\n": | |||
sentences.append(tokens) | |||
tokens = [] | |||
continue | |||
tokens.append(line.split()) | |||
return sentences |
@@ -0,0 +1,8 @@ | |||
from fastNLP.loader.base_loader import BaseLoader
class EmbedLoader(BaseLoader): | |||
"""docstring for EmbedLoader""" | |||
def __init__(self, data_name, data_path): | |||
super(EmbedLoader, self).__init__(data_name, data_path) |
@@ -0,0 +1,95 @@ | |||
import torch | |||
class BaseModel(torch.nn.Module): | |||
"""Base PyTorch model for all models. | |||
    Three network modules are presented:
- embedding module | |||
- aggregation module | |||
- output module | |||
Subclasses must implement these three modules with "components". | |||
""" | |||
def __init__(self): | |||
super(BaseModel, self).__init__() | |||
def forward(self, *inputs): | |||
x = self.encode(*inputs) | |||
x = self.aggregation(x) | |||
x = self.output(x) | |||
return x | |||
def encode(self, x): | |||
raise NotImplementedError | |||
def aggregation(self, x): | |||
raise NotImplementedError | |||
def output(self, x): | |||
raise NotImplementedError | |||
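# Illustrative sketch (assumption, not part of the original file): a minimal subclass that
# wires the three modules together; all layer sizes below are hypothetical examples.
class ToyClassifier(BaseModel):
    def __init__(self, vocab_size=100, emb_dim=16, num_classes=2):
        super(ToyClassifier, self).__init__()
        self.embed = torch.nn.Embedding(vocab_size, emb_dim)
        self.fc = torch.nn.Linear(emb_dim, num_classes)

    def encode(self, x):
        # embedding module: word indices [batch, seq_len] -> [batch, seq_len, emb_dim]
        return self.embed(x)

    def aggregation(self, x):
        # aggregation module: mean-pool over the sequence -> [batch, emb_dim]
        return x.mean(dim=1)

    def output(self, x):
        # output module: class scores [batch, num_classes]
        return self.fc(x)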
class Vocabulary(object): | |||
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab` | |||
instance also provides access to the `StringStore`, and owns underlying | |||
data that is shared between `Doc` objects. | |||
""" | |||
def __init__(self): | |||
"""Create the vocabulary. | |||
RETURNS (Vocab): The newly constructed object. | |||
""" | |||
self.data_frame = None | |||
class Document(object): | |||
"""A sequence of Token objects. Access sentences and named entities, export | |||
annotations to numpy arrays, losslessly serialize to compressed binary | |||
strings. The `Doc` object holds an array of `Token` objects. The | |||
Python-level `Token` and `Span` objects are views of this array, i.e. | |||
they don't own the data themselves. -- spacy | |||
""" | |||
def __init__(self, vocab, words=None, spaces=None): | |||
"""Create a Doc object. | |||
vocab (Vocab): A vocabulary object, which must match any models you | |||
want to use (e.g. tokenizer, parser, entity recognizer). | |||
words (list or None): A list of unicode strings, to add to the document | |||
as words. If `None`, defaults to empty list. | |||
spaces (list or None): A list of boolean values, of the same length as | |||
words. True means that the word is followed by a space, False means | |||
it is not. If `None`, defaults to `[True]*len(words)` | |||
user_data (dict or None): Optional extra data to attach to the Doc. | |||
RETURNS (Doc): The newly constructed object. | |||
""" | |||
self.vocab = vocab | |||
self.spaces = spaces | |||
self.words = words | |||
if spaces is None: | |||
self.spaces = [True] * len(self.words) | |||
elif len(spaces) != len(self.words): | |||
raise ValueError("dismatch spaces and words") | |||
def get_chunker(self, vocab): | |||
return None | |||
def push_back(self, vocab): | |||
pass | |||
class Token(object): | |||
"""An individual token – i.e. a word, punctuation symbol, whitespace, | |||
etc. | |||
""" | |||
def __init__(self, vocab, doc, offset): | |||
"""Construct a `Token` object. | |||
vocab (Vocabulary): A storage container for lexical types. | |||
doc (Document): The parent document. | |||
offset (int): The index of the token within the document. | |||
""" | |||
self.vocab = vocab | |||
self.doc = doc | |||
self.token = doc[offset] | |||
self.i = offset |
@@ -0,0 +1,359 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
import torch.optim as optim | |||
from torch.autograd import Variable | |||
from fastNLP.models.base_model import BaseModel | |||
USE_GPU = True | |||
""" | |||
To be deprecated. | |||
""" | |||
class CharLM(BaseModel): | |||
""" | |||
Controller of the Character-level Neural Language Model | |||
To do: | |||
- where the data goes, call data savers. | |||
""" | |||
DataTuple = namedtuple("DataTuple", ["feature", "label"]) | |||
def __init__(self, lstm_batch_size, lstm_seq_len): | |||
super(CharLM, self).__init__() | |||
""" | |||
Settings: should come from config loader or pre-processing | |||
""" | |||
self.word_embed_dim = 300 | |||
self.char_embedding_dim = 15 | |||
self.cnn_batch_size = lstm_batch_size * lstm_seq_len | |||
self.lstm_seq_len = lstm_seq_len | |||
self.lstm_batch_size = lstm_batch_size | |||
self.num_epoch = 10 | |||
self.old_PPL = 100000 | |||
self.best_PPL = 100000 | |||
""" | |||
These parameters are set by pre-processing. | |||
""" | |||
self.max_word_len = None | |||
self.num_char = None | |||
self.vocab_size = None | |||
self.preprocess("./data_for_tests/charlm.txt") | |||
self.data = None # named tuple to store all data set | |||
self.data_ready = False | |||
self.criterion = nn.CrossEntropyLoss() | |||
self._loss = None | |||
self.use_gpu = USE_GPU | |||
# word_emb_dim == hidden_size / num of hidden units | |||
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)), | |||
to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim))) | |||
self.model = charLM(self.char_embedding_dim, | |||
self.word_embed_dim, | |||
self.vocab_size, | |||
self.num_char, | |||
use_gpu=self.use_gpu) | |||
for param in self.model.parameters(): | |||
nn.init.uniform(param.data, -0.05, 0.05) | |||
self.learning_rate = 0.1 | |||
self.optimizer = None | |||
def prepare_input(self, raw_text): | |||
""" | |||
:param raw_text: raw input text consisting of words | |||
:return: torch.Tensor, torch.Tensor | |||
feature matrix, label vector | |||
        This function is only called once in Trainer.train, but may be called multiple times in Tester.test,
        so Tester saves the test input for repeated calls.
""" | |||
if os.path.exists("cache/prep.pt") is False: | |||
self.preprocess("./data_for_tests/charlm.txt") # To do: This is not good. Need to fix.. | |||
objects = torch.load("cache/prep.pt") | |||
word_dict = objects["word_dict"] | |||
char_dict = objects["char_dict"] | |||
max_word_len = self.max_word_len | |||
print("word/char dictionary built. Start making inputs.") | |||
words = raw_text | |||
input_vec = np.array(text2vec(words, char_dict, max_word_len)) | |||
# Labels are next-word index in word_dict with the same length as inputs | |||
input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]]) | |||
feature_input = torch.from_numpy(input_vec) | |||
label_input = torch.from_numpy(input_label) | |||
return feature_input, label_input | |||
def mode(self, test=False): | |||
if test: | |||
self.model.eval() | |||
else: | |||
self.model.train() | |||
def data_forward(self, x): | |||
""" | |||
:param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2] | |||
:return: Tensor of size [num_words, ?] | |||
""" | |||
# additional processing of inputs after batching | |||
num_seq = x.size()[0] // self.lstm_seq_len | |||
x = x[:num_seq * self.lstm_seq_len, :] | |||
x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2) | |||
# detach hidden state of LSTM from last batch | |||
hidden = [state.detach() for state in self.hidden] | |||
output, self.hidden = self.model(to_var(x), hidden) | |||
return output | |||
def grad_backward(self): | |||
self.model.zero_grad() | |||
self._loss.backward() | |||
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||
self.optimizer.step() | |||
def get_loss(self, predict, truth): | |||
self._loss = self.criterion(predict, to_var(truth)) | |||
        return self._loss.data  # no PyTorch data structure exposed outside
def define_optimizer(self): | |||
# redefine optimizer for every new epoch | |||
self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85) | |||
def save(self): | |||
print("network saved") | |||
# torch.save(self.models, "cache/models.pkl") | |||
def preprocess(self, all_text_files): | |||
word_dict, char_dict = create_word_char_dict(all_text_files) | |||
num_char = len(char_dict) | |||
self.vocab_size = len(word_dict) | |||
char_dict["BOW"] = num_char + 1 | |||
char_dict["EOW"] = num_char + 2 | |||
char_dict["PAD"] = 0 | |||
self.num_char = num_char + 3 | |||
        # char_dict maps each character to an integer id (with special BOW/EOW/PAD ids added above)
reverse_word_dict = {value: key for key, value in word_dict.items()} | |||
self.max_word_len = max([len(word) for word in word_dict]) | |||
objects = { | |||
"word_dict": word_dict, | |||
"char_dict": char_dict, | |||
"reverse_word_dict": reverse_word_dict, | |||
} | |||
torch.save(objects, "cache/prep.pt") | |||
print("Preprocess done.") | |||
""" | |||
Global Functions | |||
""" | |||
def batch_generator(x, batch_size): | |||
# x: [num_words, in_channel, height, width] | |||
# partitions x into batches | |||
num_step = x.size()[0] // batch_size | |||
for t in range(num_step): | |||
yield x[t * batch_size:(t + 1) * batch_size] | |||
def text2vec(words, char_dict, max_word_len): | |||
""" Return list of list of int """ | |||
word_vec = [] | |||
for word in words: | |||
vec = [char_dict[ch] for ch in word] | |||
if len(vec) < max_word_len: | |||
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||
word_vec.append(vec) | |||
return word_vec | |||
def read_data(file_name): | |||
with open(file_name, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() | |||
def get_char_dict(vocabulary): | |||
char_dict = dict() | |||
count = 1 | |||
for word in vocabulary: | |||
for ch in word: | |||
if ch not in char_dict: | |||
char_dict[ch] = count | |||
count += 1 | |||
return char_dict | |||
def create_word_char_dict(*file_name): | |||
text = [] | |||
for file in file_name: | |||
text += read_data(file) | |||
word_dict = {word: ix for ix, word in enumerate(set(text))} | |||
char_dict = get_char_dict(word_dict) | |||
return word_dict, char_dict | |||
def to_var(x): | |||
if torch.cuda.is_available() and USE_GPU: | |||
x = x.cuda() | |||
return Variable(x) | |||
""" | |||
Neural Network | |||
""" | |||
class Highway(nn.Module): | |||
"""Highway network""" | |||
def __init__(self, input_size): | |||
super(Highway, self).__init__() | |||
self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||
self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||
def forward(self, x): | |||
t = F.sigmoid(self.fc1(x)) | |||
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||
class charLM(nn.Module): | |||
"""Character-level Neural Language Model | |||
CNN + highway network + LSTM | |||
# Input: | |||
4D tensor with shape [batch_size, in_channel, height, width] | |||
# Output: | |||
2D Tensor with shape [batch_size, vocab_size] | |||
# Arguments: | |||
        char_emb_dim: the size of each character embedding
        word_emb_dim: the size of each word embedding
vocab_size: num of unique words | |||
num_char: num of characters | |||
use_gpu: True or False | |||
""" | |||
def __init__(self, char_emb_dim, word_emb_dim, | |||
vocab_size, num_char, use_gpu): | |||
super(charLM, self).__init__() | |||
self.char_emb_dim = char_emb_dim | |||
self.word_emb_dim = word_emb_dim | |||
self.vocab_size = vocab_size | |||
# char attention layer | |||
self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||
# convolutions of filters with different sizes | |||
        self.convolutions = nn.ModuleList()  # ModuleList so the conv parameters are registered with the module
# list of tuples: (the number of filter, width) | |||
# self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||
self.filter_num_width = [(25, 1), (50, 2), (75, 3)] | |||
for out_channel, filter_width in self.filter_num_width: | |||
self.convolutions.append( | |||
nn.Conv2d( | |||
1, # in_channel | |||
out_channel, # out_channel | |||
kernel_size=(char_emb_dim, filter_width), # (height, width) | |||
bias=True | |||
) | |||
) | |||
self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||
self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||
# highway net | |||
self.highway1 = Highway(self.highway_input_dim) | |||
self.highway2 = Highway(self.highway_input_dim) | |||
# LSTM | |||
self.lstm_num_layers = 2 | |||
self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
# output layer | |||
self.dropout = nn.Dropout(p=0.5) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
if use_gpu is True: | |||
for x in range(len(self.convolutions)): | |||
self.convolutions[x] = self.convolutions[x].cuda() | |||
self.highway1 = self.highway1.cuda() | |||
self.highway2 = self.highway2.cuda() | |||
self.lstm = self.lstm.cuda() | |||
self.dropout = self.dropout.cuda() | |||
self.char_embed = self.char_embed.cuda() | |||
self.linear = self.linear.cuda() | |||
self.batch_norm = self.batch_norm.cuda() | |||
def forward(self, x, hidden): | |||
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||
# Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||
lstm_batch_size = x.size()[0] | |||
lstm_seq_len = x.size()[1] | |||
x = x.contiguous().view(-1, x.size()[2]) | |||
# [num_seq*seq_len, max_word_len+2] | |||
x = self.char_embed(x) | |||
# [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||
x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||
# [num_seq*seq_len, 1, char_emb_dim, max_word_len+2] | |||
x = self.conv_layers(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.batch_norm(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.highway1(x) | |||
x = self.highway2(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) | |||
# [num_seq, seq_len, total_num_filters] | |||
x, hidden = self.lstm(x, hidden) | |||
# [seq_len, num_seq, hidden_size] | |||
x = self.dropout(x) | |||
# [seq_len, num_seq, hidden_size] | |||
x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1) | |||
# [num_seq*seq_len, hidden_size] | |||
x = self.linear(x) | |||
# [num_seq*seq_len, vocab_size] | |||
return x, hidden | |||
def conv_layers(self, x): | |||
chosen_list = list() | |||
for conv in self.convolutions: | |||
feature_map = F.tanh(conv(x)) | |||
# (batch_size, out_channel, 1, max_word_len-width+1) | |||
chosen = torch.max(feature_map, 3)[0] | |||
# (batch_size, out_channel, 1) | |||
chosen = chosen.squeeze() | |||
# (batch_size, out_channel) | |||
chosen_list.append(chosen) | |||
# (batch_size, total_num_filers) | |||
return torch.cat(chosen_list, 1) |
@@ -0,0 +1,46 @@ | |||
import torch.nn as nn | |||
from fastNLP.models.base_model import BaseModel | |||
class WordSeg(BaseModel): | |||
""" | |||
PyTorch Network for word segmentation | |||
""" | |||
def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100): | |||
super(WordSeg, self).__init__() | |||
self.vocab_size = vocab_size | |||
self.word_emb_dim = word_emb_dim | |||
self.lstm_num_layers = lstm_num_layers | |||
self.hidden_dim = hidden_dim | |||
self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim) | |||
self.lstm = nn.LSTM(input_size=self.word_emb_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
def forward(self, x, hidden): | |||
""" | |||
:param x: tensor of shape [batch_size, seq_len], vocabulary index | |||
:param hidden: | |||
:return x: probability of vocabulary entries | |||
hidden: (memory cell, hidden state) from LSTM | |||
""" | |||
# [batch_size, seq_len] | |||
x = self.word_emb(x) | |||
# [batch_size, seq_len, word_emb_size] | |||
x, hidden = self.lstm(x, hidden) | |||
# [batch_size, seq_len, word_emb_size] | |||
x = x.contiguous().view(x.shape[0] * x.shape[1], -1) | |||
# [batch_size*seq_len, word_emb_size] | |||
x = self.linear(x) | |||
# [batch_size*seq_len, vocab_size] | |||
return x, hidden |
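# Illustrative usage sketch (assumption, not part of the original file): hypothetical sizes,
# only to show the expected input/output shapes of forward().
if __name__ == "__main__":
    import torch

    model = WordSeg(hidden_dim=100, lstm_num_layers=2, vocab_size=100)
    x = torch.randint(0, 100, (8, 32), dtype=torch.long)       # [batch_size, seq_len] word indices
    hidden = (torch.zeros(2, 8, 100), torch.zeros(2, 8, 100))  # (h_0, c_0) for a 2-layer LSTM
    out, hidden = model(x, hidden)
    print(out.size())  # [batch_size * seq_len, vocab_size] = [256, 100]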
@@ -0,0 +1,174 @@ | |||
import torch | |||
from torch import nn | |||
def log_sum_exp(x, dim=-1): | |||
max_value, _ = x.max(dim=dim, keepdim=True) | |||
res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value | |||
return res.squeeze(dim) | |||
def seq_len_to_byte_mask(seq_lens): | |||
# usually seq_lens: LongTensor, batch_size | |||
# return value: ByteTensor, batch_size x max_len | |||
batch_size = seq_lens.size(0) | |||
max_len = seq_lens.max() | |||
broadcast_arange = torch.arange(max_len).view(1, -1).repeat(batch_size, 1) | |||
mask = broadcast_arange.lt(seq_lens.float().view(-1, 1)) | |||
return mask | |||
class ContionalRandomField(nn.Module): | |||
def __init__(self, tag_size, include_start_end_trans=True): | |||
""" | |||
:param tag_size: int, num of tags | |||
:param include_start_end_trans: bool, whether to include start/end tag | |||
""" | |||
super(ContionalRandomField, self).__init__() | |||
self.include_start_end_trans = include_start_end_trans | |||
self.tag_size = tag_size | |||
# the meaning of entry in this matrix is (from_tag_id, to_tag_id) score | |||
self.transition_m = nn.Parameter(torch.randn(tag_size, tag_size)) | |||
if self.include_start_end_trans: | |||
self.start_scores = nn.Parameter(torch.randn(tag_size)) | |||
self.end_scores = nn.Parameter(torch.randn(tag_size)) | |||
self.reset_parameter() | |||
def reset_parameter(self): | |||
nn.init.xavier_normal_(self.transition_m) | |||
if self.include_start_end_trans: | |||
nn.init.normal_(self.start_scores) | |||
nn.init.normal_(self.end_scores) | |||
def _normalizer_likelihood(self, feats, masks): | |||
""" | |||
Computes the (batch_size,) denominator term for the log-likelihood, which is the | |||
sum of the likelihoods across all possible state sequences. | |||
:param feats:FloatTensor, batch_size x max_len x tag_size | |||
:param masks:ByteTensor, batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
batch_size, max_len, _ = feats.size() | |||
# alpha, batch_size x tag_size | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.view(1, -1) + feats[:, 0] | |||
else: | |||
alpha = feats[:, 0] | |||
# broadcast_trans_m, the meaning of entry in this matrix is [batch_idx, to_tag_id, from_tag_id] | |||
broadcast_trans_m = self.transition_m.permute( | |||
1, 0).unsqueeze(0).repeat(batch_size, 1, 1) | |||
# loop | |||
for i in range(1, max_len): | |||
emit_score = feats[:, i].unsqueeze(2) | |||
new_alpha = broadcast_trans_m + alpha.unsqueeze(1) + emit_score | |||
new_alpha = log_sum_exp(new_alpha, dim=2) | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
alpha = alpha + self.end_scores.view(1, -1) | |||
return log_sum_exp(alpha) | |||
    def _gold_score(self, feats, tags, masks):
""" | |||
Compute the score for the gold path. | |||
        :param feats: FloatTensor, batch_size x max_len x tag_size
:param tags: LongTensor, batch_size x max_len | |||
:param masks: ByteTensor, batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
batch_size, max_len, _ = feats.size() | |||
# alpha, B x 1 | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.view(1, -1).repeat(batch_size, 1).gather(dim=1, index=tags[:, :1]) + \ | |||
feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||
else: | |||
alpha = feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||
for i in range(1, max_len): | |||
trans_score = self.transition_m[( | |||
tags[:, i - 1], tags[:, i])].unsqueeze(1) | |||
emit_score = feats[:, i].gather(dim=1, index=tags[:, i:i + 1]) | |||
new_alpha = alpha + trans_score + emit_score | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
last_tag_index = masks.cumsum(dim=1, dtype=torch.long)[:, -1:] - 1 | |||
last_from_tag_id = tags.gather(dim=1, index=last_tag_index) | |||
trans_score = self.end_scores.view( | |||
1, -1).repeat(batch_size, 1).gather(dim=1, index=last_from_tag_id) | |||
alpha = alpha + trans_score | |||
return alpha.squeeze(1) | |||
def forward(self, feats, tags, masks): | |||
""" | |||
Calculate the neg log likelihood | |||
        :param feats: FloatTensor, batch_size x max_len x tag_size
:param tags:LongTensor, batch_size x max_len | |||
:param masks:ByteTensor batch_size x max_len | |||
:return:FloatTensor, batch_size | |||
""" | |||
all_path_score = self._normalizer_likelihood(feats, masks) | |||
        gold_path_score = self._gold_score(feats, tags, masks)
return all_path_score - gold_path_score | |||
def viterbi_decode(self, feats, masks): | |||
""" | |||
Given a feats matrix, return best decode path and best score. | |||
:param feats: | |||
:param masks: | |||
:return:List[Tuple(List, float)], | |||
""" | |||
batch_size, max_len, tag_size = feats.size() | |||
paths = torch.zeros(batch_size, max_len - 1, self.tag_size) | |||
if self.include_start_end_trans: | |||
alpha = self.start_scores.repeat(batch_size, 1) + feats[:, 0] | |||
else: | |||
alpha = feats[:, 0] | |||
for i in range(1, max_len): | |||
new_alpha = alpha.clone() | |||
for t in range(self.tag_size): | |||
pre_scores = self.transition_m[:, t].view( | |||
1, self.tag_size) + alpha | |||
max_scroe, indice = pre_scores.max(dim=1) | |||
new_alpha[:, t] = max_scroe + feats[:, i, t] | |||
paths[:, i - 1, t] = indice | |||
alpha = new_alpha * \ | |||
masks[:, i:i + 1].float() + alpha * \ | |||
(1 - masks[:, i:i + 1].float()) | |||
if self.include_start_end_trans: | |||
alpha += self.end_scores.view(1, -1) | |||
max_scroes, indice = alpha.max(dim=1) | |||
indice = indice.cpu().numpy() | |||
final_paths = [] | |||
paths = paths.cpu().numpy().astype(int) | |||
seq_lens = masks.cumsum(dim=1, dtype=torch.long)[:, -1] | |||
for b in range(batch_size): | |||
path = [indice[b]] | |||
for i in range(seq_lens[b] - 2, -1, -1): | |||
index = paths[b, i, path[-1]] | |||
path.append(index) | |||
final_paths.append(path[::-1]) | |||
return list(zip(final_paths, max_scroes.detach().cpu().numpy())) |
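# Illustrative usage sketch (assumption, not part of the original file): random inputs with
# hypothetical sizes, only to show the expected shapes of forward() and viterbi_decode().
if __name__ == "__main__":
    batch_size, max_len, tag_size = 2, 5, 4
    crf = ContionalRandomField(tag_size)
    feats = torch.randn(batch_size, max_len, tag_size)
    tags = torch.randint(0, tag_size, (batch_size, max_len))
    # masks for sequence lengths 5 and 3 (what seq_len_to_byte_mask would produce)
    masks = torch.ByteTensor([[1, 1, 1, 1, 1],
                              [1, 1, 1, 0, 0]])
    neg_log_likelihood = crf(feats, tags, masks)   # [batch_size]
    best_paths = crf.viterbi_decode(feats, masks)  # list of (tag id path, score) per sequence
    print(neg_log_likelihood, best_paths)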
@@ -0,0 +1,19 @@ | |||
import torch | |||
from fastNLP.modules.utils import mask_softmax | |||
class Attention(torch.nn.Module): | |||
def __init__(self, normalize=False): | |||
super(Attention, self).__init__() | |||
self.normalize = normalize | |||
def forward(self, query, memory, mask): | |||
similarities = self._atten_forward(query, memory) | |||
if self.normalize: | |||
return mask_softmax(similarities, mask) | |||
return similarities | |||
def _atten_forward(self, query, memory): | |||
raise NotImplementedError |
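# Illustrative sketch (assumption, not part of the original file): a subclass that scores
# query/memory similarity with a plain dot product.
class DotAttention(Attention):
    def __init__(self, normalize=False):
        super(DotAttention, self).__init__(normalize)

    def _atten_forward(self, query, memory):
        # query: [batch, query_len, dim], memory: [batch, mem_len, dim]
        # returns similarity scores of shape [batch, query_len, mem_len]
        return torch.bmm(query, memory.transpose(1, 2))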
@@ -0,0 +1,9 @@ | |||
from fastNLP.modules.attention.attention import Attention | |||
class LinearAttention(Attention): | |||
def __init__(self, normalize=False): | |||
super(LinearAttention, self).__init__(normalize) | |||
def _atten_forward(self, query, memory): | |||
raise NotImplementedError |
@@ -0,0 +1,9 @@ | |||
import torch | |||
def mask_softmax(matrix, mask): | |||
if mask is None: | |||
result = torch.nn.functional.softmax(matrix, dim=-1) | |||
else: | |||
raise NotImplementedError | |||
return result |
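# Illustrative sketch (assumption, not the project's implementation): one common way to handle
# the masked branch is to fill padded positions with -inf before the softmax.
def _masked_softmax_sketch(matrix, mask):
    # mask: 1 for valid positions, 0 for padding; same shape as (or broadcastable to) matrix
    filled = matrix.masked_fill(mask == 0, float("-inf"))
    return torch.nn.functional.softmax(filled, dim=-1)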
@@ -0,0 +1,110 @@ | |||
# Byte-compiled / optimized / DLL files | |||
__pycache__/ | |||
*.py[cod] | |||
*$py.class | |||
# C extensions | |||
*.so | |||
# Distribution / packaging | |||
.Python | |||
build/ | |||
develop-eggs/ | |||
dist/ | |||
downloads/ | |||
eggs/ | |||
.eggs/ | |||
lib/ | |||
lib64/ | |||
parts/ | |||
sdist/ | |||
var/ | |||
wheels/ | |||
*.egg-info/ | |||
.installed.cfg | |||
*.egg | |||
MANIFEST | |||
# PyInstaller | |||
# Usually these files are written by a python script from a template | |||
# before PyInstaller builds the exe, so as to inject date/other infos into it. | |||
*.manifest | |||
*.spec | |||
# Installer logs | |||
pip-log.txt | |||
pip-delete-this-directory.txt | |||
# Unit test / coverage reports | |||
htmlcov/ | |||
.tox/ | |||
.coverage | |||
.coverage.* | |||
.cache | |||
nosetests.xml | |||
coverage.xml | |||
*.cover | |||
.hypothesis/ | |||
.pytest_cache/ | |||
# Translations | |||
*.mo | |||
*.pot | |||
# Django stuff: | |||
*.log | |||
local_settings.py | |||
db.sqlite3 | |||
# Flask stuff: | |||
instance/ | |||
.webassets-cache | |||
# Scrapy stuff: | |||
.scrapy | |||
# Sphinx documentation | |||
docs/_build/ | |||
# PyBuilder | |||
target/ | |||
# Jupyter Notebook | |||
.ipynb_checkpoints | |||
# pyenv | |||
.python-version | |||
# celery beat schedule file | |||
celerybeat-schedule | |||
# SageMath parsed files | |||
*.sage.py | |||
# Environments | |||
.env | |||
.venv | |||
env/ | |||
venv/ | |||
ENV/ | |||
env.bak/ | |||
venv.bak/ | |||
# Spyder project settings | |||
.spyderproject | |||
.spyproject | |||
# Rope project settings | |||
.ropeproject | |||
# mkdocs documentation | |||
/site | |||
# mypy | |||
.mypy_cache | |||
#custom | |||
GoogleNews-vectors-negative300.bin/ | |||
GoogleNews-vectors-negative300.bin.gz | |||
models/ | |||
*.swp |
@@ -0,0 +1,77 @@ | |||
## Introduction | |||
This is the implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. | |||
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than 81.5% in the paper | |||
(by Jingyuan Liu @ Fudan University; Email: fdjingyuan@outlook.com. Discussion is welcome!)
## Requirement | |||
* python 3.6 | |||
* pytorch > 0.1 | |||
* numpy | |||
* gensim | |||
## Run | |||
STEP 1 | |||
Install the required packages, e.g. gensim (the other needed packages are installed the same way):
``` | |||
pip install gensim | |||
``` | |||
STEP 2 | |||
Download the MR dataset and the word2vec resources:
* MR dataset: you can download it from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: you can download the file from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
Since this file is larger than 1.5 GB, it is not included in the repository. If you download it, remember to modify the path in the function `def word_embeddings(path = './GoogleNews-vectors-negative300.bin/'):`.
STEP 3 | |||
Train the model:
``` | |||
python train.py | |||
``` | |||
You will see output like the following printed to the screen:
``` | |||
Epoch [1/20], Iter [100/192] Loss: 0.7008 | |||
Test Accuracy: 71.869159 % | |||
Epoch [2/20], Iter [100/192] Loss: 0.5957 | |||
Test Accuracy: 75.700935 % | |||
Epoch [3/20], Iter [100/192] Loss: 0.4934 | |||
Test Accuracy: 78.130841 % | |||
...... | |||
Epoch [20/20], Iter [100/192] Loss: 0.0364 | |||
Test Accuracy: 81.495327 % | |||
Best Accuracy: 82.616822 % | |||
Best Model: models/cnn.pkl | |||
``` | |||
## Hyperparameters | |||
According to the paper and my experiments, I set:
|Epoch|Kernel Size|dropout|learning rate|batch size| | |||
|---|---|---|---|---| | |||
|20|\(h,300,100\)|0.5|0.0001|50| | |||
h = [3,4,5] | |||
If the test accuracy does not improve, the learning rate is multiplied by 0.8.
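As a sketch, this decay rule can be written as a small helper around the optimizer used in train.py (the function name and the `factor` argument are illustrative; the script applies the rule inline):
```
def maybe_decay_lr(optimizer, acc, best_acc, lr, factor=0.8):
    # Keep the best accuracy seen so far; if the current accuracy is not an
    # improvement, multiply the learning rate by `factor` and write it back
    # into every parameter group of the optimizer so it actually takes effect.
    if best_acc is None or acc > best_acc:
        return acc, lr
    lr = lr * factor
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr
    return best_acc, lr
```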
## Result | |||
I only tried one dataset: MR. (The paper also evaluates other datasets: SST-1, SST-2, TREC, CR, MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel.
I have tried CNN-non-static: a model initialized with pre-trained vectors from word2vec.
All words, including the unknown ones that are randomly initialized, are fine-tuned for each task along with the pretrained vectors
(this variant has nearly the best performance and is the most difficult to implement among the four models).
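In code, "non-static" means the pretrained vectors are copied into the `nn.Embedding` layer and left trainable (this mirrors the embedding setup in model.py); a minimal sketch with a placeholder weight matrix:
```
import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 300                       # placeholder sizes
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, embed_dim))

embedding = nn.Embedding(vocab_size, embed_dim)
embedding.weight.data.copy_(torch.from_numpy(pretrained))
# No freezing: the weights keep requires_grad=True, so both the pretrained
# vectors and the randomly initialized unknown words are fine-tuned.
```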
|Dataset|Class Size|Best Result|Kim's Paper Result| | |||
|---|---|---|---| | |||
|MR|2|82.617%(CNN-non-static)|81.5%(CNN-nonstatic)| | |||
## Reference | |||
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) | |||
* https://github.com/Shawn1993/cnn-text-classification-pytorch | |||
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py | |||
@@ -0,0 +1,136 @@ | |||
import codecs | |||
import random | |||
import re | |||
import gensim | |||
import numpy as np | |||
from gensim import corpora | |||
from torch.utils.data import Dataset | |||
def clean_str(string): | |||
""" | |||
Tokenization/string cleaning for all datasets except for SST. | |||
Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py | |||
""" | |||
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) | |||
string = re.sub(r"\'s", " \'s", string) | |||
string = re.sub(r"\'ve", " \'ve", string) | |||
string = re.sub(r"n\'t", " n\'t", string) | |||
string = re.sub(r"\'re", " \'re", string) | |||
string = re.sub(r"\'d", " \'d", string) | |||
string = re.sub(r"\'ll", " \'ll", string) | |||
string = re.sub(r",", " , ", string) | |||
string = re.sub(r"!", " ! ", string) | |||
string = re.sub(r"\(", " \( ", string) | |||
string = re.sub(r"\)", " \) ", string) | |||
string = re.sub(r"\?", " \? ", string) | |||
string = re.sub(r"\s{2,}", " ", string) | |||
return string.strip() | |||
def pad_sentences(sentence, padding_word=" <PAD/>"): | |||
sequence_length = 64 | |||
sent = sentence.split() | |||
padded_sentence = sentence + padding_word * (sequence_length - len(sent)) | |||
return padded_sentence | |||
# data loader | |||
class MRDataset(Dataset): | |||
def __init__(self): | |||
        # load positive and negative sentences from files
with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f: | |||
positive_examples = list(f.readlines()) | |||
with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f: | |||
negative_examples = list(f.readlines()) | |||
        # s.strip(): remove the trailing "\n"; then clean_str(); then pad_sentences()
positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples] | |||
negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples] | |||
self.examples = positive_examples + negative_examples | |||
self.sentences_texts = [sample.split() for sample in self.examples] | |||
# word dictionary | |||
dictionary = corpora.Dictionary(self.sentences_texts) | |||
self.word2id_dict = dictionary.token2id # transform to dict, like {"human":0, "a":1,...} | |||
        # set labels: positive is 1; negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels
# transform word to id | |||
self.MRDataset_wordid = \ | |||
[( | |||
np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64), | |||
sent[1] | |||
) for sent in self.MRDataset_frame] | |||
    def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
        # load the pretrained Google News word2vec vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
word_dict = self.word2id_dict | |||
embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300)) | |||
for word in word_dict: | |||
word_id = word_dict[word] | |||
if word in model.wv.vocab: | |||
embedding_weights[word_id, :] = model[word] | |||
return embedding_weights | |||
def __len__(self): | |||
return len(self.MRDataset_frame) | |||
def __getitem__(self, idx): | |||
sample = self.MRDataset_wordid[idx] | |||
return sample | |||
def getsent(self, idx): | |||
sample = self.MRDataset_wordid[idx][0] | |||
return sample | |||
def getlabel(self, idx): | |||
label = self.MRDataset_wordid[idx][1] | |||
return label | |||
def word2id(self): | |||
return self.word2id_dict | |||
def id2word(self): | |||
id2word_dict = dict([val, key] for key, val in self.word2id_dict.items()) | |||
return id2word_dict | |||
class train_set(Dataset): | |||
def __init__(self, samples): | |||
self.train_frame = samples | |||
def __len__(self): | |||
return len(self.train_frame) | |||
def __getitem__(self, idx): | |||
return self.train_frame[idx] | |||
class test_set(Dataset): | |||
def __init__(self, samples): | |||
self.test_frame = samples | |||
def __len__(self): | |||
return len(self.test_frame) | |||
def __getitem__(self, idx): | |||
return self.test_frame[idx] |
@@ -0,0 +1,35 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class CNN_text(nn.Module): | |||
def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, | |||
batchsize=50, pretrained_embeddings=None): | |||
super(CNN_text, self).__init__() | |||
self.embedding = nn.Embedding(embed_num, embed_dim) | |||
self.dropout = nn.Dropout(dropout) | |||
if pretrained_embeddings is not None: | |||
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings)) | |||
# the network structure | |||
# Conv2d: input- N,C,H,W output- (50,100,62,1) | |||
self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h]) | |||
self.fc1 = nn.Linear(300, 2) | |||
    def max_pooling(self, x, conv):
        # helper (unused by forward, which inlines the same operations):
        # one convolution followed by max-over-time pooling
        x = F.relu(conv(x)).squeeze(3)  # N,C,L - (50,100,62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        # x.size(2)=62 squeeze: (50,100,1) -> (50,100)
        return x
def forward(self, x): | |||
x = self.embedding(x) # output: (N,H,W) = (50,64,300) | |||
x = x.unsqueeze(1) # (N,C,H,W) | |||
x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1] # [N, C, H(50,100,62),(50,100,61),(50,100,60)] | |||
x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [N,C(50,100),(50,100),(50,100)] | |||
x = torch.cat(x, 1) | |||
x = self.dropout(x) | |||
x = self.fc1(x) | |||
return x |
@@ -0,0 +1,93 @@ | |||
import os | |||
import torch
import torch.nn as nn
import torch.utils.data

import dataset as dst
from model import CNN_text
from torch.autograd import Variable | |||
# Hyper Parameters | |||
batch_size = 50 | |||
learning_rate = 0.0001 | |||
num_epochs = 20 | |||
cuda = True | |||
# split Dataset | |||
dataset = dst.MRDataset() | |||
length = len(dataset) | |||
train_dataset = dataset[:int(0.9 * length)] | |||
test_dataset = dataset[int(0.9 * length):] | |||
train_dataset = dst.train_set(train_dataset) | |||
test_dataset = dst.test_set(test_dataset) | |||
# Data Loader | |||
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, | |||
batch_size=batch_size, | |||
shuffle=True) | |||
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, | |||
batch_size=batch_size, | |||
shuffle=False) | |||
# cnn | |||
cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings()) | |||
if cuda: | |||
cnn.cuda() | |||
# Loss and Optimizer | |||
criterion = nn.CrossEntropyLoss() | |||
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate) | |||
# train and test | |||
best_acc = None | |||
for epoch in range(num_epochs): | |||
# Train the Model | |||
cnn.train() | |||
for i, (sents, labels) in enumerate(train_loader): | |||
sents = Variable(sents) | |||
labels = Variable(labels) | |||
if cuda: | |||
sents = sents.cuda() | |||
labels = labels.cuda() | |||
optimizer.zero_grad() | |||
outputs = cnn(sents) | |||
loss = criterion(outputs, labels) | |||
loss.backward() | |||
optimizer.step() | |||
if (i + 1) % 100 == 0: | |||
print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' | |||
% (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0])) | |||
# Test the Model | |||
cnn.eval() | |||
correct = 0 | |||
total = 0 | |||
for sents, labels in test_loader: | |||
sents = Variable(sents) | |||
if cuda: | |||
sents = sents.cuda() | |||
labels = labels.cuda() | |||
outputs = cnn(sents) | |||
_, predicted = torch.max(outputs.data, 1) | |||
total += labels.size(0) | |||
correct += (predicted == labels).sum() | |||
acc = 100. * correct / total | |||
print('Test Accuracy: %f %%' % (acc)) | |||
if best_acc is None or acc > best_acc: | |||
best_acc = acc | |||
if os.path.exists("models") is False: | |||
os.makedirs("models") | |||
torch.save(cnn.state_dict(), 'models/cnn.pkl') | |||
    else:
        # decay the learning rate and update the optimizer so the new rate takes effect
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate
print("Best Accuracy: %f %%" % best_acc) | |||
print("Best Model: models/cnn.pkl") |
@@ -0,0 +1,21 @@ | |||
MIT License | |||
Copyright (c) 2017 | |||
Permission is hereby granted, free of charge, to any person obtaining a copy | |||
of this software and associated documentation files (the "Software"), to deal | |||
in the Software without restriction, including without limitation the rights | |||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||
copies of the Software, and to permit persons to whom the Software is | |||
furnished to do so, subject to the following conditions: | |||
The above copyright notice and this permission notice shall be included in all | |||
copies or substantial portions of the Software. | |||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||
SOFTWARE. |
@@ -0,0 +1,40 @@ | |||
# PyTorch-Character-Aware-Neural-Language-Model | |||
This is the PyTorch implementation of character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim. | |||
## Requirements
The code is run and tested with **Python 3.5.2** and **PyTorch 0.3.1**. | |||
## HyperParameters | |||
| HyperParam | value | | |||
| ------ | :-------| | |||
| LSTM batch size | 20 | | |||
| LSTM sequence length | 35 | | |||
| LSTM hidden units | 300 | | |||
| epochs | 35 | | |||
| initial learning rate | 1.0 | | |||
| character embedding dimension | 15 | | |||
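These values correspond to the configuration namedtuple built in train.py; a sketch (max_word_len is computed from the corpus during preprocessing, so the value here is a placeholder):
```
from collections import namedtuple

char_embedding_dim = 15          # passed to the charLM constructor separately

Options = namedtuple("Options", [
    "cnn_batch_size", "init_lr", "lstm_seq_len",
    "max_word_len", "lstm_batch_size", "epochs",
    "word_embed_dim"])

opt = Options(cnn_batch_size=35 * 20,   # lstm_seq_len * lstm_batch_size
              init_lr=1.0,
              lstm_seq_len=35,
              max_word_len=21,          # placeholder; set from the data
              lstm_batch_size=20,
              epochs=35,
              word_embed_dim=300)       # also the number of LSTM hidden units
```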
## Demo | |||
Train the model with split train/valid/test data. | |||
`python train.py` | |||
The trained model will be saved in `cache/net.pkl`.
Test the model. | |||
`python test.py` | |||
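Under the hood, train.py saves the whole model object to `cache/net.pkl` and test.py simply reloads it; a sketch:
```
import torch

net = torch.load("cache/net.pkl")  # full pickled model saved by train.py
net.eval()                         # switch to evaluation mode before testing
```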
Best result on test set: | |||
PPL=127.2163
cross entropy loss=4.8459 | |||
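The two numbers are consistent: perplexity is the exponential of the average cross-entropy loss (in nats), as a quick check shows:
```
import math

cross_entropy = 4.8459
print(math.exp(cross_entropy))  # ~127.2, matching the reported PPL
```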
## Acknowledgement | |||
This implementation borrowed ideas from | |||
https://github.com/jarfo/kchar | |||
https://github.com/cronos123/Character-Aware-Neural-Language-Models | |||
@@ -0,0 +1,145 @@ | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
class Highway(nn.Module): | |||
"""Highway network""" | |||
def __init__(self, input_size): | |||
super(Highway, self).__init__() | |||
self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||
self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||
def forward(self, x): | |||
t = F.sigmoid(self.fc1(x)) | |||
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||
class charLM(nn.Module): | |||
"""CNN + highway network + LSTM | |||
# Input: | |||
4D tensor with shape [batch_size, in_channel, height, width] | |||
# Output: | |||
2D Tensor with shape [batch_size, vocab_size] | |||
# Arguments: | |||
        char_emb_dim: the size of each character embedding
        word_emb_dim: the size of each word embedding
vocab_size: num of unique words | |||
num_char: num of characters | |||
use_gpu: True or False | |||
""" | |||
def __init__(self, char_emb_dim, word_emb_dim, | |||
vocab_size, num_char, use_gpu): | |||
super(charLM, self).__init__() | |||
self.char_emb_dim = char_emb_dim | |||
self.word_emb_dim = word_emb_dim | |||
self.vocab_size = vocab_size | |||
        # character embedding layer
self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||
# convolutions of filters with different sizes | |||
        # use an nn.ModuleList so the convolution parameters are registered with
        # the module (and therefore visible to the optimizer and the state_dict)
        self.convolutions = nn.ModuleList()
# list of tuples: (the number of filter, width) | |||
self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||
for out_channel, filter_width in self.filter_num_width: | |||
self.convolutions.append( | |||
nn.Conv2d( | |||
1, # in_channel | |||
out_channel, # out_channel | |||
kernel_size=(char_emb_dim, filter_width), # (height, width) | |||
bias=True | |||
) | |||
) | |||
self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||
self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||
# highway net | |||
self.highway1 = Highway(self.highway_input_dim) | |||
self.highway2 = Highway(self.highway_input_dim) | |||
# LSTM | |||
self.lstm_num_layers = 2 | |||
self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||
hidden_size=self.word_emb_dim, | |||
num_layers=self.lstm_num_layers, | |||
bias=True, | |||
dropout=0.5, | |||
batch_first=True) | |||
# output layer | |||
self.dropout = nn.Dropout(p=0.5) | |||
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||
if use_gpu is True: | |||
for x in range(len(self.convolutions)): | |||
self.convolutions[x] = self.convolutions[x].cuda() | |||
self.highway1 = self.highway1.cuda() | |||
self.highway2 = self.highway2.cuda() | |||
self.lstm = self.lstm.cuda() | |||
self.dropout = self.dropout.cuda() | |||
self.char_embed = self.char_embed.cuda() | |||
self.linear = self.linear.cuda() | |||
self.batch_norm = self.batch_norm.cuda() | |||
def forward(self, x, hidden): | |||
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||
# Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||
lstm_batch_size = x.size()[0] | |||
lstm_seq_len = x.size()[1] | |||
x = x.contiguous().view(-1, x.size()[2]) | |||
# [num_seq*seq_len, max_word_len+2] | |||
x = self.char_embed(x) | |||
# [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||
x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||
# [num_seq*seq_len, 1, max_word_len+2, char_emb_dim] | |||
x = self.conv_layers(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.batch_norm(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = self.highway1(x) | |||
x = self.highway2(x) | |||
# [num_seq*seq_len, total_num_filters] | |||
x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) | |||
# [num_seq, seq_len, total_num_filters] | |||
x, hidden = self.lstm(x, hidden) | |||
        # [num_seq, seq_len, hidden_size] (batch_first=True)
        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]
x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1) | |||
# [num_seq*seq_len, hidden_size] | |||
x = self.linear(x) | |||
# [num_seq*seq_len, vocab_size] | |||
return x, hidden | |||
def conv_layers(self, x): | |||
chosen_list = list() | |||
for conv in self.convolutions: | |||
feature_map = F.tanh(conv(x)) | |||
# (batch_size, out_channel, 1, max_word_len-width+1) | |||
chosen = torch.max(feature_map, 3)[0] | |||
# (batch_size, out_channel, 1) | |||
chosen = chosen.squeeze() | |||
# (batch_size, out_channel) | |||
chosen_list.append(chosen) | |||
# (batch_size, total_num_filers) | |||
return torch.cat(chosen_list, 1) |
@@ -0,0 +1,117 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
from torch.autograd import Variable | |||
from utilities import * | |||
def to_var(x): | |||
if torch.cuda.is_available(): | |||
x = x.cuda() | |||
return Variable(x) | |||
def test(net, data, opt): | |||
net.eval() | |||
test_input = torch.from_numpy(data.test_input) | |||
test_label = torch.from_numpy(data.test_label) | |||
num_seq = test_input.size()[0] // opt.lstm_seq_len | |||
test_input = test_input[:num_seq * opt.lstm_seq_len, :] | |||
# [num_seq, seq_len, max_word_len+2] | |||
test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
criterion = nn.CrossEntropyLoss() | |||
loss_list = [] | |||
num_hits = 0 | |||
total = 0 | |||
iterations = test_input.size()[0] // opt.lstm_batch_size | |||
test_generator = batch_generator(test_input, opt.lstm_batch_size) | |||
label_generator = batch_generator(test_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||
add_loss = 0.0 | |||
for t in range(iterations): | |||
batch_input = test_generator.__next__() | |||
batch_label = label_generator.__next__() | |||
net.zero_grad() | |||
hidden = [state.detach() for state in hidden] | |||
test_output, hidden = net(to_var(batch_input), hidden) | |||
test_loss = criterion(test_output, to_var(batch_label)).data | |||
loss_list.append(test_loss) | |||
add_loss += test_loss | |||
print("Test Loss={0:.4f}".format(float(add_loss) / iterations)) | |||
print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations)))) | |||
############################################################# | |||
if __name__ == "__main__": | |||
word_embed_dim = 300 | |||
char_embedding_dim = 15 | |||
if os.path.exists("cache/prep.pt") is False: | |||
print("Cannot find prep.pt") | |||
objetcs = torch.load("cache/prep.pt") | |||
word_dict = objetcs["word_dict"] | |||
char_dict = objetcs["char_dict"] | |||
reverse_word_dict = objetcs["reverse_word_dict"] | |||
max_word_len = objetcs["max_word_len"] | |||
num_words = len(word_dict) | |||
print("word/char dictionary built. Start making inputs.") | |||
if os.path.exists("cache/data_sets.pt") is False: | |||
test_text = read_data("./test.txt") | |||
test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||
        # Labels are next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])
        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
        # Only the test split is built here; the train split comes from the full
        # data_sets.pt written by train.py, so leave the train fields empty.
        train_set, train_label = None, None
else: | |||
data_sets = torch.load("cache/data_sets.pt") | |||
test_set = data_sets["test"] | |||
test_label = data_sets["tlabel"] | |||
train_set = data_sets["tdata"] | |||
train_label = data_sets["trlabel"] | |||
DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label ") | |||
data = DataTuple(test_input=test_set, | |||
test_label=test_label, train_label=train_label, train_input=train_set) | |||
print("Loaded data sets. Start building network.") | |||
USE_GPU = True | |||
cnn_batch_size = 700 | |||
lstm_seq_len = 35 | |||
lstm_batch_size = 20 | |||
net = torch.load("cache/net.pkl") | |||
Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len", | |||
"max_word_len", "lstm_batch_size", "word_embed_dim"]) | |||
opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size, | |||
lstm_seq_len=lstm_seq_len, | |||
max_word_len=max_word_len, | |||
lstm_batch_size=lstm_batch_size, | |||
word_embed_dim=word_embed_dim) | |||
print("Network built. Start testing.") | |||
test(net, data, opt) |
@@ -0,0 +1,263 @@ | |||
import os | |||
from collections import namedtuple | |||
import numpy as np | |||
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from model import charLM
from test import test
from utilities import *
def preprocess(): | |||
word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "test.txt") | |||
num_words = len(word_dict) | |||
num_char = len(char_dict) | |||
char_dict["BOW"] = num_char + 1 | |||
char_dict["EOW"] = num_char + 2 | |||
char_dict["PAD"] = 0 | |||
# dict of (int, string) | |||
reverse_word_dict = {value: key for key, value in word_dict.items()} | |||
max_word_len = max([len(word) for word in word_dict]) | |||
objects = { | |||
"word_dict": word_dict, | |||
"char_dict": char_dict, | |||
"reverse_word_dict": reverse_word_dict, | |||
"max_word_len": max_word_len | |||
} | |||
torch.save(objects, "cache/prep.pt") | |||
print("Preprocess done.") | |||
def to_var(x): | |||
if torch.cuda.is_available(): | |||
x = x.cuda() | |||
return Variable(x) | |||
def train(net, data, opt): | |||
""" | |||
:param net: the pytorch models | |||
:param data: numpy array | |||
:param opt: named tuple | |||
1. random seed | |||
2. define local input | |||
3. training settting: learning rate, loss, etc | |||
4. main loop epoch | |||
5. batchify | |||
6. validation | |||
7. save models | |||
""" | |||
torch.manual_seed(1024) | |||
train_input = torch.from_numpy(data.train_input) | |||
train_label = torch.from_numpy(data.train_label) | |||
valid_input = torch.from_numpy(data.valid_input) | |||
valid_label = torch.from_numpy(data.valid_label) | |||
# [num_seq, seq_len, max_word_len+2] | |||
num_seq = train_input.size()[0] // opt.lstm_seq_len | |||
train_input = train_input[:num_seq * opt.lstm_seq_len, :] | |||
train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
num_seq = valid_input.size()[0] // opt.lstm_seq_len | |||
valid_input = valid_input[:num_seq * opt.lstm_seq_len, :] | |||
valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len + 2) | |||
num_epoch = opt.epochs | |||
num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size | |||
learning_rate = opt.init_lr | |||
old_PPL = 100000 | |||
best_PPL = 100000 | |||
# Log-SoftMax | |||
criterion = nn.CrossEntropyLoss() | |||
# word_emb_dim == hidden_size / num of hidden units | |||
hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||
for epoch in range(num_epoch): | |||
################ Validation #################### | |||
net.eval() | |||
loss_batch = [] | |||
PPL_batch = [] | |||
iterations = valid_input.size()[0] // opt.lstm_batch_size | |||
valid_generator = batch_generator(valid_input, opt.lstm_batch_size) | |||
vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
for t in range(iterations): | |||
batch_input = valid_generator.__next__() | |||
batch_label = vlabel_generator.__next__() | |||
hidden = [state.detach() for state in hidden] | |||
valid_output, hidden = net(to_var(batch_input), hidden) | |||
length = valid_output.size()[0] | |||
# [num_sample-1, len(word_dict)] vs [num_sample-1] | |||
valid_loss = criterion(valid_output, to_var(batch_label)) | |||
PPL = torch.exp(valid_loss.data) | |||
loss_batch.append(float(valid_loss)) | |||
PPL_batch.append(float(PPL)) | |||
PPL = np.mean(PPL_batch) | |||
print("[epoch {}] valid PPL={}".format(epoch, PPL)) | |||
print("valid loss={}".format(np.mean(loss_batch))) | |||
print("PPL decrease={}".format(float(old_PPL - PPL))) | |||
# Preserve the best models | |||
if best_PPL > PPL: | |||
best_PPL = PPL | |||
torch.save(net.state_dict(), "cache/models.pt") | |||
torch.save(net, "cache/net.pkl") | |||
# Adjust the learning rate | |||
if float(old_PPL - PPL) <= 1.0: | |||
learning_rate /= 2 | |||
print("halved lr:{}".format(learning_rate)) | |||
old_PPL = PPL | |||
################################################## | |||
#################### Training #################### | |||
net.train() | |||
optimizer = optim.SGD(net.parameters(), | |||
lr=learning_rate, | |||
momentum=0.85) | |||
# split the first dim | |||
input_generator = batch_generator(train_input, opt.lstm_batch_size) | |||
label_generator = batch_generator(train_label, opt.lstm_batch_size * opt.lstm_seq_len) | |||
for t in range(num_iter_per_epoch): | |||
batch_input = input_generator.__next__() | |||
batch_label = label_generator.__next__() | |||
# detach hidden state of LSTM from last batch | |||
hidden = [state.detach() for state in hidden] | |||
output, hidden = net(to_var(batch_input), hidden) | |||
# [num_word, vocab_size] | |||
loss = criterion(output, to_var(batch_label)) | |||
net.zero_grad() | |||
loss.backward() | |||
torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2) | |||
optimizer.step() | |||
if (t + 1) % 100 == 0: | |||
print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch + 1, | |||
t + 1, float(loss.data), | |||
float(np.exp(loss.data)))) | |||
torch.save(net.state_dict(), "cache/models.pt") | |||
print("Training finished.") | |||
################################################################ | |||
if __name__ == "__main__": | |||
word_embed_dim = 300 | |||
char_embedding_dim = 15 | |||
if os.path.exists("cache/prep.pt") is False: | |||
preprocess() | |||
    objects = torch.load("cache/prep.pt")
    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
num_words = len(word_dict) | |||
print("word/char dictionary built. Start making inputs.") | |||
if os.path.exists("cache/data_sets.pt") is False: | |||
train_text = read_data("./train.txt") | |||
valid_text = read_data("./charlm.txt") | |||
test_text = read_data("./test.txt") | |||
train_set = np.array(text2vec(train_text, char_dict, max_word_len)) | |||
valid_set = np.array(text2vec(valid_text, char_dict, max_word_len)) | |||
test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||
# Labels are next-word index in word_dict with the same length as inputs | |||
train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]]) | |||
valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]]) | |||
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | |||
category = {"tdata": train_set, "vdata": valid_set, "test": test_set, | |||
"trlabel": train_label, "vlabel": valid_label, "tlabel": test_label} | |||
torch.save(category, "cache/data_sets.pt") | |||
else: | |||
data_sets = torch.load("cache/data_sets.pt") | |||
train_set = data_sets["tdata"] | |||
valid_set = data_sets["vdata"] | |||
test_set = data_sets["test"] | |||
train_label = data_sets["trlabel"] | |||
valid_label = data_sets["vlabel"] | |||
test_label = data_sets["tlabel"] | |||
DataTuple = namedtuple("DataTuple", | |||
"train_input train_label valid_input valid_label test_input test_label") | |||
data = DataTuple(train_input=train_set, | |||
train_label=train_label, | |||
valid_input=valid_set, | |||
valid_label=valid_label, | |||
test_input=test_set, | |||
test_label=test_label) | |||
print("Loaded data sets. Start building network.") | |||
USE_GPU = True | |||
cnn_batch_size = 700 | |||
lstm_seq_len = 35 | |||
lstm_batch_size = 20 | |||
# cnn_batch_size == lstm_seq_len * lstm_batch_size | |||
net = charLM(char_embedding_dim, | |||
word_embed_dim, | |||
num_words, | |||
len(char_dict), | |||
use_gpu=USE_GPU) | |||
for param in net.parameters(): | |||
nn.init.uniform(param.data, -0.05, 0.05) | |||
Options = namedtuple("Options", [ | |||
"cnn_batch_size", "init_lr", "lstm_seq_len", | |||
"max_word_len", "lstm_batch_size", "epochs", | |||
"word_embed_dim"]) | |||
opt = Options(cnn_batch_size=lstm_seq_len * lstm_batch_size, | |||
init_lr=1.0, | |||
lstm_seq_len=lstm_seq_len, | |||
max_word_len=max_word_len, | |||
lstm_batch_size=lstm_batch_size, | |||
epochs=35, | |||
word_embed_dim=word_embed_dim) | |||
print("Network built. Start training.") | |||
# You can stop training anytime by "ctrl+C" | |||
try: | |||
train(net, data, opt) | |||
except KeyboardInterrupt: | |||
print('-' * 89) | |||
print('Exiting from training early') | |||
torch.save(net, "cache/net.pkl") | |||
print("save net") | |||
test(net, data, opt) |
@@ -0,0 +1,82 @@ | |||
import torch | |||
import torch.nn.functional as F | |||
def batch_generator(x, batch_size): | |||
# x: [num_words, in_channel, height, width] | |||
# partitions x into batches | |||
num_step = x.size()[0] // batch_size | |||
for t in range(num_step): | |||
yield x[t * batch_size:(t + 1) * batch_size] | |||
def text2vec(words, char_dict, max_word_len): | |||
""" Return list of list of int """ | |||
word_vec = [] | |||
for word in words: | |||
vec = [char_dict[ch] for ch in word] | |||
if len(vec) < max_word_len: | |||
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||
word_vec.append(vec) | |||
return word_vec | |||
def seq2vec(input_words, char_embedding, char_embedding_dim, char_table): | |||
""" convert the input strings into character embeddings """ | |||
# input_words == list of string | |||
# char_embedding == torch.nn.Embedding | |||
# char_embedding_dim == int | |||
# char_table == list of unique chars | |||
# Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2] | |||
max_word_len = max([len(word) for word in input_words]) | |||
print("max_word_len={}".format(max_word_len)) | |||
tensor_list = [] | |||
start_column = torch.ones(char_embedding_dim, 1) | |||
end_column = torch.ones(char_embedding_dim, 1) | |||
for word in input_words: | |||
        # convert the string into a matrix of character embeddings
        # (char_embedding_lookup is assumed to be provided elsewhere; it is not defined in this file)
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
# add start and end columns | |||
word_encoding = torch.cat([start_column, word_encoding, end_column], 1) | |||
# zero-pad right columns | |||
word_encoding = F.pad(word_encoding, (0, max_word_len - word_encoding.size()[1] + 2)).data | |||
# create dimension | |||
word_encoding = word_encoding.unsqueeze(0) | |||
tensor_list.append(word_encoding) | |||
return torch.cat(tensor_list, 0) | |||
def read_data(file_name): | |||
# Return: list of strings | |||
with open(file_name, 'r') as f: | |||
corpus = f.read().lower() | |||
import re | |||
corpus = re.sub(r"<unk>", "unk", corpus) | |||
return corpus.split() | |||
def get_char_dict(vocabulary): | |||
# vocabulary == dict of (word, int) | |||
# Return: dict of (char, int), starting from 1 | |||
char_dict = dict() | |||
count = 1 | |||
for word in vocabulary: | |||
for ch in word: | |||
if ch not in char_dict: | |||
char_dict[ch] = count | |||
count += 1 | |||
return char_dict | |||
def create_word_char_dict(*file_name): | |||
text = [] | |||
for file in file_name: | |||
text += read_data(file) | |||
word_dict = {word: ix for ix, word in enumerate(set(text))} | |||
char_dict = get_char_dict(word_dict) | |||
return word_dict, char_dict |
@@ -0,0 +1,36 @@ | |||
## Introduction | |||
This is the implementation of [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf) paper in PyTorch. | |||
* Dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews | |||
* Use [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to tokenize documents and sentences | |||
* Both CPU & GPU support | |||
* The best accuracy is 71%, matching the performance reported in the paper
## Requirement | |||
* python 3.6 | |||
* pytorch = 0.3.0 | |||
* numpy | |||
* gensim | |||
* nltk | |||
* coreNLP | |||
## Parameters | |||
According to the paper and my experiments, I set the model parameters:
|word embedding dimension|GRU hidden size|GRU layer|word/sentence context vector dimension| | |||
|---|---|---|---| | |||
|200|50|1|100| | |||
And the training parameters: | |||
|Epoch|learning rate|momentum|batch size| | |||
|---|---|---|---| | |||
|3|0.01|0.9|64| | |||
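These settings map directly onto the network and optimizer construction in train.py; a sketch:
```
import torch
from model import HAN

net = HAN(input_size=200,                     # word embedding dimension
          output_size=5,                      # 1-5 star review classes
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)

optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
```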
## Run | |||
1. Prepare the dataset. Download the [data set](https://www.yelp.com/dataset) and unzip the customer reviews as a file. Use preprocess.py to transform the file into a data set for model input (a command sketch follows this list).
2. Train the model. The word embeddings of the training data are stored in 'yelp.word2vec'. The model is trained and automatically saved to 'models.dict'.
``` | |||
python train.py
``` | |||
3. Test the model. | |||
``` | |||
python evaluate.py
``` |
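For step 1, the tokenization script is run directly; before running it, point the `JAVAHOME` and Stanford CoreNLP jar paths inside preprocess.py at your local installation:
```
python preprocess.py
```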
@@ -0,0 +1,45 @@ | |||
from model import * | |||
from train import * | |||
def evaluate(net, dataset, batch_size=64, use_cuda=False):
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate, num_workers=0)
count = 0 | |||
if use_cuda: | |||
net.cuda() | |||
for i, batch_samples in enumerate(dataloader): | |||
x, y = batch_samples | |||
doc_list = [] | |||
for sample in x: | |||
doc = [] | |||
for sent_vec in sample: | |||
if use_cuda: | |||
sent_vec = sent_vec.cuda() | |||
doc.append(Variable(sent_vec, volatile=True)) | |||
doc_list.append(pack_sequence(doc)) | |||
if use_cuda: | |||
y = y.cuda() | |||
predicts = net(doc_list) | |||
p, idx = torch.max(predicts, dim=1) | |||
idx = idx.data | |||
count += torch.sum(torch.eq(idx, y)) | |||
return count | |||
if __name__ == '__main__': | |||
''' | |||
Evaluate the performance of models | |||
''' | |||
from gensim.models import Word2Vec | |||
embed_model = Word2Vec.load('yelp.word2vec') | |||
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||
del embed_model | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
net.load_state_dict(torch.load('models.dict')) | |||
test_dataset = YelpDocSet('reviews', 199, 4, embedding) | |||
    correct = evaluate(net, test_dataset, use_cuda=True)
print('accuracy {}'.format(correct / len(test_dataset))) |
@@ -0,0 +1,113 @@ | |||
import torch | |||
import torch.nn as nn | |||
from torch.autograd import Variable | |||
def pack_sequence(tensor_seq, padding_value=0.0): | |||
if len(tensor_seq) <= 0: | |||
return | |||
length = [v.size(0) for v in tensor_seq] | |||
max_len = max(length) | |||
size = [len(tensor_seq), max_len] | |||
size.extend(list(tensor_seq[0].size()[1:])) | |||
ans = torch.Tensor(*size).fill_(padding_value) | |||
if tensor_seq[0].data.is_cuda: | |||
ans = ans.cuda() | |||
ans = Variable(ans) | |||
for i, v in enumerate(tensor_seq): | |||
ans[i, :length[i], :] = v | |||
return ans | |||
class HAN(nn.Module): | |||
def __init__(self, input_size, output_size, | |||
word_hidden_size, word_num_layers, word_context_size, | |||
sent_hidden_size, sent_num_layers, sent_context_size): | |||
super(HAN, self).__init__() | |||
self.word_layer = AttentionNet(input_size, | |||
word_hidden_size, | |||
word_num_layers, | |||
word_context_size) | |||
self.sent_layer = AttentionNet(2 * word_hidden_size, | |||
sent_hidden_size, | |||
sent_num_layers, | |||
sent_context_size) | |||
self.output_layer = nn.Linear(2 * sent_hidden_size, output_size) | |||
self.softmax = nn.LogSoftmax(dim=1) | |||
def forward(self, batch_doc): | |||
# input is a sequence of matrix | |||
doc_vec_list = [] | |||
for doc in batch_doc: | |||
sent_mat = self.word_layer(doc) # doc's dim (num_sent, seq_len, word_dim) | |||
doc_vec_list.append(sent_mat) # sent_mat's dim (num_sent, vec_dim) | |||
doc_vec = self.sent_layer(pack_sequence(doc_vec_list)) | |||
output = self.softmax(self.output_layer(doc_vec)) | |||
return output | |||
class AttentionNet(nn.Module): | |||
def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size): | |||
super(AttentionNet, self).__init__() | |||
self.input_size = input_size | |||
self.gru_hidden_size = gru_hidden_size | |||
self.gru_num_layers = gru_num_layers | |||
self.context_vec_size = context_vec_size | |||
# Encoder | |||
self.gru = nn.GRU(input_size=input_size, | |||
hidden_size=gru_hidden_size, | |||
num_layers=gru_num_layers, | |||
batch_first=True, | |||
bidirectional=True) | |||
# Attention | |||
self.fc = nn.Linear(2 * gru_hidden_size, context_vec_size) | |||
self.tanh = nn.Tanh() | |||
self.softmax = nn.Softmax(dim=1) | |||
# context vector | |||
self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1)) | |||
self.context_vec.data.uniform_(-0.1, 0.1) | |||
def forward(self, inputs): | |||
# GRU part | |||
h_t, hidden = self.gru(inputs) # inputs's dim (batch_size, seq_len, word_dim) | |||
u = self.tanh(self.fc(h_t)) | |||
# Attention part | |||
alpha = self.softmax(torch.matmul(u, self.context_vec)) # u's dim (batch_size, seq_len, context_vec_size) | |||
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha) # alpha's dim (batch_size, seq_len, 1) | |||
return torch.squeeze(output, dim=2) # output's dim (batch_size, 2*hidden_size, 1) | |||
if __name__ == '__main__': | |||
''' | |||
    Test the model's correctness
''' | |||
import numpy as np | |||
use_cuda = True | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||
criterion = nn.NLLLoss() | |||
test_time = 10 | |||
batch_size = 64 | |||
if use_cuda: | |||
net.cuda() | |||
print('test training') | |||
for step in range(test_time): | |||
x_data = [torch.randn(np.random.randint(1, 10), 200, 200) for i in range(batch_size)] | |||
y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)]) | |||
if use_cuda: | |||
x_data = [x_i.cuda() for x_i in x_data] | |||
y_data = y_data.cuda() | |||
x = [Variable(x_i) for x_i in x_data] | |||
y = Variable(y_data) | |||
predict = net(x) | |||
loss = criterion(predict, y) | |||
optimizer.zero_grad() | |||
loss.backward() | |||
optimizer.step() | |||
print(loss.data[0]) |
@@ -0,0 +1,50 @@ | |||
'''
Tokenize yelp dataset's documents using stanford core nlp | |||
''' | |||
import json | |||
import os | |||
import pickle | |||
import nltk | |||
from nltk.tokenize import stanford | |||
input_filename = 'review.json' | |||
# config for stanford core nlp | |||
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe' | |||
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar' | |||
tokenizer = stanford.CoreNLPTokenizer() | |||
in_dirname = 'review' | |||
out_dirname = 'reviews' | |||
f = open(input_filename, encoding='utf-8') | |||
samples = [] | |||
j = 0 | |||
for i, line in enumerate(f.readlines()): | |||
review = json.loads(line) | |||
samples.append((review['stars'], review['text'])) | |||
if (i + 1) % 5000 == 0: | |||
print(i) | |||
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb')) | |||
j += 1 | |||
samples = [] | |||
pickle.dump(samples, open(in_dirname + '/samples%d.pkl' % j, 'wb')) | |||
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb')) | |||
# print(samples[0]) | |||
for fn in os.listdir(in_dirname): | |||
print(fn) | |||
precessed = [] | |||
for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')): | |||
tokens = [] | |||
sents = nltk.tokenize.sent_tokenize(text) | |||
for s in sents: | |||
tokens.append(tokenizer.tokenize(s)) | |||
precessed.append((stars, tokens)) | |||
# print(tokens) | |||
if len(precessed) % 100 == 0: | |||
print(len(precessed)) | |||
pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb')) |
@@ -0,0 +1,171 @@ | |||
import os | |||
import pickle | |||
import numpy as np | |||
import torch | |||
from model import * | |||
class SentIter: | |||
def __init__(self, dirname, count): | |||
self.dirname = dirname | |||
self.count = int(count) | |||
def __iter__(self): | |||
        for fname in os.listdir(self.dirname)[:self.count]:
            with open(os.path.join(self.dirname, fname), 'rb') as f:
for y, x in pickle.load(f): | |||
for sent in x: | |||
yield sent | |||
def train_word_vec(): | |||
# load data | |||
dirname = 'reviews' | |||
sents = SentIter(dirname, 238) | |||
# define models and train | |||
model = models.Word2Vec(size=200, sg=0, workers=4, min_count=5) | |||
model.build_vocab(sents) | |||
model.train(sents, total_examples=model.corpus_count, epochs=10) | |||
model.save('yelp.word2vec') | |||
print(model.wv.similarity('woman', 'man')) | |||
print(model.wv.similarity('nice', 'awful')) | |||
class Embedding_layer: | |||
def __init__(self, wv, vector_size): | |||
self.wv = wv | |||
self.vector_size = vector_size | |||
def get_vec(self, w): | |||
try: | |||
v = self.wv[w] | |||
except KeyError as e: | |||
v = np.random.randn(self.vector_size) | |||
return v | |||
from torch.utils.data import DataLoader, Dataset | |||
class YelpDocSet(Dataset): | |||
def __init__(self, dirname, start_file, num_files, embedding): | |||
self.dirname = dirname | |||
self.num_files = num_files | |||
self._files = os.listdir(dirname)[start_file:start_file + num_files] | |||
self.embedding = embedding | |||
self._cache = [(-1, None) for i in range(5)] | |||
def get_doc(self, n): | |||
file_id = n // 5000 | |||
idx = file_id % 5 | |||
if self._cache[idx][0] != file_id: | |||
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||
self._cache[idx] = (file_id, pickle.load(f)) | |||
y, x = self._cache[idx][1][n % 5000] | |||
sents = [] | |||
for s_list in x: | |||
sents.append(' '.join(s_list)) | |||
x = '\n'.join(sents) | |||
return x, y - 1 | |||
def __len__(self): | |||
return len(self._files) * 5000 | |||
def __getitem__(self, n): | |||
file_id = n // 5000 | |||
idx = file_id % 5 | |||
if self._cache[idx][0] != file_id: | |||
print('load {} to {}'.format(file_id, idx)) | |||
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||
self._cache[idx] = (file_id, pickle.load(f)) | |||
y, x = self._cache[idx][1][n % 5000] | |||
doc = [] | |||
for sent in x: | |||
if len(sent) == 0: | |||
continue | |||
sent_vec = [] | |||
for word in sent: | |||
vec = self.embedding.get_vec(word) | |||
sent_vec.append(vec.tolist()) | |||
sent_vec = torch.Tensor(sent_vec) | |||
doc.append(sent_vec) | |||
if len(doc) == 0: | |||
doc = [torch.zeros(1, 200)] | |||
return doc, y - 1 | |||
def collate(iterable): | |||
y_list = [] | |||
x_list = [] | |||
for x, y in iterable: | |||
y_list.append(y) | |||
x_list.append(x) | |||
return x_list, torch.LongTensor(y_list) | |||
def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False): | |||
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||
criterion = nn.NLLLoss() | |||
dataloader = DataLoader(dataset, | |||
batch_size=batch_size, | |||
collate_fn=collate, | |||
num_workers=0) | |||
running_loss = 0.0 | |||
if use_cuda: | |||
net.cuda() | |||
print('start training') | |||
for epoch in range(num_epoch): | |||
for i, batch_samples in enumerate(dataloader): | |||
x, y = batch_samples | |||
doc_list = [] | |||
for sample in x: | |||
doc = [] | |||
for sent_vec in sample: | |||
if use_cuda: | |||
sent_vec = sent_vec.cuda() | |||
doc.append(Variable(sent_vec)) | |||
doc_list.append(pack_sequence(doc)) | |||
if use_cuda: | |||
y = y.cuda() | |||
y = Variable(y) | |||
predict = net(doc_list) | |||
loss = criterion(predict, y) | |||
optimizer.zero_grad() | |||
loss.backward() | |||
optimizer.step() | |||
running_loss += loss.data[0] | |||
if i % print_size == print_size - 1: | |||
print('{}, {}'.format(i + 1, running_loss / print_size)) | |||
running_loss = 0.0 | |||
torch.save(net.state_dict(), 'models.dict') | |||
torch.save(net.state_dict(), 'models.dict') | |||
if __name__ == '__main__': | |||
''' | |||
Train process | |||
''' | |||
from gensim.models import Word2Vec | |||
from gensim import models | |||
train_word_vec() | |||
embed_model = Word2Vec.load('yelp.word2vec') | |||
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||
del embed_model | |||
start_file = 0 | |||
dataset = YelpDocSet('reviews', start_file, 120 - start_file, embedding) | |||
print('training data size {}'.format(len(dataset))) | |||
net = HAN(input_size=200, output_size=5, | |||
word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||
try: | |||
net.load_state_dict(torch.load('models.dict')) | |||
print("last time trained models has loaded") | |||
except Exception: | |||
print("cannot load models, train the inital models") | |||
train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True) |
@@ -0,0 +1,3 @@ | |||
numpy==1.14.2 | |||
torch==0.4.0 | |||
torchvision==0.1.8 |
@@ -0,0 +1,32 @@ | |||
from loader.base_loader import ToyLoader0 | |||
from model.char_language_model import CharLM | |||
from fastNLP.action import Tester | |||
from fastNLP.action.trainer import Trainer | |||
def test_charlm(): | |||
train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True, | |||
log_per_step=10, log_validation=True, batch_size=160) | |||
trainer = Trainer(train_config) | |||
model = CharLM(lstm_batch_size=16, lstm_seq_len=10) | |||
train_data = ToyLoader0("load_train", "./data_for_tests/charlm.txt").load() | |||
valid_data = ToyLoader0("load_valid", "./data_for_tests/charlm.txt").load() | |||
trainer.train(model, train_data, valid_data) | |||
trainer.save_model(model) | |||
test_config = Tester.TestConfig(save_output=True, validate_in_training=True, | |||
save_dev_input=True, save_loss=True, batch_size=160) | |||
tester = Tester(test_config) | |||
test_data = ToyLoader0("load_test", "./data_for_tests/charlm.txt").load() | |||
tester.test(model, test_data) | |||
if __name__ == "__main__": | |||
test_charlm() |
@@ -0,0 +1,10 @@ | |||
import unittest | |||
class MyTestCase(unittest.TestCase): | |||
def test_something(self): | |||
self.assertEqual(True, False) | |||
if __name__ == '__main__': | |||
unittest.main() |
@@ -0,0 +1,21 @@ | |||
from collections import namedtuple | |||
import numpy as np | |||
from model.base_model import ToyModel | |||
from fastNLP.action.trainer import Trainer | |||
def test_trainer(): | |||
Config = namedtuple("config", ["epochs", "validate", "save_when_better"]) | |||
train_config = Config(epochs=5, validate=True, save_when_better=True) | |||
trainer = Trainer(train_config) | |||
net = ToyModel() | |||
data = np.random.rand(20, 6) | |||
dev_data = np.random.rand(20, 6) | |||
trainer.train(net, data, dev_data) | |||
if __name__ == "__main__": | |||
test_trainer() |
@@ -0,0 +1,28 @@ | |||
from fastNLP.action.tester import Tester | |||
from fastNLP.action.trainer import WordSegTrainer | |||
from fastNLP.loader.base_loader import BaseLoader | |||
from fastNLP.models.word_seg_model import WordSeg | |||
def test_wordseg(): | |||
train_config = WordSegTrainer.TrainConfig(epochs=5, validate=False, save_when_better=False, | |||
log_per_step=10, log_validation=False, batch_size=254) | |||
trainer = WordSegTrainer(train_config) | |||
model = WordSeg(100, 2, 1000) | |||
train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines() | |||
trainer.train(model, train_data) | |||
test_config = Tester.TestConfig(save_output=False, validate_in_training=False, | |||
save_dev_input=False, save_loss=False, batch_size=254) | |||
tester = Tester(test_config) | |||
test_data = BaseLoader("load_test", "./data_for_tests/cws_test").load_lines() | |||
tester.test(model, test_data) | |||
if __name__ == "__main__": | |||
test_wordseg() |