@@ -1,7 +0,0 @@
-fastNLP.io.config\_io module
-============================
-
-.. automodule:: fastNLP.io.config_io
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.io.file\_reader module
-==============================
-
-.. automodule:: fastNLP.io.file_reader
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -12,9 +12,7 @@ Submodules
 .. toctree::
 
    fastNLP.io.base_loader
-   fastNLP.io.config_io
    fastNLP.io.dataset_loader
    fastNLP.io.embed_loader
-   fastNLP.io.file_reader
    fastNLP.io.model_io
@@ -9,7 +9,6 @@ fastNLP requires the following packages::
 
    torch>=0.4.0
    numpy
-   tensorboardX
    tqdm
    nltk
@@ -18,4 +17,4 @@ fastNLP requires the following packages::
 .. code:: shell
 
-   >>> pip install fitlog
    >>> pip install fastNLP
@@ -5,16 +5,13 @@
 2. The :doc:`DataSetLoader <fastNLP.io.dataset_loader>` classes used to load data
-3. Classes used to read and write config files, see :doc:`Config-IO <fastNLP.io.config_io>`
-4. Classes used to save and load models, see :doc:`Model-IO <fastNLP.io.model_io>`
+3. Classes used to save and load models, see :doc:`Model-IO <fastNLP.io.model_io>`
 
 How to use these classes can be found in the documentation of the corresponding modules.
 """
 from .embed_loader import EmbedLoader
 from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \
     PeopleDailyCorpusLoader, Conll2003Loader
-from .config_io import ConfigLoader, ConfigSection, ConfigSaver
 from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver
 
 __all__ = [
@@ -29,10 +26,6 @@ __all__ = [
     'PeopleDailyCorpusLoader',
     'Conll2003Loader',
 
-    'ConfigLoader',
-    'ConfigSection',
-    'ConfigSaver',
-
     'ModelLoader',
     'ModelSaver',
 ]
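For orientation, the loaders that remain exported here can be used straight from fastNLP.io; a minimal sketch, with a hypothetical tab-separated file path and column names (not part of the patch):

    from fastNLP.io import CSVLoader

    # hypothetical path and headers; replace with your own data
    loader = CSVLoader(sep='\t', headers=['words', 'label'])
    data_set = loader.load('data/train.tsv')
    print(len(data_set), data_set[0])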
@@ -5,7 +5,6 @@ TODO: a table with detailed descriptions, matching the home page
 """
 from .base_model import BaseModel
 from .biaffine_parser import BiaffineParser, GraphParser
-from .char_language_model import CharLM
 from .cnn_text_classification import CNNText
 from .sequence_modeling import SeqLabeling, AdvSeqLabel
 from .snli import ESIM
@@ -1,138 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ..modules.encoder.lstm import LSTM
-
-
-class Highway(nn.Module):
-    """Highway network"""
-
-    def __init__(self, input_size):
-        super(Highway, self).__init__()
-        self.fc1 = nn.Linear(input_size, input_size, bias=True)
-        self.fc2 = nn.Linear(input_size, input_size, bias=True)
-
-    def forward(self, x):
-        t = F.sigmoid(self.fc1(x))
-        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)
-
-
-class CharLM(nn.Module):
-    """CNN + highway network + LSTM
-
-    # Input::
-        4D tensor with shape [batch_size, in_channel, height, width]
-
-    # Output::
-        2D Tensor with shape [batch_size, vocab_size]
-
-    # Arguments::
-        char_emb_dim: the size of each character's attention
-        word_emb_dim: the size of each word's attention
-        vocab_size: num of unique words
-        num_char: num of characters
-        use_gpu: True or False
-    """
-
-    def __init__(self, char_emb_dim, word_emb_dim,
-                 vocab_size, num_char):
-        super(CharLM, self).__init__()
-        self.char_emb_dim = char_emb_dim
-        self.word_emb_dim = word_emb_dim
-        self.vocab_size = vocab_size
-
-        # char attention layer
-        self.char_embed = nn.Embedding(num_char, char_emb_dim)
-
-        # convolutions of filters with different sizes
-        self.convolutions = []
-
-        # list of tuples: (the number of filter, width)
-        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
-
-        for out_channel, filter_width in self.filter_num_width:
-            self.convolutions.append(
-                nn.Conv2d(
-                    1,  # in_channel
-                    out_channel,  # out_channel
-                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
-                    bias=True
-                )
-            )
-
-        self.highway_input_dim = sum([x for x, y in self.filter_num_width])
-
-        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)
-
-        # highway net
-        self.highway1 = Highway(self.highway_input_dim)
-        self.highway2 = Highway(self.highway_input_dim)
-
-        # LSTM
-        self.lstm_num_layers = 2
-        self.lstm = LSTM(self.highway_input_dim, hidden_size=self.word_emb_dim, num_layers=self.lstm_num_layers,
-                         dropout=0.5)
-
-        # output layer
-        self.dropout = nn.Dropout(p=0.5)
-        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)
-
-    def forward(self, x):
-        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
-        # Return: Variable of Tensor with shape [num_words, len(word_dict)]
-        lstm_batch_size = x.size()[0]
-        lstm_seq_len = x.size()[1]
-
-        x = x.contiguous().view(-1, x.size()[2])
-        # [num_seq*seq_len, max_word_len+2]
-
-        x = self.char_embed(x)
-        # [num_seq*seq_len, max_word_len+2, char_emb_dim]
-
-        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
-        # [num_seq*seq_len, 1, max_word_len+2, char_emb_dim]
-
-        x = self.conv_layers(x)
-        # [num_seq*seq_len, total_num_filters]
-
-        x = self.batch_norm(x)
-        # [num_seq*seq_len, total_num_filters]
-
-        x = self.highway1(x)
-        x = self.highway2(x)
-        # [num_seq*seq_len, total_num_filters]
-
-        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
-        # [num_seq, seq_len, total_num_filters]
-
-        x = self.lstm(x)
-        # [seq_len, num_seq, hidden_size]
-
-        x = self.dropout(x)
-        # [seq_len, num_seq, hidden_size]
-
-        x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
-        # [num_seq*seq_len, hidden_size]
-
-        x = self.linear(x)
-        # [num_seq*seq_len, vocab_size]
-        return x
-
-    def conv_layers(self, x):
-        chosen_list = list()
-        for conv in self.convolutions:
-            feature_map = F.tanh(conv(x))
-            # (batch_size, out_channel, 1, max_word_len-width+1)
-            chosen = torch.max(feature_map, 3)[0]
-            # (batch_size, out_channel, 1)
-            chosen = chosen.squeeze()
-            # (batch_size, out_channel)
-            chosen_list.append(chosen)
-
-        # (batch_size, total_num_filers)
-        return torch.cat(chosen_list, 1)
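The deleted CharLM combined character convolutions with a highway layer before the word-level LSTM. Purely as a reference for the highway step, a self-contained sketch of the same gating formula, t * relu(fc2(x)) + (1 - t) * x (sizes below are made up, not taken from the patch):

    import torch
    import torch.nn as nn

    class Highway(nn.Module):
        def __init__(self, input_size):
            super().__init__()
            self.fc1 = nn.Linear(input_size, input_size)   # gate
            self.fc2 = nn.Linear(input_size, input_size)   # transform

        def forward(self, x):
            t = torch.sigmoid(self.fc1(x))                 # gate values in (0, 1)
            return t * torch.relu(self.fc2(x)) + (1 - t) * x

    x = torch.randn(4, 100)          # hypothetical batch of 4 with 100 features
    print(Highway(100)(x).shape)     # torch.Size([4, 100]) -- shape is preserved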
@@ -12,19 +12,21 @@ my_inf = 10e12
 
 
 class ESIM(BaseModel):
-    """A PyTorch implementation of the ESIM model.
+    """
+    A PyTorch implementation of the ESIM model.
+
     ESIM paper: Enhanced LSTM for Natural Language Inference (arXiv: 1609.06038)
+
+    :param int vocab_size: vocabulary size
+    :param int embed_dim: dimension of the word embeddings
+    :param int hidden_size: hidden size of the LSTM
+    :param float dropout: dropout rate, 0 by default
+    :param int num_classes: number of target classes, 3 by default
+    :param numpy.array init_embedding: initial embedding matrix of shape (vocab_size, embed_dim); None by default, i.e. the embeddings are randomly initialized
     """
 
     def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None):
-        """
-        :param int vocab_size: vocabulary size
-        :param int embed_dim: dimension of the word embeddings
-        :param int hidden_size: hidden size of the LSTM
-        :param float dropout: dropout rate, 0 by default
-        :param int num_classes: number of target classes, 3 by default
-        :param numpy.array init_embedding: initial embedding matrix of shape (vocab_size, embed_dim); None by default, i.e. the embeddings are randomly initialized
-        """
         super(ESIM, self).__init__()
         self.vocab_size = vocab_size
         self.embed_dim = embed_dim
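Based on the constructor signature and the parameters documented above, a hedged construction sketch (all sizes and the embedding matrix below are made up for illustration):

    import numpy as np
    from fastNLP.models import ESIM   # exported via fastNLP/models/__init__.py above

    vocab_size, embed_dim = 10000, 100                        # hypothetical sizes
    init_embedding = np.random.rand(vocab_size, embed_dim)    # or None for random initialization

    model = ESIM(vocab_size=vocab_size, embed_dim=embed_dim, hidden_size=128,
                 dropout=0.3, num_classes=3, init_embedding=init_embedding)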
@@ -12,8 +12,8 @@ from . import decoder
 from . import encoder
 from .aggregator import *
 from .decoder import *
-from .other_modules import *
 from .dropout import TimestepDropout
 from .encoder import *
+from .utils import get_embeddings
 
 __version__ = '0.0.0'
@@ -1,11 +1,7 @@
-__all__ = ["MaxPool", "MaxPoolWithMask", "AvgPool", "MeanPoolWithMask", "KMaxPool", "Attention", "BiAttention",
-           "SelfAttention"]
+__all__ = ["MaxPool", "MaxPoolWithMask", "AvgPool", "MultiHeadAttention"]
 
 from .pooling import MaxPool
 from .pooling import MaxPoolWithMask
 from .pooling import AvgPool
 from .pooling import MeanPoolWithMask
-from .pooling import KMaxPool
 
-from .attention import Attention
-from .attention import BiAttention
-from .attention import SelfAttention
+from .attention import MultiHeadAttention
@@ -1,3 +1,4 @@
+__all__ =["MultiHeadAttention"]
 import math
 
 import torch
@@ -5,27 +6,14 @@ import torch.nn.functional as F
 from torch import nn
 
 from ..dropout import TimestepDropout
-from ..utils import mask_softmax
 from ..utils import initial_parameter
 
 
-class Attention(torch.nn.Module):
-    def __init__(self, normalize=False):
-        super(Attention, self).__init__()
-        self.normalize = normalize
-
-    def forward(self, query, memory, mask):
-        similarities = self._atten_forward(query, memory)
-        if self.normalize:
-            return mask_softmax(similarities, mask)
-        return similarities
-
-    def _atten_forward(self, query, memory):
-        raise NotImplementedError
-
-
 class DotAttention(nn.Module):
+    """
+    TODO
+    """
     def __init__(self, key_size, value_size, dropout=0.1):
         super(DotAttention, self).__init__()
         self.key_size = key_size
@@ -51,15 +39,15 @@ class DotAttention(nn.Module):
 
 
 class MultiHeadAttention(nn.Module):
-    def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
-        """
+    """
+    :param input_size: int, the size of the input dimension, which is also the output dimension.
+    :param key_size: int, the dimension of each head.
+    :param value_size: int, the dimension of the value in each head.
+    :param num_head: int, the number of heads.
+    :param dropout: float.
+    """
 
-        :param input_size: int, the size of the input dimension, which is also the output dimension.
-        :param key_size: int, the dimension of each head.
-        :param value_size: int, the dimension of the value in each head.
-        :param num_head: int, the number of heads.
-        :param dropout: float.
-        """
+    def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
         super(MultiHeadAttention, self).__init__()
         self.input_size = input_size
         self.key_size = key_size
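Given the documented parameters, a construction sketch (the sizes are invented; a 256-dimensional input split over 4 heads):

    from fastNLP.modules.aggregator import MultiHeadAttention  # export added in the aggregator __init__ change above

    attn = MultiHeadAttention(input_size=256, key_size=64, value_size=64,
                              num_head=4, dropout=0.1)
    # the input and output share the same dimension, input_size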
@@ -112,16 +100,16 @@ class MultiHeadAttention(nn.Module):
 
 
 class BiAttention(nn.Module):
-    """Bi Attention module
+    r"""Bi Attention module
 
     Calculate Bi Attention matrix `e`
 
     .. math::
 
-        \\begin{array}{ll} \\\\
-            e_ij = {a}^{\\mathbf{T}}_{i}{b}_{j} \\\\
+        \begin{array}{ll} \\
+            e_ij = {a}^{\mathbf{T}}_{i}{b}_{j} \\
             a_i =
             b_j =
-        \\end{array}
+        \end{array}
 
     """
@@ -171,8 +159,11 @@ class BiAttention(nn.Module):
 
         return out_x1, out_x2
 
 
 class SelfAttention(nn.Module):
-    """Self Attention Module.
+    """
+    Self Attention Module.
+
     :param int input_size: the hidden dimension of the input tensor
     :param int attention_unit: the hidden dimension of the output tensor
     :param int attention_hops:
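The matrix e in the BiAttention docstring above is a batched dot product between the two sequences' states; a standalone illustration with made-up shapes (not fastNLP code):

    import torch

    a = torch.randn(2, 5, 64)              # [batch, len_a, hidden]
    b = torch.randn(2, 7, 64)              # [batch, len_b, hidden]
    e = torch.bmm(a, b.transpose(1, 2))    # e[n, i, j] = a_i . b_j, shape [2, 5, 7]
    print(e.shape)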
@@ -1,21 +1,23 @@
+__all__ = ["MaxPool", "MaxPoolWithMask", "AvgPool"]
 import torch
 import torch.nn as nn
 
 
 class MaxPool(nn.Module):
-    """Max-pooling module."""
+    """
+    Max-pooling module.
+
+    :param stride: the stride of the pooling window; defaults to kernel_size
+    :param padding: the padding added to the input; defaults to 0
+    :param dilation: controls the spacing between elements inside the window
+    :param dimension: the dimensionality of the MaxPool; 1, 2 and 3 dimensions are supported.
+    :param kernel_size: the window size of the max pooling; defaults to the last k dimensions of the tensor, where k is dimension
+    :param return_indices:
+    :param ceil_mode:
+    """
 
     def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None,
                  return_indices=False, ceil_mode=False):
-        """
-        :param stride: the stride of the pooling window; defaults to kernel_size
-        :param padding: the padding added to the input; defaults to 0
-        :param dilation: controls the spacing between elements inside the window
-        :param dimension: the dimensionality of the MaxPool; 1, 2 and 3 dimensions are supported.
-        :param kernel_size: the window size of the max pooling; defaults to the last k dimensions of the tensor, where k is dimension
-        :param return_indices:
-        :param ceil_mode:
-        """
         super(MaxPool, self).__init__()
         assert (1 <= dimension) and (dimension <= 3)
         self.dimension = dimension
@@ -110,6 +112,7 @@ class AvgPool(nn.Module):
 
 
 class MeanPoolWithMask(nn.Module):
     def __init__(self):
         super(MeanPoolWithMask, self).__init__()
         self.inf = 10e12
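A small usage sketch for the pooling module documented above (the input shape is an illustrative assumption, not taken from the patch):

    import torch
    from fastNLP.modules.aggregator import MaxPool   # export shown in the aggregator __init__ above

    x = torch.randn(4, 10, 32)        # hypothetical [batch, seq_len, hidden]
    pool = MaxPool(dimension=1)       # 1-D pooling; kernel_size defaults to the last dimension
    y = pool(x)                       # max over the pooled window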
@@ -1,3 +1,4 @@
-__all__ = ["MLP", "ConditionalRandomField"]
+__all__ = ["MLP", "ConditionalRandomField","viterbi_decode"]
 from .CRF import ConditionalRandomField
 from .MLP import MLP
+from .utils import viterbi_decode
@@ -1,4 +1,4 @@
+__all__ = ["viterbi_decode"]
 import torch
@@ -1,5 +1,5 @@
 import torch
 
+__all__ = []
 
 class TimestepDropout(torch.nn.Dropout):
     """This module accepts a ``[batch_size, num_timesteps, embedding_dim)]`` and use a single
@@ -1,11 +1,9 @@
 from .conv_maxpool import ConvMaxpool
 from .embedding import Embedding
-from .linear import Linear
 from .lstm import LSTM
 from .bert import BertModel
 
 __all__ = ["LSTM",
            "Embedding",
-           "Linear",
            "ConvMaxpool",
            "BertModel"]
@@ -6,16 +6,15 @@ from ..utils import initial_parameter
 # from torch.nn.init import xavier_uniform
 
 
 class ConvolutionCharEncoder(nn.Module):
-    """A char-level convolutional encoder."""
+    """
+    A char-level convolutional encoder.
+
+    :param int char_emb_size: the dimension of the char-level embedding. Default: 50
+        e.g. with 26 characters, each embedded into a 50-dimensional vector, the input vector dimension is 50.
+    :param tuple feature_maps: a tuple of ints. Its length is the number of char-level convolutions; the `i`-th int is the number of filters of the `i`-th convolution.
+    :param tuple kernels: a tuple of ints. Its length is the number of char-level convolutions; the `i`-th int is the kernel size of the `i`-th convolution.
+    :param initial_method: the parameter initialization method; defaults to `xavier normal`
+    """
 
     def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None):
-        """
-        :param int char_emb_size: the dimension of the char-level embedding. Default: 50
-            e.g. with 26 characters, each embedded into a 50-dimensional vector, the input vector dimension is 50.
-        :param tuple feature_maps: a tuple of ints. Its length is the number of char-level convolutions; the `i`-th int is the number of filters of the `i`-th convolution.
-        :param tuple kernels: a tuple of ints. Its length is the number of char-level convolutions; the `i`-th int is the kernel size of the `i`-th convolution.
-        :param initial_method: the parameter initialization method; defaults to `xavier normal`
-        """
         super(ConvolutionCharEncoder, self).__init__()
         self.convs = nn.ModuleList([
             nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
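A hedged construction sketch using the defaults documented above (the import path is assumed from the repository layout and may differ):

    from fastNLP.modules.encoder.char_encoder import ConvolutionCharEncoder  # assumed module path

    # 50-dim char embeddings, three convolutions with 40/30/30 filters and kernel sizes 3/4/5
    encoder = ConvolutionCharEncoder(char_emb_size=50,
                                     feature_maps=(40, 30, 30),
                                     kernels=(3, 4, 5))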
@@ -1,21 +0,0 @@
-import torch.nn as nn
-
-from ..utils import initial_parameter
-
-
-class Linear(nn.Module):
-    """
-
-    :param int input_size: input size
-    :param int output_size: output size
-    :param bool bias:
-    :param str initial_method:
-    """
-
-    def __init__(self, input_size, output_size, bias=True, initial_method=None):
-        super(Linear, self).__init__()
-        self.linear = nn.Linear(input_size, output_size, bias)
-        initial_parameter(self, initial_method)
-
-    def forward(self, x):
-        x = self.linear(x)
-        return x
@@ -19,15 +19,13 @@ class LSTM(nn.Module):
     :param batch_first: if ``True``, the input and output ``Tensor`` shapes are
         :(batch, seq, feature). Default: ``False``
     :param bias: if ``False``, the model will not use a bias. Default: ``True``
-    :param get_hidden: whether to also return the hidden state `h`. Default: ``False``
     """
 
     def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
-                 bidirectional=False, bias=True, initial_method=None, get_hidden=False):
+                 bidirectional=False, bias=True, initial_method=None):
         super(LSTM, self).__init__()
         self.batch_first = batch_first
         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
                             dropout=dropout, bidirectional=bidirectional)
-        self.get_hidden = get_hidden
         initial_parameter(self, initial_method)
 
     def forward(self, x, seq_len=None, h0=None, c0=None):
@@ -39,7 +37,6 @@ class LSTM(nn.Module):
         :param c0: [batch, hidden_size] initial cell state; if ``None``, an all-one vector is used. Default: ``None``
         :return (output, ht) or output: if ``get_hidden=True``, the output sequence [batch, seq_len, hidden_size*num_direction]
             and the hidden state at the last time step [batch, hidden_size*num_direction].
-            If ``get_hidden=False``, only the output sequence is returned.
         """
         if h0 is not None and c0 is not None:
             hx = (h0, c0)
@@ -61,16 +58,4 @@ class LSTM(nn.Module):
             output = output[:, unsort_idx]
         else:
             output, hx = self.lstm(x, hx)
-        if self.get_hidden:
-            return output, hx
-        return output
-
-
-if __name__ == "__main__":
-    lstm = LSTM(input_size=2, hidden_size=2, get_hidden=False)
-    x = torch.randn((3, 5, 2))
-    seq_lens = torch.tensor([5,1,2])
-    y = lstm(x, seq_lens)
-    print(x)
-    print(y)
-    print(x.size(), y.size(), )
+        return output, hx
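After this change the encoder always returns both the output sequence and the final hidden state; a usage sketch mirroring the deleted __main__ block (sizes are arbitrary):

    import torch
    from fastNLP.modules.encoder.lstm import LSTM

    lstm = LSTM(input_size=2, hidden_size=2)
    x = torch.randn(3, 5, 2)              # [batch, seq_len, input_size]
    seq_len = torch.tensor([5, 1, 2])
    output, (h, c) = lstm(x, seq_len)     # forward now always returns (output, hx)
    print(output.size())                  # [batch, seq_len, hidden_size]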
@@ -1,186 +0,0 @@
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.data
-from torch.nn import Parameter
-
-
-class GroupNorm(nn.Module):
-    def __init__(self, num_features, num_groups=20, eps=1e-5):
-        super(GroupNorm, self).__init__()
-        self.weight = nn.Parameter(torch.ones(1, num_features, 1))
-        self.bias = nn.Parameter(torch.zeros(1, num_features, 1))
-        self.num_groups = num_groups
-        self.eps = eps
-
-    def forward(self, x):
-        N, C, H = x.size()
-        G = self.num_groups
-        assert C % G == 0
-
-        x = x.view(N, G, -1)
-        mean = x.mean(-1, keepdim=True)
-        var = x.var(-1, keepdim=True)
-        x = (x - mean) / (var + self.eps).sqrt()
-
-        x = x.view(N, C, H)
-        return x * self.weight + self.bias
-
-
-class LayerNormalization(nn.Module):
-    """
-
-    :param int layer_size:
-    :param float eps: default=1e-3
-    """
-
-    def __init__(self, layer_size, eps=1e-3):
-        super(LayerNormalization, self).__init__()
-        self.eps = eps
-        self.a_2 = nn.Parameter(torch.ones(1, layer_size, requires_grad=True))
-        self.b_2 = nn.Parameter(torch.zeros(1, layer_size, requires_grad=True))
-
-    def forward(self, z):
-        if z.size(1) == 1:
-            return z
-        mu = torch.mean(z, keepdim=True, dim=-1)
-        sigma = torch.std(z, keepdim=True, dim=-1)
-        ln_out = (z - mu) / (sigma + self.eps)
-        ln_out = ln_out * self.a_2 + self.b_2
-        return ln_out
-
-
-class BiLinear(nn.Module):
-    def __init__(self, n_left, n_right, n_out, bias=True):
-        """
-
-        :param int n_left: size of left input
-        :param int n_right: size of right input
-        :param int n_out: size of output
-        :param bool bias: If set to False, the layer will not learn an additive bias. Default: True
-        """
-        super(BiLinear, self).__init__()
-        self.n_left = n_left
-        self.n_right = n_right
-        self.n_out = n_out
-
-        self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right))
-        self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left))
-        self.W_r = Parameter(torch.Tensor(self.n_out, self.n_left))
-
-        if bias:
-            self.bias = Parameter(torch.Tensor(n_out))
-        else:
-            self.register_parameter('bias', None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        nn.init.xavier_uniform_(self.W_l)
-        nn.init.xavier_uniform_(self.W_r)
-        nn.init.constant_(self.bias, 0.)
-        nn.init.xavier_uniform_(self.U)
-
-    def forward(self, input_left, input_right):
-        """
-
-        :param Tensor input_left: the left input tensor with shape = [batch1, batch2, ..., left_features]
-        :param Tensor input_right: the right input tensor with shape = [batch1, batch2, ..., right_features]
-        """
-        left_size = input_left.size()
-        right_size = input_right.size()
-        assert left_size[:-1] == right_size[:-1], \
-            "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
-        batch = int(np.prod(left_size[:-1]))
-
-        # convert left and right input to matrices [batch, left_features], [batch, right_features]
-        input_left = input_left.view(batch, self.n_left)
-        input_right = input_right.view(batch, self.n_right)
-
-        # output [batch, out_features]
-        output = F.bilinear(input_left, input_right, self.U, self.bias)
-        output = output + \
-                 F.linear(input_left, self.W_l, None) + \
-                 F.linear(input_right, self.W_r, None)
-        # convert back to [batch1, batch2, ..., out_features]
-        return output.view(left_size[:-1] + (self.n_out,))
-
-    def __repr__(self):
-        return self.__class__.__name__ + ' (' \
-               + 'in1_features=' + str(self.n_left) \
-               + ', in2_features=' + str(self.n_right) \
-               + ', out_features=' + str(self.n_out) + ')'
-
-
-class BiAffine(nn.Module):
-    def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs):
-        """
-
-        :param int n_enc: the dimension of the encoder input.
-        :param int n_dec: the dimension of the decoder input.
-        :param int n_labels: the number of labels of the crf layer
-        :param bool biaffine: if apply bi-affine parameter.
-        """
-        super(BiAffine, self).__init__()
-        self.n_enc = n_enc
-        self.n_dec = n_dec
-        self.num_labels = n_labels
-        self.biaffine = biaffine
-
-        self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec))
-        self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc))
-        self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
-        if self.biaffine:
-            self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc))
-        else:
-            self.register_parameter('U', None)
-
-        self.reset_parameters()
-
-    def reset_parameters(self):
-        nn.init.xavier_uniform_(self.W_d)
-        nn.init.xavier_uniform_(self.W_e)
-        nn.init.constant_(self.b, 0.)
-        if self.biaffine:
-            nn.init.xavier_uniform_(self.U)
-
-    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
-        """
-
-        :param Tensor input_d: the decoder input tensor with shape = [batch, length_decoder, input_size]
-        :param Tensor input_e: the child input tensor with shape = [batch, length_encoder, input_size]
-        :param mask_d: Tensor or None, the mask tensor for decoder with shape = [batch, length_decoder]
-        :param mask_e: Tensor or None, the mask tensor for encoder with shape = [batch, length_encoder]
-        :returns: Tensor, the energy tensor with shape = [batch, num_label, length, length]
-        """
-        assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.'
-        batch, length_decoder, _ = input_d.size()
-        _, length_encoder, _ = input_e.size()
-
-        # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
-        # the output shape is [batch, num_label, length_decoder]
-        out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
-        # compute decoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
-        # the output shape is [batch, num_label, length_encoder]
-        out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)
-
-        # output shape [batch, num_label, length_decoder, length_encoder]
-        if self.biaffine:
-            # compute bi-affine part
-            # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
-            # output shape [batch, num_label, length_decoder, input_size_encoder]
-            output = torch.matmul(input_d.unsqueeze(1), self.U)
-            # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
-            # output shape [batch, num_label, length_decoder, length_encoder]
-            output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))
-            output = output + out_d + out_e + self.b
-        else:
-            output = out_d + out_d + self.b
-
-        if mask_d is not None:
-            output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2)
-        return output
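For reference, the bi-affine energy assembled in the deleted BiAffine.forward is, per label l, score[b, l, i, j] = d_i^T U_l e_j plus two linear terms and a bias; a standalone sketch of the bilinear part with made-up sizes (not fastNLP code):

    import torch

    batch, len_d, len_e, dim, n_labels = 2, 4, 5, 8, 3
    d = torch.randn(batch, len_d, dim)     # decoder states
    e = torch.randn(batch, len_e, dim)     # encoder states
    U = torch.randn(n_labels, dim, dim)

    # score[b, l, i, j] = d[b, i] @ U[l] @ e[b, j]
    score = torch.einsum('bid,ldk,bjk->blij', d, U, e)
    print(score.shape)                     # torch.Size([2, 3, 4, 5])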
@@ -4,14 +4,6 @@ import torch.nn as nn
 import torch.nn.init as init
 
 
-def mask_softmax(matrix, mask):
-    if mask is None:
-        result = torch.nn.functional.softmax(matrix, dim=-1)
-    else:
-        raise NotImplementedError
-    return result
-
-
 def initial_parameter(net, initial_method=None):
     """A method used to initialize the weights of PyTorch models.
 
@@ -77,7 +69,8 @@ def initial_parameter(net, initial_method=None):
 
 
 def seq_mask(seq_len, max_len):
-    """Create sequence mask.
+    """
+    Create sequence mask.
 
     :param seq_len: list or torch.Tensor, the lengths of sequences in a batch.
     :param max_len: int, the maximum sequence length in a batch.
@@ -92,7 +85,8 @@ def seq_mask(seq_len, max_len):
 
 
 def get_embeddings(init_embed):
-    """Get the word embeddings
+    """
+    Get the word embeddings TODO
 
     :param init_embed: the word embeddings; can be a tuple of (num_embeddings, embedding_dim), i.e.
         the size of the embedding table and the dimension of each word vector; an nn.Embedding object can also be passed in,
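Per the parameter description above, init_embed may be a (num_embeddings, embedding_dim) tuple or an existing nn.Embedding; a small sketch (sizes are made up):

    import torch.nn as nn
    from fastNLP.modules import get_embeddings   # re-exported in the modules __init__ change above

    embed_a = get_embeddings((1000, 50))               # build from sizes, randomly initialized
    embed_b = get_embeddings(nn.Embedding(1000, 50))   # or pass an nn.Embedding through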
@@ -1,5 +1,4 @@
 numpy
 torch>=0.4.0
-tensorboardX
 tqdm
 nltk
@@ -1,112 +1,112 @@
 import os
 import unittest
 
-from fastNLP.io.config_io import ConfigSection, ConfigLoader, ConfigSaver
+from fastNLP.io import ConfigSection, ConfigLoader, ConfigSaver
 
 
 class TestConfigSaver(unittest.TestCase):
     def test_case_1(self):
-        config_file_dir = "test/io"
+        config_file_dir = "."
         config_file_name = "config"
         config_file_path = os.path.join(config_file_dir, config_file_name)
 
         tmp_config_file_path = os.path.join(config_file_dir, "tmp_config")
 
         with open(config_file_path, "r") as f:
             lines = f.readlines()
 
         standard_section = ConfigSection()
         t_section = ConfigSection()
         ConfigLoader().load_config(config_file_path, {"test": standard_section, "t": t_section})
 
         config_saver = ConfigSaver(config_file_path)
 
         section = ConfigSection()
         section["doubles"] = 0.8
         section["tt"] = 0.5
         section["test"] = 105
         section["str"] = "this is a str"
 
         test_case_2_section = section
         test_case_2_section["double"] = 0.5
 
         for k in section.__dict__.keys():
             standard_section[k] = section[k]
 
         config_saver.save_config_file("test", section)
         config_saver.save_config_file("another-test", section)
         config_saver.save_config_file("one-another-test", section)
         config_saver.save_config_file("test-case-2", section)
 
         test_section = ConfigSection()
         at_section = ConfigSection()
         another_test_section = ConfigSection()
         one_another_test_section = ConfigSection()
         a_test_case_2_section = ConfigSection()
 
         ConfigLoader().load_config(config_file_path, {"test": test_section,
                                                       "another-test": another_test_section,
                                                       "t": at_section,
                                                       "one-another-test": one_another_test_section,
                                                       "test-case-2": a_test_case_2_section})
 
         assert test_section == standard_section
         assert at_section == t_section
         assert another_test_section == section
         assert one_another_test_section == section
         assert a_test_case_2_section == test_case_2_section
 
         config_saver.save_config_file("test", section)
 
         with open(config_file_path, "w") as f:
             f.writelines(lines)
 
         with open(tmp_config_file_path, "w") as f:
             f.write('[test]\n')
             f.write('this is an fault example\n')
 
         tmp_config_saver = ConfigSaver(tmp_config_file_path)
         try:
             tmp_config_saver._read_section()
         except Exception as e:
             pass
 
         os.remove(tmp_config_file_path)
 
         try:
             tmp_config_saver = ConfigSaver("file-NOT-exist")
         except Exception as e:
             pass
 
     def test_case_2(self):
         config = "[section_A]\n[section_B]\n"
 
         with open("./test.cfg", "w", encoding="utf-8") as f:
             f.write(config)
         saver = ConfigSaver("./test.cfg")
 
         section = ConfigSection()
         section["doubles"] = 0.8
         section["tt"] = [1, 2, 3]
         section["test"] = 105
         section["str"] = "this is a str"
 
         saver.save_config_file("section_A", section)
 
         os.system("rm ./test.cfg")
 
     def test_case_3(self):
         config = "[section_A]\ndoubles = 0.9\ntt = [1, 2, 3]\n[section_B]\n"
 
         with open("./test.cfg", "w", encoding="utf-8") as f:
             f.write(config)
         saver = ConfigSaver("./test.cfg")
 
         section = ConfigSection()
         section["doubles"] = 0.8
         section["tt"] = [1, 2, 3]
         section["test"] = 105
         section["str"] = "this is a str"
 
         saver.save_config_file("section_A", section)
 
         os.system("rm ./test.cfg")
@@ -1,31 +1,30 @@
 import unittest
 
-from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, \
-    CSVLoader, SNLILoader, JsonLoader
+from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, SNLILoader, JsonLoader
 
 
-class TestDatasetLoader(unittest.TestCase):
+class TestDatasetLoader(unittest.TestCase):
     def test_Conll2003Loader(self):
         """
             Test the the loader of Conll2003 dataset
         """
-        dataset_path = "test/data_for_tests/conll_2003_example.txt"
+        dataset_path = "../data_for_tests/conll_2003_example.txt"
         loader = Conll2003Loader()
         dataset_2003 = loader.load(dataset_path)
 
     def test_PeopleDailyCorpusLoader(self):
-        data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt")
+        data_set = PeopleDailyCorpusLoader().load("../data_for_tests/people_daily_raw.txt")
 
     def test_CSVLoader(self):
-        ds = CSVLoader(sep='\t', headers=['words', 'label'])\
-            .load('test/data_for_tests/tutorial_sample_dataset.csv')
+        ds = CSVLoader(sep='\t', headers=['words', 'label']) \
+            .load('../data_for_tests/tutorial_sample_dataset.csv')
         assert len(ds) > 0
 
     def test_SNLILoader(self):
-        ds = SNLILoader().load('test/data_for_tests/sample_snli.jsonl')
+        ds = SNLILoader().load('../data_for_tests/sample_snli.jsonl')
         assert len(ds) == 3
 
     def test_JsonLoader(self):
-        ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl')
+        ds = JsonLoader().load('../data_for_tests/sample_snli.jsonl')
         assert len(ds) == 3
@@ -1,15 +1,15 @@
 import unittest
 
 import numpy as np
 
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP import Vocabulary
+from fastNLP.io import EmbedLoader
 
 
 class TestEmbedLoader(unittest.TestCase):
     def test_load_with_vocab(self):
         vocab = Vocabulary()
-        glove = "test/data_for_tests/glove.6B.50d_test.txt"
-        word2vec = "test/data_for_tests/word2vec_test.txt"
+        glove = "../data_for_tests/glove.6B.50d_test.txt"
+        word2vec = "../data_for_tests/word2vec_test.txt"
         vocab.add_word('the')
         vocab.add_word('none')
         g_m = EmbedLoader.load_with_vocab(glove, vocab)
@@ -20,8 +20,8 @@ class TestEmbedLoader(unittest.TestCase):
     def test_load_without_vocab(self):
         words = ['the', 'of', 'in', 'a', 'to', 'and']
-        glove = "test/data_for_tests/glove.6B.50d_test.txt"
-        word2vec = "test/data_for_tests/word2vec_test.txt"
+        glove = "../data_for_tests/glove.6B.50d_test.txt"
+        word2vec = "../data_for_tests/word2vec_test.txt"
         g_m, vocab = EmbedLoader.load_without_vocab(glove)
         self.assertEqual(g_m.shape, (8, 50))
         for word in words: