@@ -52,3 +52,14 @@ early_stopping = 70 | |||
reg = 1e-05 | |||
test = 5 | |||
new_attr = 40 | |||
[POStest] | |||
epochs = 20 | |||
batch_size = 1
num_classes = 10
vocab_size = vocab_size
pickle_path = pickle_path
validate = true
@@ -0,0 +1,87 @@ | |||
import torch | |||
import torch.nn.functional as F | |||
from torch import nn | |||
class ConvCharEmbedding(nn.Module): | |||
def __init__(self, char_emb_size, feature_maps=(40, 30, 30), kernels=(3, 4, 5)): | |||
""" | |||
Character Level Word Embedding | |||
:param char_emb_size: the size of character level embedding, | |||
say 26 characters, each embedded to 50 dim vector, then the input_size is 50. | |||
:param feature_maps: table of feature maps (for each kernel width) | |||
:param kernels: table of kernel widths | |||
""" | |||
super(ConvCharEmbedding, self).__init__() | |||
self.convs = nn.ModuleList([ | |||
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) | |||
for i in range(len(kernels))]) | |||
def forward(self, x): | |||
""" | |||
:param x: [batch_size * sent_length, word_length, char_emb_size] | |||
:return: [batch_size * sent_length, sum(feature_maps), 1] | |||
""" | |||
x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) # [batch_size*sent_length, channel, width, height] | |||
x = x.transpose(2, 3) # [batch_size*sent_length, channel, height, width] | |||
return self.convolute(x).unsqueeze(2) | |||
def convolute(self, x): | |||
feats = [] | |||
for conv in self.convs: | |||
y = conv(x) # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1] | |||
y = torch.squeeze(y, 2) # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1] | |||
            y = torch.tanh(y)
y, __ = torch.max(y, 2) # [batch_size*sent_length, feature_maps[i]] | |||
feats.append(y) | |||
return torch.cat(feats, 1) # [batch_size*sent_length, sum(feature_maps)] | |||
class LSTMCharEmbedding(nn.Module): | |||
""" | |||
Character Level Word Embedding with LSTM | |||
:param char_emb_size: the size of character level embedding, | |||
say 26 characters, each embedded to 50 dim vector, then the input_size is 50. | |||
""" | |||
def __init__(self, char_emb_size, hidden_size=None): | |||
super(LSTMCharEmbedding, self).__init__() | |||
self.hidden_size = char_emb_size if hidden_size is None else hidden_size | |||
self.lstm = nn.LSTM(input_size=char_emb_size, | |||
hidden_size=self.hidden_size, | |||
num_layers=1, | |||
bias=True, | |||
batch_first=True) | |||
def forward(self, x): | |||
""" | |||
:param x:[ n_batch*n_word, word_length, char_emb_size] | |||
        :return: [ n_batch*n_word, hidden_size, 1]
""" | |||
batch_size = x.shape[0] | |||
h0 = torch.empty(1, batch_size, self.hidden_size) | |||
h0 = nn.init.orthogonal_(h0) | |||
c0 = torch.empty(1, batch_size, self.hidden_size) | |||
c0 = nn.init.orthogonal_(c0) | |||
_, hidden = self.lstm(x, (h0, c0)) | |||
return hidden[0].squeeze().unsqueeze(2) | |||
if __name__ == "__main__": | |||
batch_size = 128 | |||
char_emb = 100 | |||
word_length = 1 | |||
    x = torch.randn(batch_size, char_emb, word_length)
x = x.transpose(1, 2) | |||
cce = ConvCharEmbedding(char_emb) | |||
y = cce(x) | |||
print("CNN Char Emb input: ", x.shape) | |||
print("CNN Char Emb output: ", y.shape) # [128, 100] | |||
lce = LSTMCharEmbedding(char_emb) | |||
o = lce(x) | |||
print("LSTM Char Emb input: ", x.shape) | |||
print("LSTM Char Emb size: ", o.shape) |
@@ -0,0 +1,422 @@ | |||
__author__ = 'max' | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
def MaskedRecurrent(reverse=False): | |||
def forward(input, hidden, cell, mask, train=True, dropout=0): | |||
""" | |||
:param input: | |||
:param hidden: | |||
:param cell: | |||
:param mask: | |||
        :param dropout: dropout applied between steps; masked positions are dropped as well,
            which should be fine since they carry no gradient anyway
        :param train: controls the dropout behaviour; set from StackedRNN's forward
:return: | |||
""" | |||
output = [] | |||
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) | |||
for i in steps: | |||
            if mask is None or mask[i].data.min() > 0.5:  # no mask, or every position in this step is valid
hidden = cell(input[i], hidden) | |||
            elif mask[i].data.max() > 0.5:  # partially masked step: some positions are padding
                hidden_next = cell(input[i], hidden)  # the cell still consumes the whole batch at once
# hack to handle LSTM | |||
if isinstance(hidden, tuple): # LSTM outputs a tuple of (hidden, cell), this is a common hack 😁 | |||
mask = mask.float() | |||
hx, cx = hidden | |||
hp1, cp1 = hidden_next | |||
                    hidden = (
                        hx + (hp1 - hx) * mask[i],
                        cx + (cp1 - cx) * mask[i])  # masked positions keep their previous (h, c) state
                else:
                    hidden = hidden + (hidden_next - hidden) * mask[i]
# if dropout != 0 and train: # warning, should i treat masked tensor differently? | |||
# if isinstance(hidden, tuple): | |||
# hidden = (F.dropout(hidden[0], p=dropout, training=train), | |||
# F.dropout(hidden[1], p=dropout, training=train)) | |||
# else: | |||
# hidden = F.dropout(hidden, p=dropout, training=train) | |||
# hack to handle LSTM | |||
output.append(hidden[0] if isinstance(hidden, tuple) else hidden) | |||
if reverse: | |||
output.reverse() | |||
output = torch.cat(output, 0).view(input.size(0), *output[0].size()) | |||
return hidden, output | |||
return forward | |||
def StackedRNN(inners, num_layers, lstm=False, train=True, step_dropout=0, layer_dropout=0): | |||
    num_directions = len(inners)  # one recurrence function per direction
total_layers = num_layers * num_directions | |||
def forward(input, hidden, cells, mask): | |||
assert (len(cells) == total_layers) | |||
next_hidden = [] | |||
if lstm: | |||
hidden = list(zip(*hidden)) | |||
for i in range(num_layers): | |||
all_output = [] | |||
for j, inner in enumerate(inners): | |||
l = i * num_directions + j | |||
                hy, output = inner(input, hidden[l], cells[l], mask, train=train, dropout=step_dropout)
next_hidden.append(hy) | |||
all_output.append(output) | |||
            input = torch.cat(all_output, input.dim() - 1)  # input to the next layer
if layer_dropout != 0 and i < num_layers - 1: | |||
input = F.dropout(input, p=layer_dropout, training=train, inplace=False) | |||
if lstm: | |||
next_h, next_c = zip(*next_hidden) | |||
next_hidden = ( | |||
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), | |||
torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) | |||
) | |||
else: | |||
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) | |||
return next_hidden, input | |||
return forward | |||
def AutogradMaskedRNN(num_layers=1, batch_first=False, train=True, layer_dropout=0, step_dropout=0, | |||
bidirectional=False, lstm=False): | |||
rec_factory = MaskedRecurrent | |||
if bidirectional: | |||
layer = (rec_factory(), rec_factory(reverse=True)) | |||
else: | |||
        layer = (rec_factory(),)  # rec_factory builds the per-layer recurrence: MaskedRecurrent runs one layer and StackedRNN stacks them
func = StackedRNN(layer, | |||
num_layers, | |||
lstm=lstm, | |||
layer_dropout=layer_dropout, step_dropout=step_dropout, | |||
train=train) | |||
def forward(input, cells, hidden, mask): | |||
if batch_first: | |||
input = input.transpose(0, 1) | |||
if mask is not None: | |||
mask = mask.transpose(0, 1) | |||
nexth, output = func(input, hidden, cells, mask) | |||
if batch_first: | |||
output = output.transpose(0, 1) | |||
return output, nexth | |||
return forward | |||
def MaskedStep(): | |||
def forward(input, hidden, cell, mask): | |||
if mask is None or mask.data.min() > 0.5: | |||
hidden = cell(input, hidden) | |||
elif mask.data.max() > 0.5: | |||
hidden_next = cell(input, hidden) | |||
# hack to handle LSTM | |||
if isinstance(hidden, tuple): | |||
hx, cx = hidden | |||
hp1, cp1 = hidden_next | |||
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) | |||
else: | |||
hidden = hidden + (hidden_next - hidden) * mask | |||
# hack to handle LSTM | |||
output = hidden[0] if isinstance(hidden, tuple) else hidden | |||
return hidden, output | |||
return forward | |||
def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True): | |||
def forward(input, hidden, cells, mask): | |||
assert (len(cells) == num_layers) | |||
next_hidden = [] | |||
if lstm: | |||
hidden = list(zip(*hidden)) | |||
for l in range(num_layers): | |||
hy, output = layer(input, hidden[l], cells[l], mask) | |||
next_hidden.append(hy) | |||
input = output | |||
if dropout != 0 and l < num_layers - 1: | |||
input = F.dropout(input, p=dropout, training=train, inplace=False) | |||
if lstm: | |||
next_h, next_c = zip(*next_hidden) | |||
next_hidden = ( | |||
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), | |||
torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) | |||
) | |||
else: | |||
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) | |||
return next_hidden, input | |||
return forward | |||
def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False): | |||
layer = MaskedStep() | |||
func = StackedStep(layer, | |||
num_layers, | |||
lstm=lstm, | |||
dropout=dropout, | |||
train=train) | |||
def forward(input, cells, hidden, mask): | |||
nexth, output = func(input, hidden, cells, mask) | |||
return output, nexth | |||
return forward | |||
class MaskedRNNBase(nn.Module): | |||
def __init__(self, Cell, input_size, hidden_size, | |||
num_layers=1, bias=True, batch_first=False, | |||
layer_dropout=0, step_dropout=0, bidirectional=False, **kwargs): | |||
""" | |||
:param Cell: | |||
:param input_size: | |||
:param hidden_size: | |||
:param num_layers: | |||
:param bias: | |||
:param batch_first: | |||
:param layer_dropout: | |||
:param step_dropout: | |||
:param bidirectional: | |||
:param kwargs: | |||
""" | |||
super(MaskedRNNBase, self).__init__() | |||
self.Cell = Cell | |||
self.input_size = input_size | |||
self.hidden_size = hidden_size | |||
self.num_layers = num_layers | |||
self.bias = bias | |||
self.batch_first = batch_first | |||
self.layer_dropout = layer_dropout | |||
self.step_dropout = step_dropout | |||
self.bidirectional = bidirectional | |||
num_directions = 2 if bidirectional else 1 | |||
self.all_cells = [] | |||
        for layer in range(num_layers):  # build one cell per layer and direction
for direction in range(num_directions): | |||
layer_input_size = input_size if layer == 0 else hidden_size * num_directions | |||
cell = self.Cell(layer_input_size, hidden_size, self.bias, **kwargs) | |||
self.all_cells.append(cell) | |||
                self.add_module('cell%d' % (layer * num_directions + direction), cell)  # register the cell so its parameters are tracked
def reset_parameters(self): | |||
for cell in self.all_cells: | |||
cell.reset_parameters() | |||
def forward(self, input, mask=None, hx=None): | |||
batch_size = input.size(0) if self.batch_first else input.size(1) | |||
lstm = self.Cell is nn.LSTMCell | |||
if hx is None: | |||
num_directions = 2 if self.bidirectional else 1 | |||
hx = torch.autograd.Variable( | |||
input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_()) | |||
if lstm: | |||
hx = (hx, hx) | |||
func = AutogradMaskedRNN(num_layers=self.num_layers, | |||
batch_first=self.batch_first, | |||
step_dropout=self.step_dropout, | |||
layer_dropout=self.layer_dropout, | |||
train=self.training, | |||
bidirectional=self.bidirectional, | |||
                                 lstm=lstm)  # all_cells are passed to func below, which keeps delegating to the lower-level wrappers
output, hidden = func(input, self.all_cells, hx, | |||
                              None if mask is None else mask.view(mask.size() + (1,)))  # the extra (1,) appends a singleton dim so the mask broadcasts over hidden_size
return output, hidden | |||
def step(self, input, hx=None, mask=None): | |||
''' | |||
execute one step forward (only for one-directional RNN). | |||
Args: | |||
input (batch, input_size): input tensor of this step. | |||
hx (num_layers, batch, hidden_size): the hidden state of last step. | |||
mask (batch): the mask tensor of this step. | |||
Returns: | |||
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. | |||
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step | |||
''' | |||
        assert not self.bidirectional, "step() cannot be applied to a bidirectional RNN."
batch_size = input.size(0) | |||
lstm = self.Cell is nn.LSTMCell | |||
if hx is None: | |||
hx = torch.autograd.Variable(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_()) | |||
if lstm: | |||
hx = (hx, hx) | |||
func = AutogradMaskedStep(num_layers=self.num_layers, | |||
                                  dropout=self.step_dropout,
train=self.training, | |||
lstm=lstm) | |||
output, hidden = func(input, self.all_cells, hx, mask) | |||
return output, hidden | |||
class MaskedRNN(MaskedRNNBase): | |||
r"""Applies a multi-layer Elman RNN with costomized non-linearity to an | |||
input sequence. | |||
For each element in the input sequence, each layer computes the following | |||
function: | |||
.. math:: | |||
h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) | |||
where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is | |||
the hidden state of the previous layer at time `t` or :math:`input_t` | |||
for the first layer. If nonlinearity='relu', then `ReLU` is used instead | |||
of `tanh`. | |||
Args: | |||
input_size: The number of expected features in the input x | |||
hidden_size: The number of features in the hidden state h | |||
num_layers: Number of recurrent layers. | |||
nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' | |||
bias: If False, then the layer does not use bias weights b_ih and b_hh. | |||
Default: True | |||
batch_first: If True, then the input and output tensors are provided | |||
as (batch, seq, feature) | |||
dropout: If non-zero, introduces a dropout layer on the outputs of each | |||
RNN layer except the last layer | |||
bidirectional: If True, becomes a bidirectional RNN. Default: False | |||
Inputs: input, mask, h_0 | |||
- **input** (seq_len, batch, input_size): tensor containing the features | |||
of the input sequence. | |||
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the initial hidden state for each element in the batch. | |||
Outputs: output, h_n | |||
- **output** (seq_len, batch, hidden_size * num_directions): tensor | |||
containing the output features (h_k) from the last layer of the RNN, | |||
for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has | |||
been given as the input, the output will also be a packed sequence. | |||
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the hidden state for k=seq_len. | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
super(MaskedRNN, self).__init__(nn.RNNCell, *args, **kwargs) | |||
class MaskedLSTM(MaskedRNNBase): | |||
r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input | |||
sequence. | |||
For each element in the input sequence, each layer computes the following | |||
function: | |||
.. math:: | |||
\begin{array}{ll} | |||
i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ | |||
f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ | |||
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\ | |||
o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ | |||
c_t = f_t * c_{(t-1)} + i_t * g_t \\ | |||
h_t = o_t * \tanh(c_t) | |||
\end{array} | |||
where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell | |||
state at time `t`, :math:`x_t` is the hidden state of the previous layer at | |||
time `t` or :math:`input_t` for the first layer, and :math:`i_t`, | |||
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, | |||
and out gates, respectively. | |||
Args: | |||
input_size: The number of expected features in the input x | |||
hidden_size: The number of features in the hidden state h | |||
num_layers: Number of recurrent layers. | |||
bias: If False, then the layer does not use bias weights b_ih and b_hh. | |||
Default: True | |||
batch_first: If True, then the input and output tensors are provided | |||
as (batch, seq, feature) | |||
dropout: If non-zero, introduces a dropout layer on the outputs of each | |||
RNN layer except the last layer | |||
bidirectional: If True, becomes a bidirectional RNN. Default: False | |||
Inputs: input, mask, (h_0, c_0) | |||
- **input** (seq_len, batch, input_size): tensor containing the features | |||
of the input sequence. | |||
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers \* num_directions, batch, hidden_size): tensor | |||
containing the initial hidden state for each element in the batch. | |||
- **c_0** (num_layers \* num_directions, batch, hidden_size): tensor | |||
containing the initial cell state for each element in the batch. | |||
Outputs: output, (h_n, c_n) | |||
- **output** (seq_len, batch, hidden_size * num_directions): tensor | |||
containing the output features `(h_t)` from the last layer of the RNN, | |||
for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been | |||
given as the input, the output will also be a packed sequence. | |||
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the hidden state for t=seq_len | |||
- **c_n** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the cell state for t=seq_len | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
super(MaskedLSTM, self).__init__(nn.LSTMCell, *args, **kwargs) | |||
class MaskedGRU(MaskedRNNBase): | |||
r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. | |||
For each element in the input sequence, each layer computes the following | |||
function: | |||
.. math:: | |||
\begin{array}{ll} | |||
r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ | |||
z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ | |||
n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ | |||
h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ | |||
\end{array} | |||
where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden | |||
state of the previous layer at time `t` or :math:`input_t` for the first | |||
layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input, | |||
and new gates, respectively. | |||
Args: | |||
input_size: The number of expected features in the input x | |||
hidden_size: The number of features in the hidden state h | |||
num_layers: Number of recurrent layers. | |||
bias: If False, then the layer does not use bias weights b_ih and b_hh. | |||
Default: True | |||
batch_first: If True, then the input and output tensors are provided | |||
as (batch, seq, feature) | |||
dropout: If non-zero, introduces a dropout layer on the outputs of each | |||
RNN layer except the last layer | |||
bidirectional: If True, becomes a bidirectional RNN. Default: False | |||
Inputs: input, mask, h_0 | |||
- **input** (seq_len, batch, input_size): tensor containing the features | |||
of the input sequence. | |||
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the initial hidden state for each element in the batch. | |||
Outputs: output, h_n | |||
- **output** (seq_len, batch, hidden_size * num_directions): tensor | |||
containing the output features (h_k) from the last layer of the RNN, | |||
for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has | |||
been given as the input, the output will also be a packed sequence. | |||
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor | |||
containing the hidden state for k=seq_len. | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
super(MaskedGRU, self).__init__(nn.GRUCell, *args, **kwargs) |
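if __name__ == "__main__":
    # Minimal sanity-check sketch: a 2-layer masked Elman RNN over a padded batch.
    # Assumes a float mask of shape (seq_len, batch) with 1 for real tokens and 0 for padding;
    # the sizes below are illustrative only.
    seq_len, batch, input_size, hidden_size = 5, 2, 8, 16
    x = torch.randn(seq_len, batch, input_size)
    mask = torch.ones(seq_len, batch)
    mask[3:, 1] = 0  # the second sequence has length 3
    rnn = MaskedRNN(input_size, hidden_size, num_layers=2)
    output, hn = rnn(x, mask=mask)
    print("MaskedRNN output:", output.shape)  # (seq_len, batch, hidden_size)
    print("MaskedRNN hidden:", hn.shape)      # (num_layers, batch, hidden_size)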
@@ -0,0 +1,490 @@ | |||
import numpy as np
import torch
import torch.nn as nn | |||
import torch.nn.functional as F | |||
from torch.nn import Parameter | |||
from .emd import WlossLayer | |||
from ..utils import orthogonal | |||
class GroupNorm(nn.Module): | |||
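    """Group normalization over a (N, C, L) input: the C channels are split into
    `num_groups` groups, each group is normalized to zero mean and unit variance,
    and a learned per-channel scale and shift is applied."""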
def __init__(self, num_features, num_groups=20, eps=1e-5): | |||
super(GroupNorm, self).__init__() | |||
self.weight = nn.Parameter(torch.ones(1, num_features, 1)) | |||
self.bias = nn.Parameter(torch.zeros(1, num_features, 1)) | |||
self.num_groups = num_groups | |||
self.eps = eps | |||
def forward(self, x): | |||
N, C, H = x.size() | |||
G = self.num_groups | |||
assert C % G == 0 | |||
x = x.view(N, G, -1) | |||
mean = x.mean(-1, keepdim=True) | |||
var = x.var(-1, keepdim=True) | |||
x = (x - mean) / (var + self.eps).sqrt() | |||
x = x.view(N, C, H) | |||
return x * self.weight + self.bias | |||
class LayerNormalization(nn.Module): | |||
""" Layer normalization module """ | |||
def __init__(self, d_hid, eps=1e-3): | |||
super(LayerNormalization, self).__init__() | |||
self.eps = eps | |||
self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) | |||
self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) | |||
def forward(self, z): | |||
if z.size(1) == 1: | |||
return z | |||
mu = torch.mean(z, keepdim=True, dim=-1) | |||
sigma = torch.std(z, keepdim=True, dim=-1) | |||
ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) | |||
ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) | |||
return ln_out | |||
class OrthEmbedding(nn.Embedding): | |||
def __init__(self, *args, **kwargs): | |||
super(OrthEmbedding, self).__init__(*args, **kwargs) | |||
def reset_parameters(self): | |||
self.weight = orthogonal(self.weight) | |||
class BiLinear(nn.Module): | |||
def __init__(self, n_left, n_right, n_out, bias=True): | |||
""" | |||
Args: | |||
n_left: size of left input | |||
n_right: size of right input | |||
n_out: size of output | |||
bias: If set to False, the layer will not learn an additive bias. | |||
Default: True | |||
""" | |||
super(BiLinear, self).__init__() | |||
self.n_left = n_left | |||
self.n_right = n_right | |||
self.n_out = n_out | |||
self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right)) | |||
self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left)) | |||
        self.W_r = Parameter(torch.Tensor(self.n_out, self.n_right))
if bias: | |||
self.bias = Parameter(torch.Tensor(n_out)) | |||
else: | |||
self.register_parameter('bias', None) | |||
self.reset_parameters() | |||
def reset_parameters(self): | |||
nn.init.xavier_uniform_(self.W_l) | |||
nn.init.xavier_uniform_(self.W_r) | |||
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.)
nn.init.xavier_uniform_(self.U) | |||
def forward(self, input_left, input_right): | |||
""" | |||
Args: | |||
input_left: Tensor | |||
the left input tensor with shape = [batch1, batch2, ..., left_features] | |||
input_right: Tensor | |||
the right input tensor with shape = [batch1, batch2, ..., right_features] | |||
        Returns: Tensor
            output tensor with shape = [batch1, batch2, ..., out_features]
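        Example (shape check)::
            >>> layer = BiLinear(5, 4, 3)
            >>> layer(torch.randn(2, 7, 5), torch.randn(2, 7, 4)).size()
            torch.Size([2, 7, 3])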
""" | |||
left_size = input_left.size() | |||
right_size = input_right.size() | |||
assert left_size[:-1] == right_size[:-1], \ | |||
"batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1]) | |||
batch = int(np.prod(left_size[:-1])) | |||
# convert left and right input to matrices [batch, left_features], [batch, right_features] | |||
input_left = input_left.view(batch, self.n_left) | |||
input_right = input_right.view(batch, self.n_right) | |||
# output [batch, out_features] | |||
output = F.bilinear(input_left, input_right, self.U, self.bias) | |||
output = output + \ | |||
F.linear(input_left, self.W_l, None) + \ | |||
F.linear(input_right, self.W_r, None) | |||
# convert back to [batch1, batch2, ..., out_features] | |||
return output.view(left_size[:-1] + (self.n_out,)) | |||
def __repr__(self): | |||
return self.__class__.__name__ + ' (' \ | |||
+ 'in1_features=' + str(self.n_left) \ | |||
+ ', in2_features=' + str(self.n_right) \ | |||
+ ', out_features=' + str(self.n_out) + ')' | |||
class BiAffine(nn.Module): | |||
def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs): | |||
""" | |||
Args: | |||
n_enc: int | |||
the dimension of the encoder input. | |||
n_dec: int | |||
the dimension of the decoder input. | |||
n_labels: int | |||
the number of labels of the crf layer | |||
biaffine: bool | |||
if apply bi-affine parameter. | |||
**kwargs: | |||
""" | |||
super(BiAffine, self).__init__() | |||
self.n_enc = n_enc | |||
self.n_dec = n_dec | |||
self.num_labels = n_labels | |||
self.biaffine = biaffine | |||
self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec)) | |||
self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc)) | |||
self.b = Parameter(torch.Tensor(self.num_labels, 1, 1)) | |||
if self.biaffine: | |||
self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc)) | |||
else: | |||
self.register_parameter('U', None) | |||
self.reset_parameters() | |||
def reset_parameters(self): | |||
nn.init.xavier_uniform_(self.W_d) | |||
nn.init.xavier_uniform_(self.W_e) | |||
nn.init.constant_(self.b, 0.) | |||
if self.biaffine: | |||
nn.init.xavier_uniform_(self.U) | |||
def forward(self, input_d, input_e, mask_d=None, mask_e=None): | |||
""" | |||
Args: | |||
input_d: Tensor | |||
the decoder input tensor with shape = [batch, length_decoder, input_size] | |||
input_e: Tensor | |||
the child input tensor with shape = [batch, length_encoder, input_size] | |||
mask_d: Tensor or None | |||
the mask tensor for decoder with shape = [batch, length_decoder] | |||
mask_e: Tensor or None | |||
the mask tensor for encoder with shape = [batch, length_encoder] | |||
Returns: Tensor | |||
the energy tensor with shape = [batch, num_label, length, length] | |||
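        Example (shape check)::
            >>> layer = BiAffine(8, 8, 5)
            >>> layer(torch.randn(2, 3, 8), torch.randn(2, 4, 8)).size()
            torch.Size([2, 5, 3, 4])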
""" | |||
        assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
batch, length_decoder, _ = input_d.size() | |||
_, length_encoder, _ = input_e.size() | |||
# compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder] | |||
# the output shape is [batch, num_label, length_decoder] | |||
out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3) | |||
        # compute encoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
# the output shape is [batch, num_label, length_encoder] | |||
out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2) | |||
# output shape [batch, num_label, length_decoder, length_encoder] | |||
if self.biaffine: | |||
# compute bi-affine part | |||
# [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder] | |||
# output shape [batch, num_label, length_decoder, input_size_encoder] | |||
output = torch.matmul(input_d.unsqueeze(1), self.U) | |||
# [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder] | |||
# output shape [batch, num_label, length_decoder, length_encoder] | |||
output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3)) | |||
output = output + out_d + out_e + self.b | |||
else: | |||
            output = out_d + out_e + self.b
if mask_d is not None: | |||
output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2) | |||
return output | |||
class Transpose(nn.Module): | |||
def __init__(self, x, y): | |||
super(Transpose, self).__init__() | |||
self.x = x | |||
self.y = y | |||
def forward(self, x): | |||
return x.transpose(self.x, self.y) | |||
class WordDropout(nn.Module): | |||
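    """Word-level dropout: during training each word index is replaced by `drop_to_token`
    (typically the UNK index) with probability `dropout_rate`; at evaluation time the
    input is returned unchanged."""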
def __init__(self, dropout_rate, drop_to_token): | |||
super(WordDropout, self).__init__() | |||
self.dropout_rate = dropout_rate | |||
self.drop_to_token = drop_to_token | |||
def forward(self, word_idx): | |||
if not self.training: | |||
return word_idx | |||
drop_mask = torch.rand(word_idx.shape) < self.dropout_rate | |||
if word_idx.device.type == 'cuda': | |||
drop_mask = drop_mask.cuda() | |||
drop_mask = drop_mask.long() | |||
output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx | |||
return output | |||
import torch | |||
import torch.utils.data | |||
import numpy as np
from torch.autograd import Function, Variable | |||
from torch import optim | |||
class WlossLayer(torch.nn.Module): | |||
def __init__(self, lam=100, sinkhorn_iter=50): | |||
super(WlossLayer, self).__init__() | |||
# cost = matrix M = distance matrix | |||
# lam = lambda of type float > 0 | |||
# sinkhorn_iter > 0 | |||
# diagonal cost should be 0 | |||
self.lam = lam | |||
self.sinkhorn_iter = sinkhorn_iter | |||
# self.register_buffer("K", torch.exp(-self.cost / self.lam).double()) | |||
# self.register_buffer("KM", (self.cost * self.K).double()) | |||
def forward(self, pred, target, cost): | |||
return WassersteinLossStab.apply(pred, target, | |||
cost, self.lam, self.sinkhorn_iter) | |||
class WassersteinLossStab(Function): | |||
@staticmethod | |||
def forward(ctx, pred, target, cost, | |||
lam=1e-3, sinkhorn_iter=4): | |||
"""pred: Batch * K: K = # mass points | |||
target: Batch * L: L = # mass points""" | |||
# import pdb | |||
# pdb.set_trace() | |||
eps = 1e-8 | |||
# pred = pred.gather(dim=1, index=) | |||
na = pred.size(1) | |||
nb = target.size(1) | |||
cost = cost.double() | |||
pred = pred.double() | |||
target = target.double() | |||
cost = cost[:na, :nb].double() | |||
K = torch.exp(-cost / lam).double() | |||
KM = (cost * K).double() | |||
batch_size = pred.size(0) | |||
# pdb.set_trace() | |||
log_a, log_b = torch.log(pred + eps), torch.log(target + eps) | |||
        log_u = cost.new(batch_size, na).fill_(-np.log(na))
        log_v = cost.new(batch_size, nb).fill_(-np.log(nb))
# import pdb | |||
# pdb.set_trace() | |||
for i in range(int(sinkhorn_iter)): | |||
log_u_max = torch.max(log_u, dim=1)[0] | |||
u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps) | |||
log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1) | |||
log_v_max = torch.max(log_v, dim=1)[0] | |||
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) | |||
tmp = log_u | |||
log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1) | |||
# print(log_u.sum()) | |||
if torch.norm(tmp - log_u) / torch.norm(log_u) < eps: | |||
break | |||
log_v_max = torch.max(log_v, dim=1)[0] | |||
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) | |||
logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1) | |||
wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss... | |||
grad_input = log_u * lam | |||
# print("log_u", log_u) | |||
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) | |||
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) | |||
grad_input = grad_input / batch_size | |||
ctx.save_for_backward(grad_input) | |||
# print("grad type", type(grad_input)) | |||
return pred.new((wnorm,)), grad_input | |||
@staticmethod | |||
def backward(ctx, grad_output, _): | |||
        grad_input = ctx.saved_tensors
# print(grad) | |||
res = grad_output.clone() | |||
res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data) | |||
res = res.mul_(grad_output[0]).float() | |||
# print("in backward func:\n\n", res) | |||
return res, None, None, None, None, None, None | |||
class Sinkhorn(Function):
    @staticmethod
    def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop):
a = a.double() | |||
b = b.double() | |||
M = M.double() | |||
nbb = b.size(1) | |||
# init data | |||
na = len(a) | |||
nb = len(b) | |||
cpt = 0 | |||
# we assume that no distances are null except those of the diagonal of | |||
# distances | |||
if warmstart is None: | |||
alpha, beta = np.zeros(na), np.zeros(nb) | |||
else: | |||
alpha, beta = warmstart | |||
if nbb: | |||
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb | |||
else: | |||
u, v = np.ones(na) / na, np.ones(nb) / nb | |||
def get_K(alpha, beta): | |||
"""log space computation""" | |||
return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg) | |||
def get_Gamma(alpha, beta, u, v): | |||
"""log space gamma computation""" | |||
return np.exp( | |||
-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log( | |||
v.reshape((1, nb)))) | |||
# print(np.min(K)) | |||
K = get_K(alpha, beta) | |||
transp = K | |||
        cpt = 0
        err = 1
        # defaults for names used below (POT's sinkhorn_stabilized passes these as arguments)
        stopThr, print_period, verbose, log, loop = stop, 20, False, False, True
        while loop:
uprev = u | |||
vprev = v | |||
# sinkhorn update | |||
v = b / (np.dot(K.T, u) + 1e-16) | |||
u = a / (np.dot(K, v) + 1e-16) | |||
# remove numerical problems and store them in K | |||
if np.abs(u).max() > tau or np.abs(v).max() > tau: | |||
if nbb: | |||
alpha, beta = alpha + reg * \ | |||
np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) | |||
else: | |||
alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) | |||
if nbb: | |||
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb | |||
else: | |||
u, v = np.ones(na) / na, np.ones(nb) / nb | |||
K = get_K(alpha, beta) | |||
if cpt % print_period == 0: | |||
# we can speed up the process by checking for the error only all | |||
# the 10th iterations | |||
if nbb: | |||
err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \ | |||
np.sum((v - vprev) ** 2) / np.sum((v) ** 2) | |||
else: | |||
transp = get_Gamma(alpha, beta, u, v) | |||
err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2 | |||
if log: | |||
log['err'].append(err) | |||
if verbose: | |||
if cpt % (print_period * 20) == 0: | |||
print( | |||
'{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) | |||
print('{:5d}|{:8e}|'.format(cpt, err)) | |||
if err <= stopThr: | |||
loop = False | |||
if cpt >= numItermax: | |||
loop = False | |||
if np.any(np.isnan(u)) or np.any(np.isnan(v)): | |||
# we have reached the machine precision | |||
# come back to previous solution and quit loop | |||
print('Warning: numerical errors at iteration', cpt) | |||
u = uprev | |||
v = vprev | |||
break | |||
cpt = cpt + 1 | |||
# print('err=',err,' cpt=',cpt) | |||
if log: | |||
log['logu'] = alpha / reg + np.log(u) | |||
log['logv'] = beta / reg + np.log(v) | |||
log['alpha'] = alpha + reg * np.log(u) | |||
log['beta'] = beta + reg * np.log(v) | |||
log['warmstart'] = (log['alpha'], log['beta']) | |||
if nbb: | |||
res = np.zeros((nbb)) | |||
for i in range(nbb): | |||
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) | |||
return res, log | |||
else: | |||
return get_Gamma(alpha, beta, u, v), log | |||
else: | |||
if nbb: | |||
res = np.zeros((nbb)) | |||
for i in range(nbb): | |||
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) | |||
return res | |||
else: | |||
return get_Gamma(alpha, beta, u, v) | |||
if __name__ == "__main__": | |||
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda() | |||
    mylayer = WlossLayer()  # .cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda() | |||
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda() | |||
    res, _ = mylayer(inp, ground_true, cost)
# print(inp.requires_grad, res.requires_grad) | |||
# print(res, inp) | |||
mylayer.zero_grad() | |||
res.backward() | |||
print("inp's gradient is good:") | |||
print(inp.grad) | |||
print("convert to gpu:\n", inp.cuda().grad) | |||
print("==============================================" | |||
"\n However, this does not work on pytorch when GPU is enabled") | |||
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda() | |||
    mylayer = WlossLayer().cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda() | |||
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda() | |||
opt = optim.SGD([ | |||
{'params': mylayer.parameters()}, | |||
], lr=1e-2, momentum=0.9) | |||
    res, _ = mylayer(inp, ground_true, cost)
# print(inp.requires_grad, res.requires_grad) | |||
# print(res, inp) | |||
mylayer.zero_grad() | |||
res.backward() | |||
print("input's gradient is None!!!!!!!!!!!!!!!!") | |||
print(inp.grad) |
@@ -1,6 +1,3 @@ | |||
import torch | |||
def mask_softmax(matrix, mask): | |||
if mask is None: | |||
result = torch.nn.functional.softmax(matrix, dim=-1) | |||
@@ -13,3 +10,231 @@ def seq_mask(seq_len, max_len): | |||
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] | |||
mask = torch.stack(mask, 1) | |||
return mask | |||
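# Example: seq_mask([3, 1], max_len=4) ->
# tensor([[1, 1, 1, 0],
#         [1, 0, 0, 0]])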
""" | |||
Code adapted from FudanParser
""" | |||
from collections import defaultdict | |||
import numpy as np | |||
import torch | |||
def expand_gt(gt): | |||
"""expand_gt: Expand ground truth to matrix | |||
Arguments: | |||
gt: tensor of (n, l) | |||
Return: | |||
        f: ground truth matrix of (n, l, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$.
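    Example::
        >>> expand_gt(torch.LongTensor([[1, 0]]))[0]
        tensor([[0, 1],
                [1, 0]])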
""" | |||
n, l = gt.shape | |||
ret = torch.zeros(n, l, l).long() | |||
for i in range(n): | |||
ret[i][torch.arange(l).long(), gt[i]] = 1 | |||
return ret | |||
def greedy_decoding(arc_f): | |||
"""greedy_decoding | |||
Arguments: | |||
arc_f: a tensor in shape of (n, l+1, l+1) | |||
length of the sentence is l and index 0 is <root> | |||
Output: | |||
arc_pred: a tensor in shape of (n, l), indicating the head words | |||
""" | |||
f_arc = arc_f[:, 1:, :] # ignore the root | |||
_, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False) | |||
return arc_pred | |||
def mst_decoding(arc_f): | |||
batch_size = arc_f.shape[0] | |||
length = arc_f.shape[1] | |||
arc_score = arc_f.data.cpu() | |||
pred_collection = [] | |||
for i in range(batch_size): | |||
head = mst(arc_score[i].numpy()) | |||
pred_collection.append(head[1:].reshape((1, length - 1))) | |||
arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long() | |||
return arc_pred | |||
def outer_product(features): | |||
"""InterProduct: Get inter sequence product of features | |||
Arguments: | |||
features: feature vectors of sequence in the shape of (n, l, h) | |||
Return: | |||
f: product result in (n, l, l, h) shape | |||
""" | |||
n, l, c = features.shape | |||
features = features.contiguous() | |||
x = features.view(n, l, 1, c) | |||
x = x.expand(n, l, l, c) | |||
y = features.view(n, 1, l, c).contiguous() | |||
y = y.expand(n, l, l, c) | |||
return x * y | |||
def outer_concat(features): | |||
"""InterProduct: Get inter sequence concatenation of features | |||
Arguments: | |||
features: feature vectors of sequence in the shape of (n, l, h) | |||
Return: | |||
f: product result in (n, l, l, h) shape | |||
""" | |||
n, l, c = features.shape | |||
x = features.contiguous().view(n, l, 1, c) | |||
x = x.expand(n, l, l, c) | |||
y = features.view(n, 1, l, c) | |||
y = y.expand(n, l, l, c) | |||
return torch.cat((x, y), dim=3) | |||
def mst(scores): | |||
""" | |||
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA | |||
""" | |||
length = scores.shape[0] | |||
min_score = scores.min() - 1 | |||
eye = np.eye(length) | |||
scores = scores * (1 - eye) + min_score * eye | |||
heads = np.argmax(scores, axis=1) | |||
heads[0] = 0 | |||
tokens = np.arange(1, length) | |||
roots = np.where(heads[tokens] == 0)[0] + 1 | |||
if len(roots) < 1: | |||
root_scores = scores[tokens, 0] | |||
head_scores = scores[tokens, heads[tokens]] | |||
new_root = tokens[np.argmax(root_scores / head_scores)] | |||
heads[new_root] = 0 | |||
elif len(roots) > 1: | |||
root_scores = scores[roots, 0] | |||
scores[roots, 0] = 0 | |||
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 | |||
new_root = roots[np.argmin( | |||
scores[roots, new_heads] / root_scores)] | |||
heads[roots] = new_heads | |||
heads[new_root] = 0 | |||
edges = defaultdict(set) | |||
vertices = set((0,)) | |||
for dep, head in enumerate(heads[tokens]): | |||
vertices.add(dep + 1) | |||
edges[head].add(dep + 1) | |||
for cycle in _find_cycle(vertices, edges): | |||
dependents = set() | |||
to_visit = set(cycle) | |||
while len(to_visit) > 0: | |||
node = to_visit.pop() | |||
if node not in dependents: | |||
dependents.add(node) | |||
to_visit.update(edges[node]) | |||
cycle = np.array(list(cycle)) | |||
old_heads = heads[cycle] | |||
old_scores = scores[cycle, old_heads] | |||
non_heads = np.array(list(dependents)) | |||
scores[np.repeat(cycle, len(non_heads)), | |||
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score | |||
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 | |||
new_scores = scores[cycle, new_heads] / old_scores | |||
change = np.argmax(new_scores) | |||
changed_cycle = cycle[change] | |||
old_head = old_heads[change] | |||
new_head = new_heads[change] | |||
heads[changed_cycle] = new_head | |||
edges[new_head].add(changed_cycle) | |||
edges[old_head].remove(changed_cycle) | |||
return heads | |||
def _find_cycle(vertices, edges): | |||
""" | |||
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA | |||
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA | |||
""" | |||
_index = 0 | |||
_stack = [] | |||
_indices = {} | |||
_lowlinks = {} | |||
_onstack = defaultdict(lambda: False) | |||
_SCCs = [] | |||
def _strongconnect(v): | |||
nonlocal _index | |||
_indices[v] = _index | |||
_lowlinks[v] = _index | |||
_index += 1 | |||
_stack.append(v) | |||
_onstack[v] = True | |||
for w in edges[v]: | |||
if w not in _indices: | |||
_strongconnect(w) | |||
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) | |||
elif _onstack[w]: | |||
_lowlinks[v] = min(_lowlinks[v], _indices[w]) | |||
if _lowlinks[v] == _indices[v]: | |||
SCC = set() | |||
while True: | |||
w = _stack.pop() | |||
_onstack[w] = False | |||
SCC.add(w) | |||
                if w == v:
break | |||
_SCCs.append(SCC) | |||
for v in vertices: | |||
if v not in _indices: | |||
_strongconnect(v) | |||
return [SCC for SCC in _SCCs if len(SCC) > 1] | |||
# https://github.com/alykhantejani/nninit/blob/master/nninit.py | |||
def orthogonal(tensor, gain=1): | |||
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions, | |||
and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with | |||
rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements | |||
will be drawn from a normal distribution with mean=0 and std=`std`. | |||
Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al. | |||
Args: | |||
tensor: a n-dimension torch.Tensor, where n >= 2 | |||
gain: optional gain to be applied | |||
Examples: | |||
>>> w = torch.Tensor(3, 5) | |||
        >>> orthogonal(w)
""" | |||
if tensor.ndimension() < 2: | |||
raise ValueError("Only tensors with 2 or more dimensions are supported.") | |||
    flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:])))
    flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1)
    u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False)
    with torch.no_grad():
        if u.shape == flattened.detach().numpy().shape:
            tensor.view_as(flattened).copy_(torch.from_numpy(u))
        else:
            tensor.view_as(flattened).copy_(torch.from_numpy(v))
        tensor.mul_(gain)
    return tensor
def generate_step_dropout(masks, hidden_dim, step_dropout, training=False): | |||
# assume batch first | |||
# import pdb | |||
# pdb.set_trace() | |||
batch, length = masks.size() | |||
if not training: | |||
        return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).to(masks.device) * masks.view(batch, length, 1)
masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout) | |||
masked = torch.bernoulli(masked).repeat(1, length, 1) | |||
    masked = masked.to(masks.device) * masks.view(batch, length, 1)
return masked |
@@ -0,0 +1,384 @@ | |||
import math | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.functional as F | |||
try:
    from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
except ImportError:  # this private module was removed in newer PyTorch; fall back to the explicit LSTM math below
    fusedBackend = None
from torch.nn.parameter import Parameter | |||
def default_initializer(hidden_size): | |||
stdv = 1.0 / math.sqrt(hidden_size) | |||
def forward(tensor): | |||
nn.init.uniform_(tensor, -stdv, stdv) | |||
return forward | |||
def VarMaskedRecurrent(reverse=False): | |||
def forward(input, hidden, cell, mask): | |||
output = [] | |||
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) | |||
for i in steps: | |||
if mask is None or mask[i].data.min() > 0.5: | |||
hidden = cell(input[i], hidden) | |||
elif mask[i].data.max() > 0.5: | |||
hidden_next = cell(input[i], hidden) | |||
# hack to handle LSTM | |||
if isinstance(hidden, tuple): | |||
hx, cx = hidden | |||
hp1, cp1 = hidden_next | |||
hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) | |||
else: | |||
hidden = hidden + (hidden_next - hidden) * mask[i] | |||
# hack to handle LSTM | |||
output.append(hidden[0] if isinstance(hidden, tuple) else hidden) | |||
if reverse: | |||
output.reverse() | |||
output = torch.cat(output, 0).view(input.size(0), *output[0].size()) | |||
return hidden, output | |||
return forward | |||
def StackedRNN(inners, num_layers, lstm=False): | |||
num_directions = len(inners) | |||
total_layers = num_layers * num_directions | |||
def forward(input, hidden, cells, mask): | |||
assert (len(cells) == total_layers) | |||
next_hidden = [] | |||
if lstm: | |||
hidden = list(zip(*hidden)) | |||
for i in range(num_layers): | |||
all_output = [] | |||
for j, inner in enumerate(inners): | |||
l = i * num_directions + j | |||
hy, output = inner(input, hidden[l], cells[l], mask) | |||
next_hidden.append(hy) | |||
all_output.append(output) | |||
input = torch.cat(all_output, input.dim() - 1) | |||
if lstm: | |||
next_h, next_c = zip(*next_hidden) | |||
next_hidden = ( | |||
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), | |||
torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) | |||
) | |||
else: | |||
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) | |||
return next_hidden, input | |||
return forward | |||
def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False): | |||
rec_factory = VarMaskedRecurrent | |||
if bidirectional: | |||
layer = (rec_factory(), rec_factory(reverse=True)) | |||
else: | |||
layer = (rec_factory(),) | |||
func = StackedRNN(layer, | |||
num_layers, | |||
lstm=lstm) | |||
def forward(input, cells, hidden, mask): | |||
if batch_first: | |||
input = input.transpose(0, 1) | |||
if mask is not None: | |||
mask = mask.transpose(0, 1) | |||
nexth, output = func(input, hidden, cells, mask) | |||
if batch_first: | |||
output = output.transpose(0, 1) | |||
return output, nexth | |||
return forward | |||
def VarMaskedStep(): | |||
def forward(input, hidden, cell, mask): | |||
if mask is None or mask.data.min() > 0.5: | |||
hidden = cell(input, hidden) | |||
elif mask.data.max() > 0.5: | |||
hidden_next = cell(input, hidden) | |||
# hack to handle LSTM | |||
if isinstance(hidden, tuple): | |||
hx, cx = hidden | |||
hp1, cp1 = hidden_next | |||
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) | |||
else: | |||
hidden = hidden + (hidden_next - hidden) * mask | |||
# hack to handle LSTM | |||
output = hidden[0] if isinstance(hidden, tuple) else hidden | |||
return hidden, output | |||
return forward | |||
def StackedStep(layer, num_layers, lstm=False): | |||
def forward(input, hidden, cells, mask): | |||
assert (len(cells) == num_layers) | |||
next_hidden = [] | |||
if lstm: | |||
hidden = list(zip(*hidden)) | |||
for l in range(num_layers): | |||
hy, output = layer(input, hidden[l], cells[l], mask) | |||
next_hidden.append(hy) | |||
input = output | |||
if lstm: | |||
next_h, next_c = zip(*next_hidden) | |||
next_hidden = ( | |||
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), | |||
torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) | |||
) | |||
else: | |||
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) | |||
return next_hidden, input | |||
return forward | |||
def AutogradVarMaskedStep(num_layers=1, lstm=False): | |||
layer = VarMaskedStep() | |||
func = StackedStep(layer, | |||
num_layers, | |||
lstm=lstm) | |||
def forward(input, cells, hidden, mask): | |||
nexth, output = func(input, hidden, cells, mask) | |||
return output, nexth | |||
return forward | |||
class VarMaskedRNNBase(nn.Module): | |||
def __init__(self, Cell, input_size, hidden_size, | |||
num_layers=1, bias=True, batch_first=False, | |||
dropout=(0, 0), bidirectional=False, initializer=None, **kwargs): | |||
super(VarMaskedRNNBase, self).__init__() | |||
self.Cell = Cell | |||
self.input_size = input_size | |||
self.hidden_size = hidden_size | |||
self.num_layers = num_layers | |||
self.bias = bias | |||
self.batch_first = batch_first | |||
self.bidirectional = bidirectional | |||
self.lstm = False | |||
num_directions = 2 if bidirectional else 1 | |||
self.all_cells = [] | |||
for layer in range(num_layers): | |||
for direction in range(num_directions): | |||
layer_input_size = input_size if layer == 0 else hidden_size * num_directions | |||
cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs) | |||
self.all_cells.append(cell) | |||
self.add_module('cell%d' % (layer * num_directions + direction), cell) | |||
def reset_parameters(self): | |||
for cell in self.all_cells: | |||
cell.reset_parameters() | |||
def reset_noise(self, batch_size): | |||
for cell in self.all_cells: | |||
cell.reset_noise(batch_size) | |||
def forward(self, input, mask=None, hx=None): | |||
batch_size = input.size(0) if self.batch_first else input.size(1) | |||
if hx is None: | |||
num_directions = 2 if self.bidirectional else 1 | |||
hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(), | |||
requires_grad=True) | |||
if self.lstm: | |||
hx = (hx, hx) | |||
func = AutogradVarMaskedRNN(num_layers=self.num_layers, | |||
batch_first=self.batch_first, | |||
bidirectional=self.bidirectional, | |||
lstm=self.lstm) | |||
self.reset_noise(batch_size) | |||
output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,))) | |||
return output, hidden | |||
def step(self, input, hx=None, mask=None): | |||
''' | |||
execute one step forward (only for one-directional RNN). | |||
Args: | |||
input (batch, input_size): input tensor of this step. | |||
hx (num_layers, batch, hidden_size): the hidden state of last step. | |||
mask (batch): the mask tensor of this step. | |||
Returns: | |||
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. | |||
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step | |||
''' | |||
        assert not self.bidirectional, "step() cannot be applied to a bidirectional RNN."
batch_size = input.size(0) | |||
if hx is None: | |||
hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True) | |||
if self.lstm: | |||
hx = (hx, hx) | |||
func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm) | |||
output, hidden = func(input, self.all_cells, hx, mask) | |||
return output, hidden | |||
class VarMaskedFastLSTM(VarMaskedRNNBase): | |||
def __init__(self, *args, **kwargs): | |||
super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs) | |||
self.lstm = True | |||
class VarRNNCellBase(nn.Module): | |||
def __repr__(self): | |||
s = '{name}({input_size}, {hidden_size}' | |||
if 'bias' in self.__dict__ and self.bias is not True: | |||
s += ', bias={bias}' | |||
if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": | |||
s += ', nonlinearity={nonlinearity}' | |||
s += ')' | |||
return s.format(name=self.__class__.__name__, **self.__dict__) | |||
def reset_noise(self, batch_size): | |||
""" | |||
Should be overriden by all subclasses. | |||
Args: | |||
batch_size: (int) batch size of input. | |||
""" | |||
raise NotImplementedError | |||
class VarFastLSTMCell(VarRNNCellBase): | |||
""" | |||
A long short-term memory (LSTM) cell with variational dropout. | |||
.. math:: | |||
\begin{array}{ll} | |||
i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ | |||
f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ | |||
g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ | |||
o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ | |||
c' = f * c + i * g \\ | |||
h' = o * \tanh(c') \\ | |||
\end{array} | |||
""" | |||
def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None): | |||
super(VarFastLSTMCell, self).__init__() | |||
self.input_size = input_size | |||
self.hidden_size = hidden_size | |||
self.bias = bias | |||
self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) | |||
self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) | |||
if bias: | |||
self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) | |||
self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) | |||
else: | |||
self.register_parameter('bias_ih', None) | |||
self.register_parameter('bias_hh', None) | |||
self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer | |||
self.reset_parameters() | |||
p_in, p_hidden = p | |||
if p_in < 0 or p_in > 1: | |||
raise ValueError("input dropout probability has to be between 0 and 1, " | |||
"but got {}".format(p_in)) | |||
if p_hidden < 0 or p_hidden > 1: | |||
raise ValueError("hidden state dropout probability has to be between 0 and 1, " | |||
"but got {}".format(p_hidden)) | |||
self.p_in = p_in | |||
self.p_hidden = p_hidden | |||
self.noise_in = None | |||
self.noise_hidden = None | |||
def reset_parameters(self): | |||
for weight in self.parameters(): | |||
if weight.dim() == 1: | |||
weight.data.zero_() | |||
else: | |||
self.initializer(weight.data) | |||
def reset_noise(self, batch_size): | |||
if self.training: | |||
if self.p_in: | |||
noise = self.weight_ih.data.new(batch_size, self.input_size) | |||
self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in)) | |||
else: | |||
self.noise_in = None | |||
if self.p_hidden: | |||
noise = self.weight_hh.data.new(batch_size, self.hidden_size) | |||
self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden)) | |||
else: | |||
self.noise_hidden = None | |||
else: | |||
self.noise_in = None | |||
self.noise_hidden = None | |||
def forward(self, input, hx): | |||
return self.__forward( | |||
input, hx, | |||
self.weight_ih, self.weight_hh, | |||
self.bias_ih, self.bias_hh, | |||
self.noise_in, self.noise_hidden, | |||
) | |||
@staticmethod | |||
def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): | |||
if noise_in is not None: | |||
if input.is_cuda: | |||
input = input * noise_in.cuda(input.get_device()) | |||
else: | |||
input = input * noise_in | |||
        if input.is_cuda and fusedBackend is not None:
w_ih = w_ih.cuda(input.get_device()) | |||
w_hh = w_hh.cuda(input.get_device()) | |||
hidden = [h.cuda(input.get_device()) for h in hidden] | |||
b_ih = b_ih.cuda(input.get_device()) | |||
b_hh = b_hh.cuda(input.get_device()) | |||
igates = F.linear(input, w_ih.cuda(input.get_device())) | |||
hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \ | |||
else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh) | |||
state = fusedBackend.LSTMFused.apply | |||
# print("use backend") | |||
# use some magic function | |||
return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh) | |||
hx, cx = hidden | |||
if noise_hidden is not None: | |||
hx = hx * noise_hidden | |||
gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) | |||
ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1) | |||
        ingate = torch.sigmoid(ingate)
        forgetgate = torch.sigmoid(forgetgate)
        cellgate = torch.tanh(cellgate)
        outgate = torch.sigmoid(outgate)
        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * torch.tanh(cy)
return hy, cy |
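if __name__ == "__main__":
    # Minimal sanity-check sketch: a 2-layer variational-dropout masked LSTM over a padded batch.
    # Assumes a float mask of shape (seq_len, batch) with 1 for real tokens and 0 for padding;
    # the sizes and the dropout pair (p_in, p_hidden) are illustrative values only.
    seq_len, batch, input_size, hidden_size = 6, 3, 10, 20
    x = torch.randn(seq_len, batch, input_size)
    mask = torch.ones(seq_len, batch)
    mask[4:, 0] = 0  # the first sequence has length 4
    rnn = VarMaskedFastLSTM(input_size, hidden_size, num_layers=2, dropout=(0.2, 0.2))
    rnn.train()
    output, (hn, cn) = rnn(x, mask=mask)
    print("VarMaskedFastLSTM output:", output.shape)  # (seq_len, batch, hidden_size)
    print("VarMaskedFastLSTM hidden:", hn.shape)      # (num_layers, batch, hidden_size)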