From 042f63aa466dff914ca6b6d67b8eeceb44b6f19c Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sun, 15 Jul 2018 18:39:06 +0800 Subject: [PATCH] codebank --- fastNLP/loader/config | 11 + fastNLP/modules/embedding.py | 87 +++++ fastNLP/modules/masked_rnn.py | 422 +++++++++++++++++++++++++ fastNLP/modules/other_modules.py | 490 +++++++++++++++++++++++++++++ fastNLP/modules/utils.py | 231 +++++++++++++- fastNLP/modules/variational_rnn.py | 384 ++++++++++++++++++++++ 6 files changed, 1622 insertions(+), 3 deletions(-) create mode 100644 fastNLP/modules/embedding.py create mode 100644 fastNLP/modules/masked_rnn.py create mode 100644 fastNLP/modules/other_modules.py create mode 100644 fastNLP/modules/variational_rnn.py diff --git a/fastNLP/loader/config b/fastNLP/loader/config index 5eb57db5..a035a6f2 100644 --- a/fastNLP/loader/config +++ b/fastNLP/loader/config @@ -52,3 +52,14 @@ early_stopping = 70 reg = 1e-05 test = 5 new_attr = 40 + +[POStest] +epochs = 20 +batch_size=1 +num_classes=10, +vocab_size=vocab_size +pickle_path=pickle_path +validate=true + + + diff --git a/fastNLP/modules/embedding.py b/fastNLP/modules/embedding.py new file mode 100644 index 00000000..ba70445b --- /dev/null +++ b/fastNLP/modules/embedding.py @@ -0,0 +1,87 @@ +import torch +import torch.nn.functional as F +from torch import nn + + +class ConvCharEmbedding(nn.Module): + + def __init__(self, char_emb_size, feature_maps=(40, 30, 30), kernels=(3, 4, 5)): + """ + Character Level Word Embedding + :param char_emb_size: the size of character level embedding, + say 26 characters, each embedded to 50 dim vector, then the input_size is 50. + :param feature_maps: table of feature maps (for each kernel width) + :param kernels: table of kernel widths + """ + super(ConvCharEmbedding, self).__init__() + self.convs = nn.ModuleList([ + nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) + for i in range(len(kernels))]) + + def forward(self, x): + """ + :param x: [batch_size * sent_length, word_length, char_emb_size] + :return: [batch_size * sent_length, sum(feature_maps), 1] + """ + x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) # [batch_size*sent_length, channel, width, height] + x = x.transpose(2, 3) # [batch_size*sent_length, channel, height, width] + return self.convolute(x).unsqueeze(2) + + def convolute(self, x): + feats = [] + for conv in self.convs: + y = conv(x) # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1] + y = torch.squeeze(y, 2) # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1] + y = F.tanh(y) + y, __ = torch.max(y, 2) # [batch_size*sent_length, feature_maps[i]] + feats.append(y) + return torch.cat(feats, 1) # [batch_size*sent_length, sum(feature_maps)] + + +class LSTMCharEmbedding(nn.Module): + """ + Character Level Word Embedding with LSTM + :param char_emb_size: the size of character level embedding, + say 26 characters, each embedded to 50 dim vector, then the input_size is 50. 
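# Illustrative usage sketch, not part of the patch: how ConvCharEmbedding above is meant to
# be fed. The import path follows the file added in this patch; the character-vocabulary
# size and all tensor sizes below are invented for the example.
import torch
from torch import nn
from fastNLP.modules.embedding import ConvCharEmbedding

char_vocab_size, char_emb_size = 70, 50
n_words, word_length = 32, 7                     # words already flattened to batch * sent_length

char_table = nn.Embedding(char_vocab_size, char_emb_size)
char_ids = torch.randint(0, char_vocab_size, (n_words, word_length), dtype=torch.long)
char_vecs = char_table(char_ids)                 # [batch * sent_length, word_length, char_emb_size]

conv_emb = ConvCharEmbedding(char_emb_size)      # feature_maps=(40, 30, 30) -> 100-dim word vectors
word_vecs = conv_emb(char_vecs)                  # [batch * sent_length, sum(feature_maps), 1]
print(word_vecs.shape)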
+ """ + + def __init__(self, char_emb_size, hidden_size=None): + super(LSTMCharEmbedding, self).__init__() + self.hidden_size = char_emb_size if hidden_size is None else hidden_size + + self.lstm = nn.LSTM(input_size=char_emb_size, + hidden_size=self.hidden_size, + num_layers=1, + bias=True, + batch_first=True) + + def forward(self, x): + """ + :param x:[ n_batch*n_word, word_length, char_emb_size] + :return: [ n_batch*n_word, char_emb_size] + """ + batch_size = x.shape[0] + h0 = torch.empty(1, batch_size, self.hidden_size) + h0 = nn.init.orthogonal_(h0) + c0 = torch.empty(1, batch_size, self.hidden_size) + c0 = nn.init.orthogonal_(c0) + + _, hidden = self.lstm(x, (h0, c0)) + return hidden[0].squeeze().unsqueeze(2) + + +if __name__ == "__main__": + batch_size = 128 + char_emb = 100 + word_length = 1 + x = torch.Tensor(batch_size, char_emb, word_length) + x = x.transpose(1, 2) + cce = ConvCharEmbedding(char_emb) + y = cce(x) + print("CNN Char Emb input: ", x.shape) + print("CNN Char Emb output: ", y.shape) # [128, 100] + + lce = LSTMCharEmbedding(char_emb) + o = lce(x) + print("LSTM Char Emb input: ", x.shape) + print("LSTM Char Emb size: ", o.shape) diff --git a/fastNLP/modules/masked_rnn.py b/fastNLP/modules/masked_rnn.py new file mode 100644 index 00000000..17ebcfd6 --- /dev/null +++ b/fastNLP/modules/masked_rnn.py @@ -0,0 +1,422 @@ +__author__ = 'max' + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def MaskedRecurrent(reverse=False): + def forward(input, hidden, cell, mask, train=True, dropout=0): + """ + :param input: + :param hidden: + :param cell: + :param mask: + :param dropout: step之间的dropout,对mask了的也会drop,应该是没问题的,反正没有gradient + :param train: 控制dropout的行为,在StackedRNN的forward中调用 + :return: + """ + output = [] + steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) + for i in steps: + if mask is None or mask[i].data.min() > 0.5: # 没有mask,都是1 + hidden = cell(input[i], hidden) + elif mask[i].data.max() > 0.5: # 有mask,但不全为0 + hidden_next = cell(input[i], hidden) # 一次喂入一个batch! + # hack to handle LSTM + if isinstance(hidden, tuple): # LSTM outputs a tuple of (hidden, cell), this is a common hack 😁 + mask = mask.float() + hx, cx = hidden + hp1, cp1 = hidden_next + hidden = ( + hx + (hp1 - hx) * mask[i].squeeze(), + cx + (cp1 - cx) * mask[i].squeeze()) # Why? 我知道了!!如果是mask就不用改变 + else: + hidden = hidden + (hidden_next - hidden) * mask[i] + + # if dropout != 0 and train: # warning, should i treat masked tensor differently? + # if isinstance(hidden, tuple): + # hidden = (F.dropout(hidden[0], p=dropout, training=train), + # F.dropout(hidden[1], p=dropout, training=train)) + # else: + # hidden = F.dropout(hidden, p=dropout, training=train) + + # hack to handle LSTM + output.append(hidden[0] if isinstance(hidden, tuple) else hidden) + + if reverse: + output.reverse() + output = torch.cat(output, 0).view(input.size(0), *output[0].size()) + + return hidden, output + + return forward + + +def StackedRNN(inners, num_layers, lstm=False, train=True, step_dropout=0, layer_dropout=0): + num_directions = len(inners) # rec_factory! 
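# Toy illustration, separate from the patch: the update rule used in MaskedRecurrent above,
# hidden = hx + (hp1 - hx) * mask, keeps the previous state where mask == 0 and takes the
# freshly computed state where mask == 1, so padded positions never change the RNN state.
import torch

hx = torch.tensor([[1.0, 1.0], [1.0, 1.0]])     # previous hidden states for a batch of 2
hp1 = torch.tensor([[5.0, 5.0], [7.0, 7.0]])    # states proposed by the cell at this step
mask = torch.tensor([[1.0], [0.0]])             # the second sequence is padding at this step

hidden = hx + (hp1 - hx) * mask
print(hidden)                                   # first row becomes [5., 5.], second stays [1., 1.]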
+ total_layers = num_layers * num_directions + + def forward(input, hidden, cells, mask): + assert (len(cells) == total_layers) + next_hidden = [] + + if lstm: + hidden = list(zip(*hidden)) + + for i in range(num_layers): + all_output = [] + for j, inner in enumerate(inners): + l = i * num_directions + j + hy, output = inner(input, hidden[l], cells[l], mask, step_dropout, train) + next_hidden.append(hy) + all_output.append(output) + + input = torch.cat(all_output, input.dim() - 1) # 下一层的输入 + + if layer_dropout != 0 and i < num_layers - 1: + input = F.dropout(input, p=layer_dropout, training=train, inplace=False) + + if lstm: + next_h, next_c = zip(*next_hidden) + next_hidden = ( + torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), + torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) + ) + else: + next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) + + return next_hidden, input + + return forward + + +def AutogradMaskedRNN(num_layers=1, batch_first=False, train=True, layer_dropout=0, step_dropout=0, + bidirectional=False, lstm=False): + rec_factory = MaskedRecurrent + + if bidirectional: + layer = (rec_factory(), rec_factory(reverse=True)) + else: + layer = (rec_factory(),) # rec_factory 就是每层的结构啦!!在MaskedRecurrent中进行每层的计算!然后用StackedRNN接起来 + + func = StackedRNN(layer, + num_layers, + lstm=lstm, + layer_dropout=layer_dropout, step_dropout=step_dropout, + train=train) + + def forward(input, cells, hidden, mask): + if batch_first: + input = input.transpose(0, 1) + if mask is not None: + mask = mask.transpose(0, 1) + + nexth, output = func(input, hidden, cells, mask) + + if batch_first: + output = output.transpose(0, 1) + + return output, nexth + + return forward + + +def MaskedStep(): + def forward(input, hidden, cell, mask): + if mask is None or mask.data.min() > 0.5: + hidden = cell(input, hidden) + elif mask.data.max() > 0.5: + hidden_next = cell(input, hidden) + # hack to handle LSTM + if isinstance(hidden, tuple): + hx, cx = hidden + hp1, cp1 = hidden_next + hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) + else: + hidden = hidden + (hidden_next - hidden) * mask + # hack to handle LSTM + output = hidden[0] if isinstance(hidden, tuple) else hidden + + return hidden, output + + return forward + + +def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True): + def forward(input, hidden, cells, mask): + assert (len(cells) == num_layers) + next_hidden = [] + + if lstm: + hidden = list(zip(*hidden)) + + for l in range(num_layers): + hy, output = layer(input, hidden[l], cells[l], mask) + next_hidden.append(hy) + input = output + + if dropout != 0 and l < num_layers - 1: + input = F.dropout(input, p=dropout, training=train, inplace=False) + + if lstm: + next_h, next_c = zip(*next_hidden) + next_hidden = ( + torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), + torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) + ) + else: + next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) + + return next_hidden, input + + return forward + + +def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False): + layer = MaskedStep() + + func = StackedStep(layer, + num_layers, + lstm=lstm, + dropout=dropout, + train=train) + + def forward(input, cells, hidden, mask): + nexth, output = func(input, hidden, cells, mask) + return output, nexth + + return forward + + +class MaskedRNNBase(nn.Module): + def __init__(self, Cell, input_size, hidden_size, + num_layers=1, bias=True, batch_first=False, + 
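# Small sketch, not from the patch: how StackedRNN / MaskedRNNBase index their flat cell
# list. For a stacked bidirectional RNN, cell l = layer * num_directions + direction, which
# is exactly the order in which MaskedRNNBase.__init__ below appends to self.all_cells.
num_layers, num_directions = 3, 2
for layer in range(num_layers):
    for direction in range(num_directions):
        l = layer * num_directions + direction
        print("layer %d, %s direction -> all_cells[%d]"
              % (layer, "forward" if direction == 0 else "backward", l))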
layer_dropout=0, step_dropout=0, bidirectional=False, **kwargs): + """ + :param Cell: + :param input_size: + :param hidden_size: + :param num_layers: + :param bias: + :param batch_first: + :param layer_dropout: + :param step_dropout: + :param bidirectional: + :param kwargs: + """ + + super(MaskedRNNBase, self).__init__() + self.Cell = Cell + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.layer_dropout = layer_dropout + self.step_dropout = step_dropout + self.bidirectional = bidirectional + num_directions = 2 if bidirectional else 1 + + self.all_cells = [] + for layer in range(num_layers): # 初始化所有cell + for direction in range(num_directions): + layer_input_size = input_size if layer == 0 else hidden_size * num_directions + + cell = self.Cell(layer_input_size, hidden_size, self.bias, **kwargs) + self.all_cells.append(cell) + self.add_module('cell%d' % (layer * num_directions + direction), cell) # Max的代码写得真好看 + + def reset_parameters(self): + for cell in self.all_cells: + cell.reset_parameters() + + def forward(self, input, mask=None, hx=None): + batch_size = input.size(0) if self.batch_first else input.size(1) + lstm = self.Cell is nn.LSTMCell + if hx is None: + num_directions = 2 if self.bidirectional else 1 + hx = torch.autograd.Variable( + input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_()) + if lstm: + hx = (hx, hx) + + func = AutogradMaskedRNN(num_layers=self.num_layers, + batch_first=self.batch_first, + step_dropout=self.step_dropout, + layer_dropout=self.layer_dropout, + train=self.training, + bidirectional=self.bidirectional, + lstm=lstm) # 传入all_cells,继续往底层封装走 + + output, hidden = func(input, self.all_cells, hx, + None if mask is None else mask.view(mask.size() + (1,))) # 这个+ (1, )是个什么操作? + return output, hidden + + def step(self, input, hx=None, mask=None): + ''' + execute one step forward (only for one-directional RNN). + Args: + input (batch, input_size): input tensor of this step. + hx (num_layers, batch, hidden_size): the hidden state of last step. + mask (batch): the mask tensor of this step. + Returns: + output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. + hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step + ''' + assert not self.bidirectional, "step only cannot be applied to bidirectional RNN." # aha, typo! + batch_size = input.size(0) + lstm = self.Cell is nn.LSTMCell + if hx is None: + hx = torch.autograd.Variable(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_()) + if lstm: + hx = (hx, hx) + + func = AutogradMaskedStep(num_layers=self.num_layers, + dropout=self.dropout, + train=self.training, + lstm=lstm) + + output, hidden = func(input, self.all_cells, hx, mask) + return output, hidden + + +class MaskedRNN(MaskedRNNBase): + r"""Applies a multi-layer Elman RNN with costomized non-linearity to an + input sequence. + For each element in the input sequence, each layer computes the following + function: + .. math:: + h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) + where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is + the hidden state of the previous layer at time `t` or :math:`input_t` + for the first layer. If nonlinearity='relu', then `ReLU` is used instead + of `tanh`. 
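# Standalone toy answer to the question raised in the comment above: mask.view(mask.size() + (1,))
# only appends a trailing singleton dimension, so a (seq_len, batch) 0/1 mask can broadcast
# against (seq_len, batch, hidden_size) hidden states inside MaskedRecurrent.
import torch

mask = torch.ones(5, 3)                     # (seq_len, batch)
expanded = mask.view(mask.size() + (1,))    # torch.Size is a tuple, so + (1,) just extends it
print(expanded.shape)                       # torch.Size([5, 3, 1])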
+ Args: + input_size: The number of expected features in the input x + hidden_size: The number of features in the hidden state h + num_layers: Number of recurrent layers. + nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' + bias: If False, then the layer does not use bias weights b_ih and b_hh. + Default: True + batch_first: If True, then the input and output tensors are provided + as (batch, seq, feature) + dropout: If non-zero, introduces a dropout layer on the outputs of each + RNN layer except the last layer + bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, h_0 + - **input** (seq_len, batch, input_size): tensor containing the features + of the input sequence. + **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence. + - **h_0** (num_layers * num_directions, batch, hidden_size): tensor + containing the initial hidden state for each element in the batch. + Outputs: output, h_n + - **output** (seq_len, batch, hidden_size * num_directions): tensor + containing the output features (h_k) from the last layer of the RNN, + for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has + been given as the input, the output will also be a packed sequence. + - **h_n** (num_layers * num_directions, batch, hidden_size): tensor + containing the hidden state for k=seq_len. + """ + + def __init__(self, *args, **kwargs): + super(MaskedRNN, self).__init__(nn.RNNCell, *args, **kwargs) + + +class MaskedLSTM(MaskedRNNBase): + r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input + sequence. + For each element in the input sequence, each layer computes the following + function: + .. math:: + \begin{array}{ll} + i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ + f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\ + o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ + c_t = f_t * c_{(t-1)} + i_t * g_t \\ + h_t = o_t * \tanh(c_t) + \end{array} + where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell + state at time `t`, :math:`x_t` is the hidden state of the previous layer at + time `t` or :math:`input_t` for the first layer, and :math:`i_t`, + :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, + and out gates, respectively. + Args: + input_size: The number of expected features in the input x + hidden_size: The number of features in the hidden state h + num_layers: Number of recurrent layers. + bias: If False, then the layer does not use bias weights b_ih and b_hh. + Default: True + batch_first: If True, then the input and output tensors are provided + as (batch, seq, feature) + dropout: If non-zero, introduces a dropout layer on the outputs of each + RNN layer except the last layer + bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, (h_0, c_0) + - **input** (seq_len, batch, input_size): tensor containing the features + of the input sequence. + **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence. + - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor + containing the initial hidden state for each element in the batch. + - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor + containing the initial cell state for each element in the batch. 
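# Hedged usage sketch for the MaskedRNN class defined above (the fastNLP.modules.masked_rnn
# import path comes from this patch; all sizes and lengths below are invented). The mask is
# a (seq_len, batch) tensor holding 1.0 for real tokens and 0.0 for padding.
import torch
from fastNLP.modules.masked_rnn import MaskedRNN

seq_len, batch, input_size, hidden_size = 6, 4, 10, 16
rnn = MaskedRNN(input_size, hidden_size, num_layers=2, bidirectional=True)

x = torch.randn(seq_len, batch, input_size)
lengths = [6, 5, 3, 2]
mask = torch.stack([(torch.arange(seq_len) < l) for l in lengths], dim=1).float()

output, h_n = rnn(x, mask=mask)
print(output.shape)     # (seq_len, batch, hidden_size * num_directions) = (6, 4, 32)
print(h_n.shape)        # (num_layers * num_directions, batch, hidden_size) = (4, 4, 16)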
+ Outputs: output, (h_n, c_n) + - **output** (seq_len, batch, hidden_size * num_directions): tensor + containing the output features `(h_t)` from the last layer of the RNN, + for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been + given as the input, the output will also be a packed sequence. + - **h_n** (num_layers * num_directions, batch, hidden_size): tensor + containing the hidden state for t=seq_len + - **c_n** (num_layers * num_directions, batch, hidden_size): tensor + containing the cell state for t=seq_len + """ + + def __init__(self, *args, **kwargs): + super(MaskedLSTM, self).__init__(nn.LSTMCell, *args, **kwargs) + + +class MaskedGRU(MaskedRNNBase): + r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. + For each element in the input sequence, each layer computes the following + function: + .. math:: + \begin{array}{ll} + r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ + z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ + n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ + h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ + \end{array} + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden + state of the previous layer at time `t` or :math:`input_t` for the first + layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input, + and new gates, respectively. + Args: + input_size: The number of expected features in the input x + hidden_size: The number of features in the hidden state h + num_layers: Number of recurrent layers. + nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' + bias: If False, then the layer does not use bias weights b_ih and b_hh. + Default: True + batch_first: If True, then the input and output tensors are provided + as (batch, seq, feature) + dropout: If non-zero, introduces a dropout layer on the outputs of each + RNN layer except the last layer + bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, h_0 + - **input** (seq_len, batch, input_size): tensor containing the features + of the input sequence. + **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence. + - **h_0** (num_layers * num_directions, batch, hidden_size): tensor + containing the initial hidden state for each element in the batch. + Outputs: output, h_n + - **output** (seq_len, batch, hidden_size * num_directions): tensor + containing the output features (h_k) from the last layer of the RNN, + for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has + been given as the input, the output will also be a packed sequence. + - **h_n** (num_layers * num_directions, batch, hidden_size): tensor + containing the hidden state for k=seq_len. 
+ """ + + def __init__(self, *args, **kwargs): + super(MaskedGRU, self).__init__(nn.GRUCell, *args, **kwargs) diff --git a/fastNLP/modules/other_modules.py b/fastNLP/modules/other_modules.py new file mode 100644 index 00000000..555234c5 --- /dev/null +++ b/fastNLP/modules/other_modules.py @@ -0,0 +1,490 @@ +import numpy as np +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import Parameter + +from .emd import WlossLayer +from ..utils import orthogonal + + +class GroupNorm(nn.Module): + def __init__(self, num_features, num_groups=20, eps=1e-5): + super(GroupNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(1, num_features, 1)) + self.bias = nn.Parameter(torch.zeros(1, num_features, 1)) + self.num_groups = num_groups + self.eps = eps + + def forward(self, x): + N, C, H = x.size() + G = self.num_groups + assert C % G == 0 + + x = x.view(N, G, -1) + mean = x.mean(-1, keepdim=True) + var = x.var(-1, keepdim=True) + + x = (x - mean) / (var + self.eps).sqrt() + x = x.view(N, C, H) + return x * self.weight + self.bias + + +class LayerNormalization(nn.Module): + """ Layer normalization module """ + + def __init__(self, d_hid, eps=1e-3): + super(LayerNormalization, self).__init__() + + self.eps = eps + self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True) + self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True) + + def forward(self, z): + if z.size(1) == 1: + return z + + mu = torch.mean(z, keepdim=True, dim=-1) + sigma = torch.std(z, keepdim=True, dim=-1) + ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps) + ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out) + + return ln_out + + +class OrthEmbedding(nn.Embedding): + def __init__(self, *args, **kwargs): + super(OrthEmbedding, self).__init__(*args, **kwargs) + + def reset_parameters(self): + self.weight = orthogonal(self.weight) + nn.init.constant_(self.bias, 0.) + + +class BiLinear(nn.Module): + def __init__(self, n_left, n_right, n_out, bias=True): + """ + Args: + n_left: size of left input + n_right: size of right input + n_out: size of output + bias: If set to False, the layer will not learn an additive bias. + Default: True + """ + super(BiLinear, self).__init__() + self.n_left = n_left + self.n_right = n_right + self.n_out = n_out + + self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right)) + self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left)) + self.W_r = Parameter(torch.Tensor(self.n_out, self.n_left)) + + if bias: + self.bias = Parameter(torch.Tensor(n_out)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.W_l) + nn.init.xavier_uniform_(self.W_r) + nn.init.constant_(self.bias, 0.) 
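# Quick shape check, not part of the patch, for the normalization layers defined earlier in
# this file (toy sizes; the classes are assumed to be in scope, as the snippet belongs
# conceptually to other_modules.py). GroupNorm expects (batch, channels, length) input and
# channels must be divisible by num_groups.
import torch

gn = GroupNorm(num_features=40, num_groups=20)
ln = LayerNormalization(d_hid=40)

x = torch.randn(8, 40, 50)
print(gn(x).shape)                      # torch.Size([8, 40, 50])
print(ln(torch.randn(8, 5, 40)).shape)  # torch.Size([8, 5, 40]), normalized over the last dim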
+ nn.init.xavier_uniform_(self.U) + + def forward(self, input_left, input_right): + """ + Args: + input_left: Tensor + the left input tensor with shape = [batch1, batch2, ..., left_features] + input_right: Tensor + the right input tensor with shape = [batch1, batch2, ..., right_features] + Returns: + """ + left_size = input_left.size() + right_size = input_right.size() + assert left_size[:-1] == right_size[:-1], \ + "batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1]) + batch = int(np.prod(left_size[:-1])) + + # convert left and right input to matrices [batch, left_features], [batch, right_features] + input_left = input_left.view(batch, self.n_left) + input_right = input_right.view(batch, self.n_right) + + # output [batch, out_features] + output = F.bilinear(input_left, input_right, self.U, self.bias) + output = output + \ + F.linear(input_left, self.W_l, None) + \ + F.linear(input_right, self.W_r, None) + # convert back to [batch1, batch2, ..., out_features] + return output.view(left_size[:-1] + (self.n_out,)) + + def __repr__(self): + return self.__class__.__name__ + ' (' \ + + 'in1_features=' + str(self.n_left) \ + + ', in2_features=' + str(self.n_right) \ + + ', out_features=' + str(self.n_out) + ')' + + +class BiAffine(nn.Module): + def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs): + """ + Args: + n_enc: int + the dimension of the encoder input. + n_dec: int + the dimension of the decoder input. + n_labels: int + the number of labels of the crf layer + biaffine: bool + if apply bi-affine parameter. + **kwargs: + """ + super(BiAffine, self).__init__() + self.n_enc = n_enc + self.n_dec = n_dec + self.num_labels = n_labels + self.biaffine = biaffine + + self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec)) + self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc)) + self.b = Parameter(torch.Tensor(self.num_labels, 1, 1)) + if self.biaffine: + self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc)) + else: + self.register_parameter('U', None) + + self.reset_parameters() + + def reset_parameters(self): + nn.init.xavier_uniform_(self.W_d) + nn.init.xavier_uniform_(self.W_e) + nn.init.constant_(self.b, 0.) + if self.biaffine: + nn.init.xavier_uniform_(self.U) + + def forward(self, input_d, input_e, mask_d=None, mask_e=None): + """ + Args: + input_d: Tensor + the decoder input tensor with shape = [batch, length_decoder, input_size] + input_e: Tensor + the child input tensor with shape = [batch, length_encoder, input_size] + mask_d: Tensor or None + the mask tensor for decoder with shape = [batch, length_decoder] + mask_e: Tensor or None + the mask tensor for encoder with shape = [batch, length_encoder] + Returns: Tensor + the energy tensor with shape = [batch, num_label, length, length] + """ + assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.' 
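# Hedged usage sketch for BiLinear above (sizes invented; the class is assumed in scope).
# n_left and n_right are kept equal here because W_r above is allocated with shape
# (n_out, n_left), so the extra linear term only lines up when both inputs share a width.
import torch

bilinear = BiLinear(n_left=100, n_right=100, n_out=5)
left = torch.randn(32, 20, 100)          # (batch, length, left_features)
right = torch.randn(32, 20, 100)         # (batch, length, right_features)
print(bilinear(left, right).shape)       # torch.Size([32, 20, 5])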
+ batch, length_decoder, _ = input_d.size() + _, length_encoder, _ = input_e.size() + + # compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder] + # the output shape is [batch, num_label, length_decoder] + out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3) + # compute decoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder] + # the output shape is [batch, num_label, length_encoder] + out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2) + + # output shape [batch, num_label, length_decoder, length_encoder] + if self.biaffine: + # compute bi-affine part + # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder] + # output shape [batch, num_label, length_decoder, input_size_encoder] + output = torch.matmul(input_d.unsqueeze(1), self.U) + # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder] + # output shape [batch, num_label, length_decoder, length_encoder] + output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3)) + + output = output + out_d + out_e + self.b + else: + output = out_d + out_d + self.b + + if mask_d is not None: + output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2) + + return output + + +class Transpose(nn.Module): + def __init__(self, x, y): + super(Transpose, self).__init__() + self.x = x + self.y = y + + def forward(self, x): + return x.transpose(self.x, self.y) + + +class WordDropout(nn.Module): + def __init__(self, dropout_rate, drop_to_token): + super(WordDropout, self).__init__() + self.dropout_rate = dropout_rate + self.drop_to_token = drop_to_token + + def forward(self, word_idx): + if not self.training: + return word_idx + drop_mask = torch.rand(word_idx.shape) < self.dropout_rate + if word_idx.device.type == 'cuda': + drop_mask = drop_mask.cuda() + drop_mask = drop_mask.long() + output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx + return output + + +import torch +import torch.utils.data +import numpy +from torch.autograd import Function, Variable +from torch import optim + + +class WlossLayer(torch.nn.Module): + def __init__(self, lam=100, sinkhorn_iter=50): + super(WlossLayer, self).__init__() + + # cost = matrix M = distance matrix + # lam = lambda of type float > 0 + # sinkhorn_iter > 0 + # diagonal cost should be 0 + self.lam = lam + self.sinkhorn_iter = sinkhorn_iter + # self.register_buffer("K", torch.exp(-self.cost / self.lam).double()) + # self.register_buffer("KM", (self.cost * self.K).double()) + + def forward(self, pred, target, cost): + return WassersteinLossStab.apply(pred, target, + cost, self.lam, self.sinkhorn_iter) + + +class WassersteinLossStab(Function): + @staticmethod + def forward(ctx, pred, target, cost, + lam=1e-3, sinkhorn_iter=4): + """pred: Batch * K: K = # mass points + target: Batch * L: L = # mass points""" + # import pdb + # pdb.set_trace() + eps = 1e-8 + + # pred = pred.gather(dim=1, index=) + na = pred.size(1) + nb = target.size(1) + + cost = cost.double() + pred = pred.double() + target = target.double() + + cost = cost[:na, :nb].double() + K = torch.exp(-cost / lam).double() + KM = (cost * K).double() + + batch_size = pred.size(0) + + # pdb.set_trace() + log_a, log_b = torch.log(pred + eps), torch.log(target + eps) + log_u = cost.new(batch_size, na).fill_(-numpy.log(na)) + log_v = cost.new(batch_size, nb).fill_(-numpy.log(nb)) + # import pdb + # pdb.set_trace() + 
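# Shape sketch for the BiAffine scorer defined above (toy sizes; class assumed in scope).
# Given decoder states (batch, len_dec, n_dec) and encoder states (batch, len_enc, n_enc),
# it returns label scores of shape (batch, n_labels, len_dec, len_enc).
import torch

biaffine = BiAffine(n_enc=80, n_dec=100, n_labels=3)
dec = torch.randn(2, 7, 100)
enc = torch.randn(2, 9, 80)
print(biaffine(dec, enc).shape)   # torch.Size([2, 3, 7, 9])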
for i in range(int(sinkhorn_iter)): + log_u_max = torch.max(log_u, dim=1)[0] + u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps) + log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1) + log_v_max = torch.max(log_v, dim=1)[0] + v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) + tmp = log_u + log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1) + # print(log_u.sum()) + if torch.norm(tmp - log_u) / torch.norm(log_u) < eps: + break + + log_v_max = torch.max(log_v, dim=1)[0] + v_stab = torch.exp(log_v - log_v_max.unsqueeze(1)) + logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1) + wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss... + grad_input = log_u * lam + # print("log_u", log_u) + grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) + grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1) + grad_input = grad_input / batch_size + + ctx.save_for_backward(grad_input) + # print("grad type", type(grad_input)) + + return pred.new((wnorm,)), grad_input + + @staticmethod + def backward(ctx, grad_output, _): + grad_input = ctx.saved_variables + # print(grad) + res = grad_output.clone() + res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data) + res = res.mul_(grad_output[0]).float() + # print("in backward func:\n\n", res) + return res, None, None, None, None, None, None + + +class Sinkhorn(Function): + def __init__(self): + super(Sinkhorn, self).__init__() + + def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop): + a = a.double() + b = b.double() + M = M.double() + + nbb = b.size(1) + + # init data + na = len(a) + nb = len(b) + + cpt = 0 + + # we assume that no distances are null except those of the diagonal of + # distances + if warmstart is None: + alpha, beta = np.zeros(na), np.zeros(nb) + else: + alpha, beta = warmstart + + if nbb: + u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb + else: + u, v = np.ones(na) / na, np.ones(nb) / nb + + def get_K(alpha, beta): + """log space computation""" + return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg) + + def get_Gamma(alpha, beta, u, v): + """log space gamma computation""" + return np.exp( + -(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log( + v.reshape((1, nb)))) + + # print(np.min(K)) + + K = get_K(alpha, beta) + transp = K + cpt = 0 + err = 1 + while 1: + + uprev = u + vprev = v + + # sinkhorn update + v = b / (np.dot(K.T, u) + 1e-16) + u = a / (np.dot(K, v) + 1e-16) + + # remove numerical problems and store them in K + if np.abs(u).max() > tau or np.abs(v).max() > tau: + if nbb: + alpha, beta = alpha + reg * \ + np.max(np.log(u), 1), beta + reg * np.max(np.log(v)) + else: + alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v) + if nbb: + u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb + else: + u, v = np.ones(na) / na, np.ones(nb) / nb + K = get_K(alpha, beta) + + if cpt % print_period == 0: + # we can speed up the process by checking for the error only all + # the 10th iterations + if nbb: + err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \ + np.sum((v - vprev) ** 2) / np.sum((v) ** 2) + else: + transp = get_Gamma(alpha, beta, u, v) + err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2 + if log: + log['err'].append(err) + + if verbose: + if cpt % (print_period * 20) == 0: + print( + '{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19) + 
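# Minimal, self-contained Sinkhorn sketch in plain numpy (toy data, independent of the class
# above) showing the alternating u/v scaling that the loop above performs: the transport
# plan is diag(u) @ K @ diag(v) with K = exp(-M / reg), rescaled until its marginals match a and b.
import numpy as np

a = np.array([0.5, 0.5])           # source distribution
b = np.array([0.7, 0.3])           # target distribution
M = np.array([[0.0, 1.0],
              [1.0, 0.0]])         # ground cost
reg = 0.1
K = np.exp(-M / reg)

u = np.ones_like(a)
for _ in range(200):
    v = b / (K.T @ u)
    u = a / (K @ v)

P = np.diag(u) @ K @ np.diag(v)      # approximate optimal transport plan
print(P.sum(axis=1), P.sum(axis=0))  # close to a and b respectively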
print('{:5d}|{:8e}|'.format(cpt, err)) + + if err <= stopThr: + loop = False + + if cpt >= numItermax: + loop = False + + if np.any(np.isnan(u)) or np.any(np.isnan(v)): + # we have reached the machine precision + # come back to previous solution and quit loop + print('Warning: numerical errors at iteration', cpt) + u = uprev + v = vprev + break + + cpt = cpt + 1 + + # print('err=',err,' cpt=',cpt) + if log: + log['logu'] = alpha / reg + np.log(u) + log['logv'] = beta / reg + np.log(v) + log['alpha'] = alpha + reg * np.log(u) + log['beta'] = beta + reg * np.log(v) + log['warmstart'] = (log['alpha'], log['beta']) + if nbb: + res = np.zeros((nbb)) + for i in range(nbb): + res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) + return res, log + + else: + return get_Gamma(alpha, beta, u, v), log + else: + if nbb: + res = np.zeros((nbb)) + for i in range(nbb): + res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M) + return res + else: + return get_Gamma(alpha, beta, u, v) + + +if __name__ == "__main__": + cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda() + mylayer = WlossLayer(cost) # .cuda() + inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda() + ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda() + + res, _ = mylayer(inp, ground_true) + # print(inp.requires_grad, res.requires_grad) + # print(res, inp) + mylayer.zero_grad() + res.backward() + print("inp's gradient is good:") + print(inp.grad) + + print("convert to gpu:\n", inp.cuda().grad) + print("==============================================" + "\n However, this does not work on pytorch when GPU is enabled") + + cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda() + mylayer = WlossLayer(cost).cuda() + inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda() + ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda() + + opt = optim.SGD([ + {'params': mylayer.parameters()}, + ], lr=1e-2, momentum=0.9) + + res, _ = mylayer(inp, ground_true) + # print(inp.requires_grad, res.requires_grad) + # print(res, inp) + mylayer.zero_grad() + res.backward() + print("input's gradient is None!!!!!!!!!!!!!!!!") + print(inp.grad) diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index a6b31a20..ef4cc87b 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,6 +1,3 @@ -import torch - - def mask_softmax(matrix, mask): if mask is None: result = torch.nn.functional.softmax(matrix, dim=-1) @@ -13,3 +10,231 @@ def seq_mask(seq_len, max_len): mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] mask = torch.stack(mask, 1) return mask + + +""" + Codes from FudanParser +""" +from collections import defaultdict + +import numpy as np +import torch + + +def expand_gt(gt): + """expand_gt: Expand ground truth to matrix + Arguments: + gt: tensor of (n, l) + Return: + f: ground truth matrix of (n, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$. 
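# Hedged example for the seq_mask helper shown in the hunk above (import path assumed from
# this repository layout): it turns a list of sequence lengths into a (batch, max_len) 0/1 mask.
import torch
from fastNLP.modules.utils import seq_mask

mask = seq_mask([4, 2, 5], max_len=5)
print(mask)
# 1 1 1 1 0
# 1 1 0 0 0
# 1 1 1 1 1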
+ """ + n, l = gt.shape + ret = torch.zeros(n, l, l).long() + for i in range(n): + ret[i][torch.arange(l).long(), gt[i]] = 1 + return ret + + +def greedy_decoding(arc_f): + """greedy_decoding + Arguments: + arc_f: a tensor in shape of (n, l+1, l+1) + length of the sentence is l and index 0 is + Output: + arc_pred: a tensor in shape of (n, l), indicating the head words + """ + + f_arc = arc_f[:, 1:, :] # ignore the root + _, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False) + return arc_pred + + +def mst_decoding(arc_f): + batch_size = arc_f.shape[0] + length = arc_f.shape[1] + arc_score = arc_f.data.cpu() + pred_collection = [] + for i in range(batch_size): + head = mst(arc_score[i].numpy()) + pred_collection.append(head[1:].reshape((1, length - 1))) + arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long() + return arc_pred + + +def outer_product(features): + """InterProduct: Get inter sequence product of features + Arguments: + features: feature vectors of sequence in the shape of (n, l, h) + Return: + f: product result in (n, l, l, h) shape + """ + n, l, c = features.shape + features = features.contiguous() + x = features.view(n, l, 1, c) + x = x.expand(n, l, l, c) + y = features.view(n, 1, l, c).contiguous() + y = y.expand(n, l, l, c) + return x * y + + +def outer_concat(features): + """InterProduct: Get inter sequence concatenation of features + Arguments: + features: feature vectors of sequence in the shape of (n, l, h) + Return: + f: product result in (n, l, l, h) shape + """ + n, l, c = features.shape + x = features.contiguous().view(n, l, 1, c) + x = x.expand(n, l, l, c) + y = features.view(n, 1, l, c) + y = y.expand(n, l, l, c) + return torch.cat((x, y), dim=3) + + +def mst(scores): + """ + https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA + """ + length = scores.shape[0] + min_score = scores.min() - 1 + eye = np.eye(length) + scores = scores * (1 - eye) + min_score * eye + heads = np.argmax(scores, axis=1) + heads[0] = 0 + tokens = np.arange(1, length) + roots = np.where(heads[tokens] == 0)[0] + 1 + if len(roots) < 1: + root_scores = scores[tokens, 0] + head_scores = scores[tokens, heads[tokens]] + new_root = tokens[np.argmax(root_scores / head_scores)] + heads[new_root] = 0 + elif len(roots) > 1: + root_scores = scores[roots, 0] + scores[roots, 0] = 0 + new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1 + new_root = roots[np.argmin( + scores[roots, new_heads] / root_scores)] + heads[roots] = new_heads + heads[new_root] = 0 + + edges = defaultdict(set) + vertices = set((0,)) + for dep, head in enumerate(heads[tokens]): + vertices.add(dep + 1) + edges[head].add(dep + 1) + for cycle in _find_cycle(vertices, edges): + dependents = set() + to_visit = set(cycle) + while len(to_visit) > 0: + node = to_visit.pop() + if node not in dependents: + dependents.add(node) + to_visit.update(edges[node]) + cycle = np.array(list(cycle)) + old_heads = heads[cycle] + old_scores = scores[cycle, old_heads] + non_heads = np.array(list(dependents)) + scores[np.repeat(cycle, len(non_heads)), + np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score + new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1 + new_scores = scores[cycle, new_heads] / old_scores + change = np.argmax(new_scores) + changed_cycle = cycle[change] + old_head = old_heads[change] + new_head = new_heads[change] + heads[changed_cycle] = new_head + edges[new_head].add(changed_cycle) + 
edges[old_head].remove(changed_cycle) + + return heads + + +def _find_cycle(vertices, edges): + """ + https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA + https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA + """ + _index = 0 + _stack = [] + _indices = {} + _lowlinks = {} + _onstack = defaultdict(lambda: False) + _SCCs = [] + + def _strongconnect(v): + nonlocal _index + _indices[v] = _index + _lowlinks[v] = _index + _index += 1 + _stack.append(v) + _onstack[v] = True + + for w in edges[v]: + if w not in _indices: + _strongconnect(w) + _lowlinks[v] = min(_lowlinks[v], _lowlinks[w]) + elif _onstack[w]: + _lowlinks[v] = min(_lowlinks[v], _indices[w]) + + if _lowlinks[v] == _indices[v]: + SCC = set() + while True: + w = _stack.pop() + _onstack[w] = False + SCC.add(w) + if not (w != v): + break + _SCCs.append(SCC) + + for v in vertices: + if v not in _indices: + _strongconnect(v) + + return [SCC for SCC in _SCCs if len(SCC) > 1] + + +# https://github.com/alykhantejani/nninit/blob/master/nninit.py +def orthogonal(tensor, gain=1): + """Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions, + and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with + rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements + will be drawn from a normal distribution with mean=0 and std=`std`. + Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al. + Args: + tensor: a n-dimension torch.Tensor, where n >= 2 + gain: optional gain to be applied + Examples: + >>> w = torch.Tensor(3, 5) + >>> nninit.orthogonal(w) + """ + if tensor.ndimension() < 2: + raise ValueError("Only tensors with 2 or more dimensions are supported.") + + flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:]))) + flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1) + + u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False) + if u.shape == flattened.detach().numpy().shape: + tensor.view_as(flattened).copy_(torch.from_numpy(u)) + else: + tensor.view_as(flattened).copy_(torch.from_numpy(v)) + + tensor.mul_(gain) + with torch.no_grad(): + return tensor + + +def generate_step_dropout(masks, hidden_dim, step_dropout, training=False): + # assume batch first + # import pdb + # pdb.set_trace() + + batch, length = masks.size() + if not training: + return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) * masks.view(batch, + length, 1) + masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout) + masked = torch.bernoulli(masked).repeat(1, length, 1) + masked = masked.cuda(masks.device) * masks.view(batch, length, 1) + return masked diff --git a/fastNLP/modules/variational_rnn.py b/fastNLP/modules/variational_rnn.py new file mode 100644 index 00000000..b08bdd2d --- /dev/null +++ b/fastNLP/modules/variational_rnn.py @@ -0,0 +1,384 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend +from torch.nn.parameter import Parameter + + +def default_initializer(hidden_size): + stdv = 1.0 / math.sqrt(hidden_size) + + def forward(tensor): + nn.init.uniform_(tensor, -stdv, stdv) + + return forward + + +def VarMaskedRecurrent(reverse=False): + 
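# Toy check for the mst() decoder completed above (invented scores; function assumed in
# scope). Row i holds the score of every candidate head for token i; index 0 is the
# artificial ROOT, and the returned heads[0] entry is a placeholder.
import numpy as np

scores = np.array([
    [0.0, 0.0, 0.0],    # ROOT row (ignored)
    [9.0, 0.0, 1.0],    # token 1 strongly prefers ROOT as its head
    [0.0, 8.0, 0.0],    # token 2 strongly prefers token 1 as its head
])
print(mst(scores))      # [0 0 1]: token 1 attaches to ROOT, token 2 attaches to token 1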
def forward(input, hidden, cell, mask): + output = [] + steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0)) + for i in steps: + if mask is None or mask[i].data.min() > 0.5: + hidden = cell(input[i], hidden) + elif mask[i].data.max() > 0.5: + hidden_next = cell(input[i], hidden) + # hack to handle LSTM + if isinstance(hidden, tuple): + hx, cx = hidden + hp1, cp1 = hidden_next + hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i]) + else: + hidden = hidden + (hidden_next - hidden) * mask[i] + # hack to handle LSTM + output.append(hidden[0] if isinstance(hidden, tuple) else hidden) + + if reverse: + output.reverse() + output = torch.cat(output, 0).view(input.size(0), *output[0].size()) + + return hidden, output + + return forward + + +def StackedRNN(inners, num_layers, lstm=False): + num_directions = len(inners) + total_layers = num_layers * num_directions + + def forward(input, hidden, cells, mask): + assert (len(cells) == total_layers) + next_hidden = [] + + if lstm: + hidden = list(zip(*hidden)) + + for i in range(num_layers): + all_output = [] + for j, inner in enumerate(inners): + l = i * num_directions + j + hy, output = inner(input, hidden[l], cells[l], mask) + next_hidden.append(hy) + all_output.append(output) + + input = torch.cat(all_output, input.dim() - 1) + + if lstm: + next_h, next_c = zip(*next_hidden) + next_hidden = ( + torch.cat(next_h, 0).view(total_layers, *next_h[0].size()), + torch.cat(next_c, 0).view(total_layers, *next_c[0].size()) + ) + else: + next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size()) + + return next_hidden, input + + return forward + + +def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False): + rec_factory = VarMaskedRecurrent + + if bidirectional: + layer = (rec_factory(), rec_factory(reverse=True)) + else: + layer = (rec_factory(),) + + func = StackedRNN(layer, + num_layers, + lstm=lstm) + + def forward(input, cells, hidden, mask): + if batch_first: + input = input.transpose(0, 1) + if mask is not None: + mask = mask.transpose(0, 1) + + nexth, output = func(input, hidden, cells, mask) + + if batch_first: + output = output.transpose(0, 1) + + return output, nexth + + return forward + + +def VarMaskedStep(): + def forward(input, hidden, cell, mask): + if mask is None or mask.data.min() > 0.5: + hidden = cell(input, hidden) + elif mask.data.max() > 0.5: + hidden_next = cell(input, hidden) + # hack to handle LSTM + if isinstance(hidden, tuple): + hx, cx = hidden + hp1, cp1 = hidden_next + hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask) + else: + hidden = hidden + (hidden_next - hidden) * mask + # hack to handle LSTM + output = hidden[0] if isinstance(hidden, tuple) else hidden + + return hidden, output + + return forward + + +def StackedStep(layer, num_layers, lstm=False): + def forward(input, hidden, cells, mask): + assert (len(cells) == num_layers) + next_hidden = [] + + if lstm: + hidden = list(zip(*hidden)) + + for l in range(num_layers): + hy, output = layer(input, hidden[l], cells[l], mask) + next_hidden.append(hy) + input = output + + if lstm: + next_h, next_c = zip(*next_hidden) + next_hidden = ( + torch.cat(next_h, 0).view(num_layers, *next_h[0].size()), + torch.cat(next_c, 0).view(num_layers, *next_c[0].size()) + ) + else: + next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size()) + + return next_hidden, input + + return forward + + +def AutogradVarMaskedStep(num_layers=1, lstm=False): + layer = VarMaskedStep() + 
+ func = StackedStep(layer, + num_layers, + lstm=lstm) + + def forward(input, cells, hidden, mask): + nexth, output = func(input, hidden, cells, mask) + return output, nexth + + return forward + + +class VarMaskedRNNBase(nn.Module): + def __init__(self, Cell, input_size, hidden_size, + num_layers=1, bias=True, batch_first=False, + dropout=(0, 0), bidirectional=False, initializer=None, **kwargs): + + super(VarMaskedRNNBase, self).__init__() + self.Cell = Cell + self.input_size = input_size + self.hidden_size = hidden_size + self.num_layers = num_layers + self.bias = bias + self.batch_first = batch_first + self.bidirectional = bidirectional + self.lstm = False + num_directions = 2 if bidirectional else 1 + + self.all_cells = [] + for layer in range(num_layers): + for direction in range(num_directions): + layer_input_size = input_size if layer == 0 else hidden_size * num_directions + + cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs) + self.all_cells.append(cell) + self.add_module('cell%d' % (layer * num_directions + direction), cell) + + def reset_parameters(self): + for cell in self.all_cells: + cell.reset_parameters() + + def reset_noise(self, batch_size): + for cell in self.all_cells: + cell.reset_noise(batch_size) + + def forward(self, input, mask=None, hx=None): + batch_size = input.size(0) if self.batch_first else input.size(1) + if hx is None: + num_directions = 2 if self.bidirectional else 1 + hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(), + requires_grad=True) + if self.lstm: + hx = (hx, hx) + + func = AutogradVarMaskedRNN(num_layers=self.num_layers, + batch_first=self.batch_first, + bidirectional=self.bidirectional, + lstm=self.lstm) + + self.reset_noise(batch_size) + + output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,))) + return output, hidden + + def step(self, input, hx=None, mask=None): + ''' + execute one step forward (only for one-directional RNN). + Args: + input (batch, input_size): input tensor of this step. + hx (num_layers, batch, hidden_size): the hidden state of last step. + mask (batch): the mask tensor of this step. + Returns: + output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. + hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step + ''' + assert not self.bidirectional, "step only cannot be applied to bidirectional RNN." + batch_size = input.size(0) + if hx is None: + hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True) + if self.lstm: + hx = (hx, hx) + + func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm) + + output, hidden = func(input, self.all_cells, hx, mask) + return output, hidden + + +class VarMaskedFastLSTM(VarMaskedRNNBase): + def __init__(self, *args, **kwargs): + super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs) + self.lstm = True + + +class VarRNNCellBase(nn.Module): + def __repr__(self): + s = '{name}({input_size}, {hidden_size}' + if 'bias' in self.__dict__ and self.bias is not True: + s += ', bias={bias}' + if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh": + s += ', nonlinearity={nonlinearity}' + s += ')' + return s.format(name=self.__class__.__name__, **self.__dict__) + + def reset_noise(self, batch_size): + """ + Should be overriden by all subclasses. 
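# Toy illustration, separate from the patch, of the "variational dropout" scheme that
# reset_noise() above and VarFastLSTMCell below implement: one Bernoulli mask is sampled per
# sequence (scaled by 1 / keep_prob) and then reused at every time step instead of being resampled.
import torch

batch, hidden, p = 4, 6, 0.5
noise = torch.empty(batch, hidden).bernoulli_(1 - p) / (1 - p)   # sampled once, before the sequence
h = torch.zeros(batch, hidden)
for t in range(3):                    # every step of the sequence reuses the same mask
    h = torch.tanh(h + 1.0) * noise
print(noise)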
+ Args: + batch_size: (int) batch size of input. + """ + raise NotImplementedError + + +class VarFastLSTMCell(VarRNNCellBase): + """ + A long short-term memory (LSTM) cell with variational dropout. + .. math:: + \begin{array}{ll} + i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\ + f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\ + g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\ + o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\ + c' = f * c + i * g \\ + h' = o * \tanh(c') \\ + \end{array} + """ + + def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None): + super(VarFastLSTMCell, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.bias = bias + self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size)) + self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size)) + if bias: + self.bias_ih = Parameter(torch.Tensor(4 * hidden_size)) + self.bias_hh = Parameter(torch.Tensor(4 * hidden_size)) + else: + self.register_parameter('bias_ih', None) + self.register_parameter('bias_hh', None) + + self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer + self.reset_parameters() + p_in, p_hidden = p + if p_in < 0 or p_in > 1: + raise ValueError("input dropout probability has to be between 0 and 1, " + "but got {}".format(p_in)) + if p_hidden < 0 or p_hidden > 1: + raise ValueError("hidden state dropout probability has to be between 0 and 1, " + "but got {}".format(p_hidden)) + self.p_in = p_in + self.p_hidden = p_hidden + self.noise_in = None + self.noise_hidden = None + + def reset_parameters(self): + for weight in self.parameters(): + if weight.dim() == 1: + weight.data.zero_() + else: + self.initializer(weight.data) + + def reset_noise(self, batch_size): + if self.training: + if self.p_in: + noise = self.weight_ih.data.new(batch_size, self.input_size) + self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in)) + else: + self.noise_in = None + + if self.p_hidden: + noise = self.weight_hh.data.new(batch_size, self.hidden_size) + self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden)) + else: + self.noise_hidden = None + else: + self.noise_in = None + self.noise_hidden = None + + def forward(self, input, hx): + return self.__forward( + input, hx, + self.weight_ih, self.weight_hh, + self.bias_ih, self.bias_hh, + self.noise_in, self.noise_hidden, + ) + + @staticmethod + def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None): + if noise_in is not None: + if input.is_cuda: + input = input * noise_in.cuda(input.get_device()) + else: + input = input * noise_in + + if input.is_cuda: + w_ih = w_ih.cuda(input.get_device()) + w_hh = w_hh.cuda(input.get_device()) + hidden = [h.cuda(input.get_device()) for h in hidden] + b_ih = b_ih.cuda(input.get_device()) + b_hh = b_hh.cuda(input.get_device()) + igates = F.linear(input, w_ih.cuda(input.get_device())) + hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \ + else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh) + state = fusedBackend.LSTMFused.apply + # print("use backend") + # use some magic function + return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh) + + hx, cx = hidden + if noise_hidden is not None: + hx = hx * noise_hidden + gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh) + + ingate, 
forgetgate, cellgate, outgate = gates.chunk(4, 1)
+
+        ingate = torch.sigmoid(ingate)
+        forgetgate = torch.sigmoid(forgetgate)
+        cellgate = torch.tanh(cellgate)
+        outgate = torch.sigmoid(outgate)
+
+        cy = (forgetgate * cx) + (ingate * cellgate)
+        hy = outgate * torch.tanh(cy)
+
+        return hy, cy
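# Hedged usage sketch for VarMaskedFastLSTM (the import path comes from this patch; sizes
# are invented, and the fused-backend import at the top of the file assumes the PyTorch
# 0.4.x series this code targets). dropout is a pair (p_input, p_hidden) applied as
# variational dropout inside every cell; the mask marks real tokens with 1 and padding with 0.
import torch
from fastNLP.modules.variational_rnn import VarMaskedFastLSTM

rnn = VarMaskedFastLSTM(input_size=10, hidden_size=16, num_layers=2,
                        batch_first=True, bidirectional=True, dropout=(0.2, 0.2))

x = torch.randn(4, 8, 10)          # (batch, seq_len, input_size)
mask = torch.ones(4, 8)            # no padding in this toy batch

output, (h_n, c_n) = rnn(x, mask=mask)
print(output.shape)                # torch.Size([4, 8, 32])
print(h_n.shape)                   # torch.Size([4, 4, 16])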