
add accuracy metric to the POS Tester;

optimize evaluation output in the Trainer;
keep the POS pipeline (loader + trainer + tester + saver) working;
add code borrowed from FudanParser.
tags/v0.1.0
FengZiYjun, 6 years ago
parent
commit 301bbdcd1e
20 changed files with 1781 additions and 26 deletions
  1. .idea/deployment.xml (+15, -0)
  2. .idea/fastNLP.iml (+14, -0)
  3. .idea/inspectionProfiles/Project_Default.xml (+16, -0)
  4. .idea/misc.xml (+7, -0)
  5. .idea/modules.xml (+8, -0)
  6. .idea/other.xml (+7, -0)
  7. .idea/vcs.xml (+6, -0)
  8. fastNLP/action/tester.py (+29, -5)
  9. fastNLP/action/trainer.py (+3, -2)
  10. fastNLP/loader/model_loader.py (+19, -0)
  11. fastNLP/models/sequence_modeling.py (+2, -2)
  12. fastNLP/modules/encoder/char_embedding.py (+87, -0)
  13. fastNLP/modules/encoder/masked_rnn.py (+422, -0)
  14. fastNLP/modules/encoder/variational_rnn.py (+384, -0)
  15. fastNLP/modules/other_modules.py (+489, -0)
  16. fastNLP/modules/utils.py (+228, -3)
  17. fastNLP/saver/base_saver.py (+0, -9)
  18. fastNLP/saver/model_saver.py (+11, -1)
  19. test/data_for_tests/config (+10, -1)
  20. test/test_POS_pipeline.py (+24, -3)

.idea/deployment.xml (+15, -0)

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData" autoUpload="Always" serverName="zyfeng@10.40.40.86:22">
<serverData>
<paths name="zyfeng@10.40.40.86:22">
<serverdata>
<mappings>
<mapping deploy="/home/zyfeng/Desktop/fastNLP" local="$PROJECT_DIR$" />
</mappings>
</serverdata>
</paths>
</serverData>
<option name="myAutoUpload" value="ALWAYS" />
</component>
</project>

.idea/fastNLP.iml (+14, -0)

@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Remote Python 3.6.6 (sftp://zyfeng@10.40.40.86:22/home/zyfeng/miniconda3/envs/fastnlp/bin/python3.6)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyDocumentationSettings">
<option name="renderExternalDocumentation" value="true" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>

.idea/inspectionProfiles/Project_Default.xml (+16, -0)

@@ -0,0 +1,16 @@
<component name="InspectionProjectProfileManager">
<profile version="1.0">
<option name="myName" value="Project Default" />
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
<option name="ignoredPackages">
<value>
<list size="3">
<item index="0" class="java.lang.String" itemvalue="torch" />
<item index="1" class="java.lang.String" itemvalue="numpy" />
<item index="2" class="java.lang.String" itemvalue="torchvision" />
</list>
</value>
</option>
</inspection_tool>
</profile>
</component>

.idea/misc.xml (+7, -0)

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.6.6 (sftp://zyfeng@10.40.40.86:22/home/zyfeng/miniconda3/envs/fastnlp/bin/python3.6)" project-jdk-type="Python SDK" />
</project>

.idea/modules.xml (+8, -0)

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/fastNLP.iml" filepath="$PROJECT_DIR$/.idea/fastNLP.iml" />
</modules>
</component>
</project>

.idea/other.xml (+7, -0)

@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PySciProjectComponent">
<option name="PY_SCI_VIEW" value="true" />
<option name="PY_SCI_VIEW_SUGGESTED" value="true" />
</component>
</project>

.idea/vcs.xml (+6, -0)

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>

fastNLP/action/tester.py (+29, -5)

@@ -13,7 +13,7 @@ class BaseTester(Action):

def __init__(self, test_args):
"""
:param test_args: named tuple
:param test_args: a dict-like object supporting __getitem__, accessed as test_args["key_str"]
"""
super(BaseTester, self).__init__()
self.validate_in_training = test_args["validate_in_training"]
@@ -28,6 +28,7 @@ class BaseTester(Action):

self.model = None
self.eval_history = []
self.batch_output = []

def test(self, network):
# print("--------------testing----------------")
@@ -40,7 +41,6 @@ class BaseTester(Action):

self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))

batch_output = list()
num_iter = len(dev_data) // self.batch_size

for step in range(num_iter):
@@ -50,7 +50,7 @@ class BaseTester(Action):
eval_results = self.evaluate(prediction, batch_y)

if self.save_output:
batch_output.append(prediction)
self.batch_output.append(prediction)
if self.save_loss:
self.eval_history.append(eval_results)

@@ -118,6 +118,13 @@ class BaseTester(Action):
model.train()
self.eval_history.clear()

def show_matrices(self):
"""
This is called by the Trainer to print evaluation results on the dev set.
:return print_str: str
"""
raise NotImplementedError


class POSTester(BaseTester):
"""
@@ -125,6 +132,9 @@ class POSTester(BaseTester):
"""

def __init__(self, test_args):
"""
:param test_args: a dict-like object supporting __getitem__, accessed as test_args["key_str"]
"""
super(POSTester, self).__init__(test_args)
self.max_len = None
self.mask = None
@@ -148,7 +158,21 @@ class POSTester(BaseTester):
def evaluate(self, predict, truth):
truth = torch.Tensor(truth)
loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len)
return loss.data
results = torch.Tensor(prediction[0][0]).view((-1, ))
accuracy = float(torch.sum(results == truth.view((-1, )))) / results.shape[0]
return [loss.data, accuracy]

def matrices(self):
return np.mean(self.eval_history)
batch_loss = np.mean([x[0] for x in self.eval_history])
batch_accuracy = np.mean([x[1] for x in self.eval_history])
return batch_loss, batch_accuracy

def show_matrices(self):
"""
This is called by the Trainer to print evaluation results on the dev set.
:return print_str: str
"""
loss, accuracy = self.matrices()
return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)



fastNLP/action/trainer.py (+3, -2)

@@ -24,7 +24,7 @@ class BaseTrainer(Action):

def __init__(self, train_args):
"""
:param train_args: dict of (key, value)
:param train_args: a dict of (key, value) pairs, or a dict-like object; keys are strings.

The base trainer requires the following keys:
- epochs: int, the number of epochs in training
@@ -89,7 +89,8 @@ class BaseTrainer(Action):
if data_dev is None:
raise RuntimeError("No validation data provided.")
validator.test(network)
print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices()))
print("[epoch {}]".format(epoch), end=" ")
print(validator.show_matrices())

# finish training
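With show_matrices returning a formatted string, the two prints above compose into one log line per epoch. A tiny illustration with hypothetical numbers:

epoch, loss, accuracy = 3, 12.34, 0.87
print("[epoch {}]".format(epoch), end=" ")
print("dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy))
# -> [epoch 3] dev loss=12.34, accuracy=0.87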



fastNLP/loader/model_loader.py (+19, -0)

@@ -0,0 +1,19 @@
import torch

from fastNLP.loader.base_loader import BaseLoader


class ModelLoader(BaseLoader):
"""
Loader for models.
"""

def __init__(self, data_name, data_path):
super(ModelLoader, self).__init__(data_name, data_path)

def load_pytorch(self, empty_model):
"""
Load model parameters from .pkl files into the empty PyTorch model.
:param empty_model: a PyTorch model with initialized parameters.
"""
empty_model.load_state_dict(torch.load(self.data_path))
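Together with ModelSaver.save_pytorch added further down in this commit, this gives a simple state-dict round trip. A minimal sketch, assuming `model` is a trained network and `empty_model` is a freshly constructed instance of the same class (the data_name argument is only a label):

from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader

ModelSaver("./saved_model.pkl").save_pytorch(model)                       # write the state dict to disk
ModelLoader("pos_model", "./saved_model.pkl").load_pytorch(empty_model)   # restore it into the new instance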

fastNLP/models/sequence_modeling.py (+2, -2)

@@ -75,8 +75,8 @@ class SeqLabeling(BaseModel):
:param mask: ByteTensor, [batch_size, max_len]
:param batch_size: int
:param max_len: int
:return loss:
prediction:
:return loss: a scalar Tensor
prediction: list of tuple of (decode path(list), best score)
"""
x = x.float()
y = y.long()


fastNLP/modules/encoder/char_embedding.py (+87, -0)

@@ -0,0 +1,87 @@
import torch
import torch.nn.functional as F
from torch import nn


class ConvCharEmbedding(nn.Module):

def __init__(self, char_emb_size, feature_maps=(40, 30, 30), kernels=(3, 4, 5)):
"""
Character Level Word Embedding
:param char_emb_size: the size of the character-level embedding; e.g. if each
character is embedded into a 50-dim vector, char_emb_size is 50.
:param feature_maps: table of feature maps (for each kernel width)
:param kernels: table of kernel widths
"""
super(ConvCharEmbedding, self).__init__()
self.convs = nn.ModuleList([
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
for i in range(len(kernels))])

def forward(self, x):
"""
:param x: [batch_size * sent_length, word_length, char_emb_size]
:return: [batch_size * sent_length, sum(feature_maps), 1]
"""
x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) # [batch_size*sent_length, channel, width, height]
x = x.transpose(2, 3) # [batch_size*sent_length, channel, height, width]
return self.convolute(x).unsqueeze(2)

def convolute(self, x):
feats = []
for conv in self.convs:
y = conv(x) # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1]
y = torch.squeeze(y, 2) # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1]
y = F.tanh(y)
y, __ = torch.max(y, 2) # [batch_size*sent_length, feature_maps[i]]
feats.append(y)
return torch.cat(feats, 1) # [batch_size*sent_length, sum(feature_maps)]


class LSTMCharEmbedding(nn.Module):
"""
Character Level Word Embedding with LSTM
:param char_emb_size: the size of the character-level embedding; e.g. if each
character is embedded into a 50-dim vector, char_emb_size is 50.
"""

def __init__(self, char_emb_size, hidden_size=None):
super(LSTMCharEmbedding, self).__init__()
self.hidden_size = char_emb_size if hidden_size is None else hidden_size

self.lstm = nn.LSTM(input_size=char_emb_size,
hidden_size=self.hidden_size,
num_layers=1,
bias=True,
batch_first=True)

def forward(self, x):
"""
:param x:[ n_batch*n_word, word_length, char_emb_size]
:return: [ n_batch*n_word, char_emb_size]
"""
batch_size = x.shape[0]
h0 = torch.empty(1, batch_size, self.hidden_size)
h0 = nn.init.orthogonal_(h0)
c0 = torch.empty(1, batch_size, self.hidden_size)
c0 = nn.init.orthogonal_(c0)

_, hidden = self.lstm(x, (h0, c0))
return hidden[0].squeeze().unsqueeze(2)


if __name__ == "__main__":
batch_size = 128
char_emb = 100
word_length = 1
x = torch.Tensor(batch_size, char_emb, word_length)
x = x.transpose(1, 2)
cce = ConvCharEmbedding(char_emb)
y = cce(x)
print("CNN Char Emb input: ", x.shape)
print("CNN Char Emb output: ", y.shape) # [128, 100]

lce = LSTMCharEmbedding(char_emb)
o = lce(x)
print("LSTM Char Emb input: ", x.shape)
print("LSTM Char Emb size: ", o.shape)

fastNLP/modules/encoder/masked_rnn.py (+422, -0)

@@ -0,0 +1,422 @@
__author__ = 'max'

import torch
import torch.nn as nn
import torch.nn.functional as F


def MaskedRecurrent(reverse=False):
def forward(input, hidden, cell, mask, train=True, dropout=0):
"""
:param input:
:param hidden:
:param cell:
:param mask:
:param dropout: dropout applied between steps; masked positions are dropped too, which should be fine since they carry no gradient
:param train: controls dropout behaviour; set by StackedRNN's forward
:return:
"""
output = []
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
for i in steps:
if mask is None or mask[i].data.min() > 0.5:  # no mask, or every position is 1
hidden = cell(input[i], hidden)
elif mask[i].data.max() > 0.5:  # some positions are masked, but not all
hidden_next = cell(input[i], hidden)  # feed one whole batch per step
# hack to handle LSTM
if isinstance(hidden, tuple):  # LSTM cells return a (hidden, cell) tuple
mask = mask.float()
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (
hx + (hp1 - hx) * mask[i],
cx + (cp1 - cx) * mask[i])  # masked positions keep their previous state
else:
hidden = hidden + (hidden_next - hidden) * mask[i]

# if dropout != 0 and train: # warning, should i treat masked tensor differently?
# if isinstance(hidden, tuple):
# hidden = (F.dropout(hidden[0], p=dropout, training=train),
# F.dropout(hidden[1], p=dropout, training=train))
# else:
# hidden = F.dropout(hidden, p=dropout, training=train)

# hack to handle LSTM
output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

if reverse:
output.reverse()
output = torch.cat(output, 0).view(input.size(0), *output[0].size())

return hidden, output

return forward


def StackedRNN(inners, num_layers, lstm=False, train=True, step_dropout=0, layer_dropout=0):
num_directions = len(inners) # rec_factory!
total_layers = num_layers * num_directions

def forward(input, hidden, cells, mask):
assert (len(cells) == total_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for i in range(num_layers):
all_output = []
for j, inner in enumerate(inners):
l = i * num_directions + j
hy, output = inner(input, hidden[l], cells[l], mask, step_dropout, train)
next_hidden.append(hy)
all_output.append(output)

input = torch.cat(all_output, input.dim() - 1)  # becomes the input of the next layer

if layer_dropout != 0 and i < num_layers - 1:
input = F.dropout(input, p=layer_dropout, training=train, inplace=False)

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradMaskedRNN(num_layers=1, batch_first=False, train=True, layer_dropout=0, step_dropout=0,
bidirectional=False, lstm=False):
rec_factory = MaskedRecurrent

if bidirectional:
layer = (rec_factory(), rec_factory(reverse=True))
else:
layer = (rec_factory(),)  # rec_factory builds the per-layer recurrence; MaskedRecurrent does the per-step computation and StackedRNN stacks the layers

func = StackedRNN(layer,
num_layers,
lstm=lstm,
layer_dropout=layer_dropout, step_dropout=step_dropout,
train=train)

def forward(input, cells, hidden, mask):
if batch_first:
input = input.transpose(0, 1)
if mask is not None:
mask = mask.transpose(0, 1)

nexth, output = func(input, hidden, cells, mask)

if batch_first:
output = output.transpose(0, 1)

return output, nexth

return forward


def MaskedStep():
def forward(input, hidden, cell, mask):
if mask is None or mask.data.min() > 0.5:
hidden = cell(input, hidden)
elif mask.data.max() > 0.5:
hidden_next = cell(input, hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
else:
hidden = hidden + (hidden_next - hidden) * mask
# hack to handle LSTM
output = hidden[0] if isinstance(hidden, tuple) else hidden

return hidden, output

return forward


def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
def forward(input, hidden, cells, mask):
assert (len(cells) == num_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for l in range(num_layers):
hy, output = layer(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
input = output

if dropout != 0 and l < num_layers - 1:
input = F.dropout(input, p=dropout, training=train, inplace=False)

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False):
layer = MaskedStep()

func = StackedStep(layer,
num_layers,
lstm=lstm,
dropout=dropout,
train=train)

def forward(input, cells, hidden, mask):
nexth, output = func(input, hidden, cells, mask)
return output, nexth

return forward


class MaskedRNNBase(nn.Module):
def __init__(self, Cell, input_size, hidden_size,
num_layers=1, bias=True, batch_first=False,
layer_dropout=0, step_dropout=0, bidirectional=False, **kwargs):
"""
:param Cell:
:param input_size:
:param hidden_size:
:param num_layers:
:param bias:
:param batch_first:
:param layer_dropout:
:param step_dropout:
:param bidirectional:
:param kwargs:
"""

super(MaskedRNNBase, self).__init__()
self.Cell = Cell
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bias = bias
self.batch_first = batch_first
self.layer_dropout = layer_dropout
self.step_dropout = step_dropout
self.bidirectional = bidirectional
num_directions = 2 if bidirectional else 1

self.all_cells = []
for layer in range(num_layers):  # initialize all cells
for direction in range(num_directions):
layer_input_size = input_size if layer == 0 else hidden_size * num_directions

cell = self.Cell(layer_input_size, hidden_size, self.bias, **kwargs)
self.all_cells.append(cell)
self.add_module('cell%d' % (layer * num_directions + direction), cell)  # register each cell as a named submodule

def reset_parameters(self):
for cell in self.all_cells:
cell.reset_parameters()

def forward(self, input, mask=None, hx=None):
batch_size = input.size(0) if self.batch_first else input.size(1)
lstm = self.Cell is nn.LSTMCell
if hx is None:
num_directions = 2 if self.bidirectional else 1
hx = torch.autograd.Variable(
input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_())
if lstm:
hx = (hx, hx)

func = AutogradMaskedRNN(num_layers=self.num_layers,
batch_first=self.batch_first,
step_dropout=self.step_dropout,
layer_dropout=self.layer_dropout,
train=self.training,
bidirectional=self.bidirectional,
lstm=lstm)  # all_cells are passed in and the call keeps wrapping down to the lower-level functions

output, hidden = func(input, self.all_cells, hx,
None if mask is None else mask.view(mask.size() + (1,)))  # the + (1,) appends a trailing singleton dim so the mask broadcasts over hidden units
return output, hidden

def step(self, input, hx=None, mask=None):
'''
execute one step forward (only for one-directional RNN).
Args:
input (batch, input_size): input tensor of this step.
hx (num_layers, batch, hidden_size): the hidden state of last step.
mask (batch): the mask tensor of this step.
Returns:
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN.
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
'''
assert not self.bidirectional, "step cannot be applied to a bidirectional RNN."
batch_size = input.size(0)
lstm = self.Cell is nn.LSTMCell
if hx is None:
hx = torch.autograd.Variable(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_())
if lstm:
hx = (hx, hx)

func = AutogradMaskedStep(num_layers=self.num_layers,
dropout=self.dropout,
train=self.training,
lstm=lstm)

output, hidden = func(input, self.all_cells, hx, mask)
return output, hidden


class MaskedRNN(MaskedRNNBase):
r"""Applies a multi-layer Elman RNN with costomized non-linearity to an
input sequence.
For each element in the input sequence, each layer computes the following
function:
.. math::
h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})
where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is
the hidden state of the previous layer at time `t` or :math:`input_t`
for the first layer. If nonlinearity='relu', then `ReLU` is used instead
of `tanh`.
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False
Inputs: input, mask, h_0
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence.
**mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.
Outputs: output, h_n
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features (h_k) from the last layer of the RNN,
for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
been given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for k=seq_len.
"""

def __init__(self, *args, **kwargs):
super(MaskedRNN, self).__init__(nn.RNNCell, *args, **kwargs)


class MaskedLSTM(MaskedRNNBase):
r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
sequence.
For each element in the input sequence, each layer computes the following
function:
.. math::
\begin{array}{ll}
i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\
o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t = f_t * c_{(t-1)} + i_t * g_t \\
h_t = o_t * \tanh(c_t)
\end{array}
where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
state at time `t`, :math:`x_t` is the hidden state of the previous layer at
time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
:math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
and out gates, respectively.
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False
Inputs: input, mask, (h_0, c_0)
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence.
**mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers \* num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.
- **c_0** (num_layers \* num_directions, batch, hidden_size): tensor
containing the initial cell state for each element in the batch.
Outputs: output, (h_n, c_n)
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features `(h_t)` from the last layer of the RNN,
for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for t=seq_len
- **c_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the cell state for t=seq_len
"""

def __init__(self, *args, **kwargs):
super(MaskedLSTM, self).__init__(nn.LSTMCell, *args, **kwargs)


class MaskedGRU(MaskedRNNBase):
r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
For each element in the input sequence, each layer computes the following
function:
.. math::
\begin{array}{ll}
r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\
h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\
\end{array}
where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
state of the previous layer at time `t` or :math:`input_t` for the first
layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input,
and new gates, respectively.
Args:
input_size: The number of expected features in the input x
hidden_size: The number of features in the hidden state h
num_layers: Number of recurrent layers.
nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
bias: If False, then the layer does not use bias weights b_ih and b_hh.
Default: True
batch_first: If True, then the input and output tensors are provided
as (batch, seq, feature)
dropout: If non-zero, introduces a dropout layer on the outputs of each
RNN layer except the last layer
bidirectional: If True, becomes a bidirectional RNN. Default: False
Inputs: input, mask, h_0
- **input** (seq_len, batch, input_size): tensor containing the features
of the input sequence.
**mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
- **h_0** (num_layers * num_directions, batch, hidden_size): tensor
containing the initial hidden state for each element in the batch.
Outputs: output, h_n
- **output** (seq_len, batch, hidden_size * num_directions): tensor
containing the output features (h_k) from the last layer of the RNN,
for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
been given as the input, the output will also be a packed sequence.
- **h_n** (num_layers * num_directions, batch, hidden_size): tensor
containing the hidden state for k=seq_len.
"""

def __init__(self, *args, **kwargs):
super(MaskedGRU, self).__init__(nn.GRUCell, *args, **kwargs)
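A minimal smoke-test sketch of the masked recurrence (shapes and names are illustrative; the mask is a 0/1 float tensor in (seq_len, batch) layout marking real tokens):

import torch
from fastNLP.modules.encoder.masked_rnn import MaskedGRU

seq_len, batch, input_size, hidden_size = 5, 2, 8, 16
rnn = MaskedGRU(input_size, hidden_size, num_layers=1, batch_first=False)

x = torch.randn(seq_len, batch, input_size)
mask = torch.ones(seq_len, batch)   # 1 = real token, 0 = padding
mask[3:, 1] = 0                     # the second sequence only has 3 real steps

output, hn = rnn(x, mask=mask)
print(output.shape)                 # (seq_len, batch, hidden_size)
print(hn.shape)                     # (num_layers, batch, hidden_size)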

fastNLP/modules/encoder/variational_rnn.py (+384, -0)

@@ -0,0 +1,384 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
from torch.nn.parameter import Parameter


def default_initializer(hidden_size):
stdv = 1.0 / math.sqrt(hidden_size)

def forward(tensor):
nn.init.uniform_(tensor, -stdv, stdv)

return forward


def VarMaskedRecurrent(reverse=False):
def forward(input, hidden, cell, mask):
output = []
steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
for i in steps:
if mask is None or mask[i].data.min() > 0.5:
hidden = cell(input[i], hidden)
elif mask[i].data.max() > 0.5:
hidden_next = cell(input[i], hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
else:
hidden = hidden + (hidden_next - hidden) * mask[i]
# hack to handle LSTM
output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

if reverse:
output.reverse()
output = torch.cat(output, 0).view(input.size(0), *output[0].size())

return hidden, output

return forward


def StackedRNN(inners, num_layers, lstm=False):
num_directions = len(inners)
total_layers = num_layers * num_directions

def forward(input, hidden, cells, mask):
assert (len(cells) == total_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for i in range(num_layers):
all_output = []
for j, inner in enumerate(inners):
l = i * num_directions + j
hy, output = inner(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
all_output.append(output)

input = torch.cat(all_output, input.dim() - 1)

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
rec_factory = VarMaskedRecurrent

if bidirectional:
layer = (rec_factory(), rec_factory(reverse=True))
else:
layer = (rec_factory(),)

func = StackedRNN(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
if batch_first:
input = input.transpose(0, 1)
if mask is not None:
mask = mask.transpose(0, 1)

nexth, output = func(input, hidden, cells, mask)

if batch_first:
output = output.transpose(0, 1)

return output, nexth

return forward


def VarMaskedStep():
def forward(input, hidden, cell, mask):
if mask is None or mask.data.min() > 0.5:
hidden = cell(input, hidden)
elif mask.data.max() > 0.5:
hidden_next = cell(input, hidden)
# hack to handle LSTM
if isinstance(hidden, tuple):
hx, cx = hidden
hp1, cp1 = hidden_next
hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
else:
hidden = hidden + (hidden_next - hidden) * mask
# hack to handle LSTM
output = hidden[0] if isinstance(hidden, tuple) else hidden

return hidden, output

return forward


def StackedStep(layer, num_layers, lstm=False):
def forward(input, hidden, cells, mask):
assert (len(cells) == num_layers)
next_hidden = []

if lstm:
hidden = list(zip(*hidden))

for l in range(num_layers):
hy, output = layer(input, hidden[l], cells[l], mask)
next_hidden.append(hy)
input = output

if lstm:
next_h, next_c = zip(*next_hidden)
next_hidden = (
torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
)
else:
next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

return next_hidden, input

return forward


def AutogradVarMaskedStep(num_layers=1, lstm=False):
layer = VarMaskedStep()

func = StackedStep(layer,
num_layers,
lstm=lstm)

def forward(input, cells, hidden, mask):
nexth, output = func(input, hidden, cells, mask)
return output, nexth

return forward


class VarMaskedRNNBase(nn.Module):
def __init__(self, Cell, input_size, hidden_size,
num_layers=1, bias=True, batch_first=False,
dropout=(0, 0), bidirectional=False, initializer=None, **kwargs):

super(VarMaskedRNNBase, self).__init__()
self.Cell = Cell
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.bias = bias
self.batch_first = batch_first
self.bidirectional = bidirectional
self.lstm = False
num_directions = 2 if bidirectional else 1

self.all_cells = []
for layer in range(num_layers):
for direction in range(num_directions):
layer_input_size = input_size if layer == 0 else hidden_size * num_directions

cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs)
self.all_cells.append(cell)
self.add_module('cell%d' % (layer * num_directions + direction), cell)

def reset_parameters(self):
for cell in self.all_cells:
cell.reset_parameters()

def reset_noise(self, batch_size):
for cell in self.all_cells:
cell.reset_noise(batch_size)

def forward(self, input, mask=None, hx=None):
batch_size = input.size(0) if self.batch_first else input.size(1)
if hx is None:
num_directions = 2 if self.bidirectional else 1
hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(),
requires_grad=True)
if self.lstm:
hx = (hx, hx)

func = AutogradVarMaskedRNN(num_layers=self.num_layers,
batch_first=self.batch_first,
bidirectional=self.bidirectional,
lstm=self.lstm)

self.reset_noise(batch_size)

output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,)))
return output, hidden

def step(self, input, hx=None, mask=None):
'''
execute one step forward (only for one-directional RNN).
Args:
input (batch, input_size): input tensor of this step.
hx (num_layers, batch, hidden_size): the hidden state of last step.
mask (batch): the mask tensor of this step.
Returns:
output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN.
hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
'''
assert not self.bidirectional, "step cannot be applied to a bidirectional RNN."
batch_size = input.size(0)
if hx is None:
hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True)
if self.lstm:
hx = (hx, hx)

func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm)

output, hidden = func(input, self.all_cells, hx, mask)
return output, hidden


class VarMaskedFastLSTM(VarMaskedRNNBase):
def __init__(self, *args, **kwargs):
super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs)
self.lstm = True


class VarRNNCellBase(nn.Module):
def __repr__(self):
s = '{name}({input_size}, {hidden_size}'
if 'bias' in self.__dict__ and self.bias is not True:
s += ', bias={bias}'
if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
s += ', nonlinearity={nonlinearity}'
s += ')'
return s.format(name=self.__class__.__name__, **self.__dict__)

def reset_noise(self, batch_size):
"""
Should be overridden by all subclasses.
Args:
batch_size: (int) batch size of input.
"""
raise NotImplementedError


class VarFastLSTMCell(VarRNNCellBase):
"""
A long short-term memory (LSTM) cell with variational dropout.
.. math::
\begin{array}{ll}
i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\
o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
c' = f * c + i * g \\
h' = o * \tanh(c') \\
\end{array}
"""

def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None):
super(VarFastLSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.bias = bias
self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
if bias:
self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
else:
self.register_parameter('bias_ih', None)
self.register_parameter('bias_hh', None)

self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer
self.reset_parameters()
p_in, p_hidden = p
if p_in < 0 or p_in > 1:
raise ValueError("input dropout probability has to be between 0 and 1, "
"but got {}".format(p_in))
if p_hidden < 0 or p_hidden > 1:
raise ValueError("hidden state dropout probability has to be between 0 and 1, "
"but got {}".format(p_hidden))
self.p_in = p_in
self.p_hidden = p_hidden
self.noise_in = None
self.noise_hidden = None

def reset_parameters(self):
for weight in self.parameters():
if weight.dim() == 1:
weight.data.zero_()
else:
self.initializer(weight.data)

def reset_noise(self, batch_size):
if self.training:
if self.p_in:
noise = self.weight_ih.data.new(batch_size, self.input_size)
self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in))
else:
self.noise_in = None

if self.p_hidden:
noise = self.weight_hh.data.new(batch_size, self.hidden_size)
self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden))
else:
self.noise_hidden = None
else:
self.noise_in = None
self.noise_hidden = None

def forward(self, input, hx):
return self.__forward(
input, hx,
self.weight_ih, self.weight_hh,
self.bias_ih, self.bias_hh,
self.noise_in, self.noise_hidden,
)

@staticmethod
def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
if noise_in is not None:
if input.is_cuda:
input = input * noise_in.cuda(input.get_device())
else:
input = input * noise_in

if input.is_cuda:
w_ih = w_ih.cuda(input.get_device())
w_hh = w_hh.cuda(input.get_device())
hidden = [h.cuda(input.get_device()) for h in hidden]
b_ih = b_ih.cuda(input.get_device())
b_hh = b_hh.cuda(input.get_device())
igates = F.linear(input, w_ih.cuda(input.get_device()))
hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \
else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh)
state = fusedBackend.LSTMFused.apply
# print("use backend")
# use some magic function
return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)

hx, cx = hidden
if noise_hidden is not None:
hx = hx * noise_hidden
gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

ingate = F.sigmoid(ingate)
forgetgate = F.sigmoid(forgetgate)
cellgate = F.tanh(cellgate)
outgate = F.sigmoid(outgate)

cy = (forgetgate * cx) + (ingate * cellgate)
hy = outgate * F.tanh(cy)

return hy, cy
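A minimal sketch of the variational-dropout LSTM with a padding mask, assuming the PyTorch version this commit targets (the module-level fusedBackend import is version-specific); dropout rates and shapes are illustrative:

import torch
from fastNLP.modules.encoder.variational_rnn import VarMaskedFastLSTM

seq_len, batch, input_size, hidden_size = 6, 2, 10, 20
lstm = VarMaskedFastLSTM(input_size, hidden_size, num_layers=1, dropout=(0.2, 0.2))  # (input, hidden) dropout shared over time

x = torch.randn(seq_len, batch, input_size)
mask = torch.ones(seq_len, batch)
mask[4:, 1] = 0                      # the second sentence is padded after step 4

output, (hn, cn) = lstm(x, mask=mask)
print(output.shape)                  # (seq_len, batch, hidden_size)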

fastNLP/modules/other_modules.py (+489, -0)

@@ -0,0 +1,489 @@
"""
This is borrowed from FudanParser. Not stable. Do not use !!!

"""
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from .utils import orthogonal
import torch
import torch.utils.data
import numpy
from torch.autograd import Function, Variable
from torch import optim


class GroupNorm(nn.Module):
def __init__(self, num_features, num_groups=20, eps=1e-5):
super(GroupNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(1, num_features, 1))
self.bias = nn.Parameter(torch.zeros(1, num_features, 1))
self.num_groups = num_groups
self.eps = eps

def forward(self, x):
N, C, H = x.size()
G = self.num_groups
assert C % G == 0

x = x.view(N, G, -1)
mean = x.mean(-1, keepdim=True)
var = x.var(-1, keepdim=True)

x = (x - mean) / (var + self.eps).sqrt()
x = x.view(N, C, H)
return x * self.weight + self.bias


class LayerNormalization(nn.Module):
""" Layer normalization module """

def __init__(self, d_hid, eps=1e-3):
super(LayerNormalization, self).__init__()

self.eps = eps
self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

def forward(self, z):
if z.size(1) == 1:
return z

mu = torch.mean(z, keepdim=True, dim=-1)
sigma = torch.std(z, keepdim=True, dim=-1)
ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps)
ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out)

return ln_out


class OrthEmbedding(nn.Embedding):
def __init__(self, *args, **kwargs):
super(OrthEmbedding, self).__init__(*args, **kwargs)

def reset_parameters(self):
self.weight = orthogonal(self.weight)
nn.init.constant_(self.bias, 0.)


class BiLinear(nn.Module):
def __init__(self, n_left, n_right, n_out, bias=True):
"""
Args:
n_left: size of left input
n_right: size of right input
n_out: size of output
bias: If set to False, the layer will not learn an additive bias.
Default: True
"""
super(BiLinear, self).__init__()
self.n_left = n_left
self.n_right = n_right
self.n_out = n_out

self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right))
self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left))
self.W_r = Parameter(torch.Tensor(self.n_out, self.n_right))

if bias:
self.bias = Parameter(torch.Tensor(n_out))
else:
self.register_parameter('bias', None)

self.reset_parameters()

def reset_parameters(self):
nn.init.xavier_uniform_(self.W_l)
nn.init.xavier_uniform_(self.W_r)
nn.init.constant_(self.bias, 0.)
nn.init.xavier_uniform_(self.U)

def forward(self, input_left, input_right):
"""
Args:
input_left: Tensor
the left input tensor with shape = [batch1, batch2, ..., left_features]
input_right: Tensor
the right input tensor with shape = [batch1, batch2, ..., right_features]
Returns:
"""
left_size = input_left.size()
right_size = input_right.size()
assert left_size[:-1] == right_size[:-1], \
"batch size of left and right inputs mis-match: (%s, %s)" % (left_size[:-1], right_size[:-1])
batch = int(np.prod(left_size[:-1]))

# convert left and right input to matrices [batch, left_features], [batch, right_features]
input_left = input_left.view(batch, self.n_left)
input_right = input_right.view(batch, self.n_right)

# output [batch, out_features]
output = F.bilinear(input_left, input_right, self.U, self.bias)
output = output + \
F.linear(input_left, self.W_l, None) + \
F.linear(input_right, self.W_r, None)
# convert back to [batch1, batch2, ..., out_features]
return output.view(left_size[:-1] + (self.n_out,))

def __repr__(self):
return self.__class__.__name__ + ' (' \
+ 'in1_features=' + str(self.n_left) \
+ ', in2_features=' + str(self.n_right) \
+ ', out_features=' + str(self.n_out) + ')'


class BiAffine(nn.Module):
def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs):
"""
Args:
n_enc: int
the dimension of the encoder input.
n_dec: int
the dimension of the decoder input.
n_labels: int
the number of labels of the crf layer
biaffine: bool
if apply bi-affine parameter.
**kwargs:
"""
super(BiAffine, self).__init__()
self.n_enc = n_enc
self.n_dec = n_dec
self.num_labels = n_labels
self.biaffine = biaffine

self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec))
self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc))
self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
if self.biaffine:
self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc))
else:
self.register_parameter('U', None)

self.reset_parameters()

def reset_parameters(self):
nn.init.xavier_uniform_(self.W_d)
nn.init.xavier_uniform_(self.W_e)
nn.init.constant_(self.b, 0.)
if self.biaffine:
nn.init.xavier_uniform_(self.U)

def forward(self, input_d, input_e, mask_d=None, mask_e=None):
"""
Args:
input_d: Tensor
the decoder input tensor with shape = [batch, length_decoder, input_size]
input_e: Tensor
the child input tensor with shape = [batch, length_encoder, input_size]
mask_d: Tensor or None
the mask tensor for decoder with shape = [batch, length_decoder]
mask_e: Tensor or None
the mask tensor for encoder with shape = [batch, length_encoder]
Returns: Tensor
the energy tensor with shape = [batch, num_label, length, length]
"""
assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
batch, length_decoder, _ = input_d.size()
_, length_encoder, _ = input_e.size()

# compute decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
# the output shape is [batch, num_label, length_decoder]
out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
# compute decoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
# the output shape is [batch, num_label, length_encoder]
out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)

# output shape [batch, num_label, length_decoder, length_encoder]
if self.biaffine:
# compute bi-affine part
# [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
# output shape [batch, num_label, length_decoder, input_size_encoder]
output = torch.matmul(input_d.unsqueeze(1), self.U)
# [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
# output shape [batch, num_label, length_decoder, length_encoder]
output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))

output = output + out_d + out_e + self.b
else:
output = out_d + out_e + self.b

if mask_d is not None:
output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2)

return output


class Transpose(nn.Module):
def __init__(self, x, y):
super(Transpose, self).__init__()
self.x = x
self.y = y

def forward(self, x):
return x.transpose(self.x, self.y)


class WordDropout(nn.Module):
def __init__(self, dropout_rate, drop_to_token):
super(WordDropout, self).__init__()
self.dropout_rate = dropout_rate
self.drop_to_token = drop_to_token

def forward(self, word_idx):
if not self.training:
return word_idx
drop_mask = torch.rand(word_idx.shape) < self.dropout_rate
if word_idx.device.type == 'cuda':
drop_mask = drop_mask.cuda()
drop_mask = drop_mask.long()
output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx
return output


class WlossLayer(torch.nn.Module):
def __init__(self, lam=100, sinkhorn_iter=50):
super(WlossLayer, self).__init__()

# cost = matrix M = distance matrix
# lam = lambda of type float > 0
# sinkhorn_iter > 0
# diagonal cost should be 0
self.lam = lam
self.sinkhorn_iter = sinkhorn_iter
# self.register_buffer("K", torch.exp(-self.cost / self.lam).double())
# self.register_buffer("KM", (self.cost * self.K).double())

def forward(self, pred, target, cost):
return WassersteinLossStab.apply(pred, target,
cost, self.lam, self.sinkhorn_iter)


class WassersteinLossStab(Function):
@staticmethod
def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4):
"""pred: Batch * K: K = # mass points
target: Batch * L: L = # mass points"""
# import pdb
# pdb.set_trace()
eps = 1e-8

# pred = pred.gather(dim=1, index=)
na = pred.size(1)
nb = target.size(1)

cost = cost.double()
pred = pred.double()
target = target.double()

cost = cost[:na, :nb].double()
K = torch.exp(-cost / lam).double()
KM = (cost * K).double()

batch_size = pred.size(0)

# pdb.set_trace()
log_a, log_b = torch.log(pred + eps), torch.log(target + eps)
log_u = cost.new(batch_size, na).fill_(-numpy.log(na))
log_v = cost.new(batch_size, nb).fill_(-numpy.log(nb))
# import pdb
# pdb.set_trace()
for i in range(int(sinkhorn_iter)):
log_u_max = torch.max(log_u, dim=1)[0]
u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps)
log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1)
log_v_max = torch.max(log_v, dim=1)[0]
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
tmp = log_u
log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1)
# print(log_u.sum())
if torch.norm(tmp - log_u) / torch.norm(log_u) < eps:
break

log_v_max = torch.max(log_v, dim=1)[0]
v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1)
wnorm = torch.exp(log_u + logcostpart1).mean(0).sum() # sum(1) for per item pair loss...
grad_input = log_u * lam
# print("log_u", log_u)
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
grad_input = grad_input / batch_size

ctx.save_for_backward(grad_input)
# print("grad type", type(grad_input))

return pred.new((wnorm,)), grad_input

@staticmethod
def backward(ctx, grad_output, _):
grad_input = ctx.saved_variables
# print(grad)
res = grad_output.clone()
res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data)
res = res.mul_(grad_output[0]).float()
# print("in backward func:\n\n", res)
return res, None, None, None, None


class Sinkhorn(Function):
def __init__(self):
super(Sinkhorn, self).__init__()

def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stopThr, print_period=10, log=False, verbose=False):
a = a.double()
b = b.double()
M = M.double()

nbb = b.size(1)

# init data
na = len(a)
nb = len(b)

cpt = 0

# we assume that no distances are null except those of the diagonal of
# distances
if warmstart is None:
alpha, beta = np.zeros(na), np.zeros(nb)
else:
alpha, beta = warmstart

if nbb:
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
u, v = np.ones(na) / na, np.ones(nb) / nb

def get_K(alpha, beta):
"""log space computation"""
return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)

def get_Gamma(alpha, beta, u, v):
"""log space gamma computation"""
return np.exp(
-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log(
v.reshape((1, nb))))

# print(np.min(K))

K = get_K(alpha, beta)
transp = K
cpt = 0
err = 1
loop = True
while loop:

uprev = u
vprev = v

# sinkhorn update
v = b / (np.dot(K.T, u) + 1e-16)
u = a / (np.dot(K, v) + 1e-16)

# remove numerical problems and store them in K
if np.abs(u).max() > tau or np.abs(v).max() > tau:
if nbb:
alpha, beta = alpha + reg * \
np.max(np.log(u), 1), beta + reg * np.max(np.log(v))
else:
alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v)
if nbb:
u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
else:
u, v = np.ones(na) / na, np.ones(nb) / nb
K = get_K(alpha, beta)

if cpt % print_period == 0:
# we can speed up the process by checking for the error only all
# the 10th iterations
if nbb:
err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \
np.sum((v - vprev) ** 2) / np.sum((v) ** 2)
else:
transp = get_Gamma(alpha, beta, u, v)
err = np.linalg.norm((np.sum(transp, axis=0) - b)) ** 2
if log:
log['err'].append(err)

if verbose:
if cpt % (print_period * 20) == 0:
print(
'{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
print('{:5d}|{:8e}|'.format(cpt, err))

if err <= stopThr:
loop = False

if cpt >= numItermax:
loop = False

if np.any(np.isnan(u)) or np.any(np.isnan(v)):
# we have reached the machine precision
# come back to previous solution and quit loop
print('Warning: numerical errors at iteration', cpt)
u = uprev
v = vprev
break

cpt = cpt + 1

# print('err=',err,' cpt=',cpt)
if log:
log['logu'] = alpha / reg + np.log(u)
log['logv'] = beta / reg + np.log(v)
log['alpha'] = alpha + reg * np.log(u)
log['beta'] = beta + reg * np.log(v)
log['warmstart'] = (log['alpha'], log['beta'])
if nbb:
res = np.zeros((nbb))
for i in range(nbb):
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
return res, log

else:
return get_Gamma(alpha, beta, u, v), log
else:
if nbb:
res = np.zeros((nbb))
for i in range(nbb):
res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
return res
else:
return get_Gamma(alpha, beta, u, v)


if __name__ == "__main__":
cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))) # .cuda()
mylayer = WlossLayer()  # .cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True) # .cuda()
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])) # .cuda()

res, _ = mylayer(inp, ground_true, cost)
# print(inp.requires_grad, res.requires_grad)
# print(res, inp)
mylayer.zero_grad()
res.backward()
print("inp's gradient is good:")
print(inp.grad)

print("convert to gpu:\n", inp.cuda().grad)
print("=============================================="
"\n However, this does not work on pytorch when GPU is enabled")

cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda()
mylayer = WlossLayer().cuda()
inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda()
ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda()

opt = optim.SGD([
{'params': mylayer.parameters()},
], lr=1e-2, momentum=0.9)

res, _ = mylayer(inp, ground_true, cost)
# print(inp.requires_grad, res.requires_grad)
# print(res, inp)
mylayer.zero_grad()
res.backward()
print("input's gradient is None!!!!!!!!!!!!!!!!")
print(inp.grad)
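Most of this file is experimental, but the simpler layers can be exercised on their own. A small sketch of BiLinear scoring pairs of token representations (sizes are illustrative):

import torch
from fastNLP.modules.other_modules import BiLinear

layer = BiLinear(n_left=50, n_right=50, n_out=3)   # bilinear + linear scoring of (left, right) pairs

left = torch.randn(4, 10, 50)    # [batch, length, left_features]
right = torch.randn(4, 10, 50)   # [batch, length, right_features]
scores = layer(left, right)
print(scores.shape)              # torch.Size([4, 10, 3])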

fastNLP/modules/utils.py (+228, -3)

@@ -1,6 +1,3 @@
import torch


def mask_softmax(matrix, mask):
if mask is None:
result = torch.nn.functional.softmax(matrix, dim=-1)
@@ -13,3 +10,231 @@ def seq_mask(seq_len, max_len):
mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
mask = torch.stack(mask, 1)
return mask


"""
Code from FudanParser. Not tested. Do not use !!!
"""
from collections import defaultdict

import numpy as np
import torch


def expand_gt(gt):
"""expand_gt: Expand ground truth to matrix
Arguments:
gt: tensor of (n, l)
Return:
f: ground truth matrix of (n, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$.
"""
n, l = gt.shape
ret = torch.zeros(n, l, l).long()
for i in range(n):
ret[i][torch.arange(l).long(), gt[i]] = 1
return ret


def greedy_decoding(arc_f):
"""greedy_decoding
Arguments:
arc_f: a tensor in shape of (n, l+1, l+1)
length of the sentence is l and index 0 is <root>
Output:
arc_pred: a tensor in shape of (n, l), indicating the head words
"""

f_arc = arc_f[:, 1:, :] # ignore the root
_, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False)
return arc_pred


def mst_decoding(arc_f):
batch_size = arc_f.shape[0]
length = arc_f.shape[1]
arc_score = arc_f.data.cpu()
pred_collection = []
for i in range(batch_size):
head = mst(arc_score[i].numpy())
pred_collection.append(head[1:].reshape((1, length - 1)))
arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long()
return arc_pred


def outer_product(features):
"""InterProduct: Get inter sequence product of features
Arguments:
features: feature vectors of sequence in the shape of (n, l, h)
Return:
f: product result in (n, l, l, h) shape
"""
n, l, c = features.shape
features = features.contiguous()
x = features.view(n, l, 1, c)
x = x.expand(n, l, l, c)
y = features.view(n, 1, l, c).contiguous()
y = y.expand(n, l, l, c)
return x * y


def outer_concat(features):
"""InterProduct: Get inter sequence concatenation of features
Arguments:
features: feature vectors of sequence in the shape of (n, l, h)
Return:
f: product result in (n, l, l, h) shape
"""
n, l, c = features.shape
x = features.contiguous().view(n, l, 1, c)
x = x.expand(n, l, l, c)
y = features.view(n, 1, l, c)
y = y.expand(n, l, l, c)
return torch.cat((x, y), dim=3)


def mst(scores):
"""
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692 # NOQA
"""
length = scores.shape[0]
min_score = scores.min() - 1
eye = np.eye(length)
scores = scores * (1 - eye) + min_score * eye
heads = np.argmax(scores, axis=1)
heads[0] = 0
tokens = np.arange(1, length)
roots = np.where(heads[tokens] == 0)[0] + 1
if len(roots) < 1:
root_scores = scores[tokens, 0]
head_scores = scores[tokens, heads[tokens]]
new_root = tokens[np.argmax(root_scores / head_scores)]
heads[new_root] = 0
elif len(roots) > 1:
root_scores = scores[roots, 0]
scores[roots, 0] = 0
new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1
new_root = roots[np.argmin(
scores[roots, new_heads] / root_scores)]
heads[roots] = new_heads
heads[new_root] = 0

edges = defaultdict(set)
vertices = set((0,))
for dep, head in enumerate(heads[tokens]):
vertices.add(dep + 1)
edges[head].add(dep + 1)
for cycle in _find_cycle(vertices, edges):
dependents = set()
to_visit = set(cycle)
while len(to_visit) > 0:
node = to_visit.pop()
if node not in dependents:
dependents.add(node)
to_visit.update(edges[node])
cycle = np.array(list(cycle))
old_heads = heads[cycle]
old_scores = scores[cycle, old_heads]
non_heads = np.array(list(dependents))
scores[np.repeat(cycle, len(non_heads)),
np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score
new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1
new_scores = scores[cycle, new_heads] / old_scores
change = np.argmax(new_scores)
changed_cycle = cycle[change]
old_head = old_heads[change]
new_head = new_heads[change]
heads[changed_cycle] = new_head
edges[new_head].add(changed_cycle)
edges[old_head].remove(changed_cycle)

return heads


def _find_cycle(vertices, edges):
"""
https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm # NOQA
https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py # NOQA
"""
_index = 0
_stack = []
_indices = {}
_lowlinks = {}
_onstack = defaultdict(lambda: False)
_SCCs = []

def _strongconnect(v):
nonlocal _index
_indices[v] = _index
_lowlinks[v] = _index
_index += 1
_stack.append(v)
_onstack[v] = True

for w in edges[v]:
if w not in _indices:
_strongconnect(w)
_lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
elif _onstack[w]:
_lowlinks[v] = min(_lowlinks[v], _indices[w])

if _lowlinks[v] == _indices[v]:
SCC = set()
while True:
w = _stack.pop()
_onstack[w] = False
SCC.add(w)
if not (w != v):
break
_SCCs.append(SCC)

for v in vertices:
if v not in _indices:
_strongconnect(v)

return [SCC for SCC in _SCCs if len(SCC) > 1]


# https://github.com/alykhantejani/nninit/blob/master/nninit.py
def orthogonal(tensor, gain=1):
"""Fills the input Tensor or Variable with a (semi) orthogonal matrix. The input tensor must have at least 2 dimensions,
and for tensors with more than 2 dimensions the trailing dimensions are flattened. viewed as 2D representation with
rows equal to the first dimension and columns equal to the product of as a sparse matrix, where the non-zero elements
will be drawn from a normal distribution with mean=0 and std=`std`.
Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al.
Args:
tensor: a n-dimension torch.Tensor, where n >= 2
gain: optional gain to be applied
Examples:
>>> w = torch.Tensor(3, 5)
>>> nninit.orthogonal(w)
"""
if tensor.ndimension() < 2:
raise ValueError("Only tensors with 2 or more dimensions are supported.")

flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:])))
flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1)

u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False)
if u.shape == flattened.detach().numpy().shape:
tensor.view_as(flattened).copy_(torch.from_numpy(u))
else:
tensor.view_as(flattened).copy_(torch.from_numpy(v))

tensor.mul_(gain)
with torch.no_grad():
return tensor


def generate_step_dropout(masks, hidden_dim, step_dropout, training=False):
# assume batch first
# import pdb
# pdb.set_trace()

batch, length = masks.size()
if not training:
return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) * masks.view(batch,
length, 1)
masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout)
masked = torch.bernoulli(masked).repeat(1, length, 1)
masked = masked.cuda(masks.device) * masks.view(batch, length, 1)
return masked
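A quick sketch of two of these helpers, the MST head decoder and the padding-mask builder (random scores, purely illustrative):

import numpy as np
from fastNLP.modules.utils import mst, seq_mask

scores = np.random.rand(5, 5)    # head scores for a 4-token sentence plus <root> at index 0
heads = mst(scores)
print(heads)                     # heads[i] is the predicted head of token i; heads[0] == 0

print(seq_mask([3, 5], 5))       # (2, 5) mask for two sequences of length 3 and 5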

fastNLP/saver/base_saver.py (+0, -9)

@@ -3,12 +3,3 @@ class BaseSaver(object):

def __init__(self, save_path):
self.save_path = save_path

def save_bytes(self):
raise NotImplementedError

def save_str(self):
raise NotImplementedError

def compress(self):
raise NotImplementedError

fastNLP/saver/model_saver.py (+11, -1)

@@ -1,4 +1,6 @@
from saver.base_saver import BaseSaver
import torch

from fastNLP.saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
@@ -6,3 +8,11 @@ class ModelSaver(BaseSaver):

def __init__(self, save_path):
super(ModelSaver, self).__init__(save_path)

def save_pytorch(self, model):
"""
Save a pytorch model into .pkl file.
:param model: a PyTorch model
:return:
"""
torch.save(model.state_dict(), self.save_path)

test/data_for_tests/config (+10, -1)

@@ -57,4 +57,13 @@ new_attr = 40
epochs = 20
batch_size = 1
pickle_path = "./data_for_tests/"
validate = true
validate = true

[POS_test]
save_output = true
validate_in_training = false
save_dev_input = false
save_loss = true
batch_size = 1
pickle_path = "./data_for_tests/"


test/test_POS_pipeline.py (+24, -3)

@@ -1,11 +1,13 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "people.txt"
@@ -17,8 +19,8 @@ if __name__ == "__main__":
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

# Data Loader
pos = POSDatasetLoader(data_name, data_path)
train_data = pos.load_lines()
pos_loader = POSDatasetLoader(data_name, data_path)
train_data = pos_loader.load_lines()

# Preprocessor
p = POSPreprocess(train_data, pickle_path)
@@ -37,3 +39,22 @@ if __name__ == "__main__":
trainer.train(model)

print("Training finished!")

saver = ModelSaver("./saved_model.pkl")
saver.save_pytorch(model)
print("Model saved!")

del model, trainer, pos_loader

model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)
ModelLoader("xxx", "./saved_model.pkl").load_pytorch(model)
print("model loaded!")

test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})

#test_args = {"save_output": True, "validate_in_training": False, "save_dev_input": False,
# "save_loss": True, "batch_size": 1, "pickle_path": pickle_path}
tester = POSTester(test_args)
tester.test(model)
print("model tested!")
