Optimize evaluation output in Trainer; keep the POS pipeline (loader + trainer + tester + saver) working; add code borrowed from FudanParser. tags/v0.1.0
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PublishConfigData" autoUpload="Always" serverName="zyfeng@10.40.40.86:22">
    <serverData>
      <paths name="zyfeng@10.40.40.86:22">
        <serverdata>
          <mappings>
            <mapping deploy="/home/zyfeng/Desktop/fastNLP" local="$PROJECT_DIR$" />
          </mappings>
        </serverdata>
      </paths>
    </serverData>
    <option name="myAutoUpload" value="ALWAYS" />
  </component>
</project>
@@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Remote Python 3.6.6 (sftp://zyfeng@10.40.40.86:22/home/zyfeng/miniconda3/envs/fastnlp/bin/python3.6)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyDocumentationSettings">
    <option name="renderExternalDocumentation" value="true" />
  </component>
  <component name="TestRunnerService">
    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
  </component>
</module>
@@ -0,0 +1,16 @@ | |||||
<component name="InspectionProjectProfileManager"> | |||||
<profile version="1.0"> | |||||
<option name="myName" value="Project Default" /> | |||||
<inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true"> | |||||
<option name="ignoredPackages"> | |||||
<value> | |||||
<list size="3"> | |||||
<item index="0" class="java.lang.String" itemvalue="torch" /> | |||||
<item index="1" class="java.lang.String" itemvalue="numpy" /> | |||||
<item index="2" class="java.lang.String" itemvalue="torchvision" /> | |||||
</list> | |||||
</value> | |||||
</option> | |||||
</inspection_tool> | |||||
</profile> | |||||
</component> |
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="JavaScriptSettings">
    <option name="languageLevel" value="ES6" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Remote Python 3.6.6 (sftp://zyfeng@10.40.40.86:22/home/zyfeng/miniconda3/envs/fastnlp/bin/python3.6)" project-jdk-type="Python SDK" />
</project>
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/fastNLP.iml" filepath="$PROJECT_DIR$/.idea/fastNLP.iml" />
    </modules>
  </component>
</project>
@@ -0,0 +1,7 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="PySciProjectComponent">
    <option name="PY_SCI_VIEW" value="true" />
    <option name="PY_SCI_VIEW_SUGGESTED" value="true" />
  </component>
</project>
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$" vcs="Git" />
  </component>
</project>
@@ -13,7 +13,7 @@ class BaseTester(Action):
     def __init__(self, test_args):
         """
-        :param test_args: named tuple
+        :param test_args: a dict-like object with a __getitem__ method, accessed as test_args["key_str"]
         """
         super(BaseTester, self).__init__()
         self.validate_in_training = test_args["validate_in_training"]
@@ -28,6 +28,7 @@ class BaseTester(Action):
         self.model = None
         self.eval_history = []
+        self.batch_output = []

     def test(self, network):
         # print("--------------testing----------------")
@@ -40,7 +41,6 @@ class BaseTester(Action):
         self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True))
-        batch_output = list()

         num_iter = len(dev_data) // self.batch_size
         for step in range(num_iter):
@@ -50,7 +50,7 @@ class BaseTester(Action):
             eval_results = self.evaluate(prediction, batch_y)

             if self.save_output:
-                batch_output.append(prediction)
+                self.batch_output.append(prediction)
             if self.save_loss:
                 self.eval_history.append(eval_results)
@@ -118,6 +118,13 @@ class BaseTester(Action):
         model.train()
         self.eval_history.clear()

+    def show_matrices(self):
+        """
+        This is called by Trainer to print the evaluation results on the dev set.
+        :return print_str: str
+        """
+        raise NotImplementedError


 class POSTester(BaseTester):
     """
@@ -125,6 +132,9 @@ class POSTester(BaseTester):
     """

     def __init__(self, test_args):
+        """
+        :param test_args: a dict-like object with a __getitem__ method, accessed as test_args["key_str"]
+        """
         super(POSTester, self).__init__(test_args)
         self.max_len = None
         self.mask = None
@@ -148,7 +158,21 @@ class POSTester(BaseTester):
     def evaluate(self, predict, truth):
         truth = torch.Tensor(truth)
         loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len)
-        return loss.data
+        results = torch.Tensor(prediction[0][0]).view((-1,))
+        accuracy = float(torch.sum(results == truth.view((-1,)))) / results.shape[0]
+        return [loss.data, accuracy]

     def matrices(self):
-        return np.mean(self.eval_history)
+        batch_loss = np.mean([x[0] for x in self.eval_history])
+        batch_accuracy = np.mean([x[1] for x in self.eval_history])
+        return batch_loss, batch_accuracy

+    def show_matrices(self):
+        """
+        This is called by Trainer to print the evaluation results on the dev set.
+        :return print_str: str
+        """
+        loss, accuracy = self.matrices()
+        return "dev loss={:.2f}, accuracy={:.2f}".format(loss, accuracy)
@@ -24,7 +24,7 @@ class BaseTrainer(Action):
     def __init__(self, train_args):
         """
-        :param train_args: dict of (key, value)
+        :param train_args: a dict of (key, value) pairs, or any dict-like object. Keys are str.
         The base trainer requires the following keys:
         - epochs: int, the number of epochs in training
@@ -89,7 +89,8 @@ class BaseTrainer(Action):
                     if data_dev is None:
                         raise RuntimeError("No validation data provided.")
                     validator.test(network)
-                    print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices()))
+                    print("[epoch {}]".format(epoch), end=" ")
+                    print(validator.show_matrices())

         # finish training
@@ -0,0 +1,19 @@
import torch

from fastNLP.loader.base_loader import BaseLoader


class ModelLoader(BaseLoader):
    """
    Loader for models.
    """

    def __init__(self, data_name, data_path):
        super(ModelLoader, self).__init__(data_name, data_path)

    def load_pytorch(self, empty_model):
        """
        Load model parameters from a .pkl file into the empty PyTorch model.
        :param empty_model: a PyTorch model with initialized parameters.
        """
        empty_model.load_state_dict(torch.load(self.data_path))
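
A minimal usage sketch for ModelLoader; the checkpoint path is a placeholder, and a plain nn.Linear stands in for a real fastNLP model:

    import torch.nn as nn

    empty_model = nn.Linear(100, 27)                            # stand-in for a real model
    loader = ModelLoader("pos_model", "./save/model_best.pkl")  # placeholder path
    loader.load_pytorch(empty_model)                            # weights are now loaded from disk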
@@ -75,8 +75,8 @@ class SeqLabeling(BaseModel):
         :param mask: ByteTensor, [batch_size, max_len]
         :param batch_size: int
         :param max_len: int
-        :return loss:
-            prediction:
+        :return loss: a scalar Tensor
+            prediction: a list of (decode path (a list), best score) tuples
         """
         x = x.float()
         y = y.long()
@@ -0,0 +1,87 @@
import torch
import torch.nn.functional as F
from torch import nn


class ConvCharEmbedding(nn.Module):

    def __init__(self, char_emb_size, feature_maps=(40, 30, 30), kernels=(3, 4, 5)):
        """
        Character-level word embedding.
        :param char_emb_size: the size of the character-level embedding. E.g., if the
            26 characters are each embedded as a 50-dim vector, then the input size is 50.
        :param feature_maps: tuple of feature-map counts (one per kernel width)
        :param kernels: tuple of kernel widths
        """
        super(ConvCharEmbedding, self).__init__()
        self.convs = nn.ModuleList([
            nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
            for i in range(len(kernels))])

    def forward(self, x):
        """
        :param x: [batch_size * sent_length, word_length, char_emb_size]
        :return: [batch_size * sent_length, sum(feature_maps), 1]
        """
        x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2))  # [batch_size*sent_length, channel, width, height]
        x = x.transpose(2, 3)  # [batch_size*sent_length, channel, height, width]
        return self.convolute(x).unsqueeze(2)

    def convolute(self, x):
        feats = []
        for conv in self.convs:
            y = conv(x)  # [batch_size*sent_length, feature_maps[i], 1, width - kernels[i] + 1]
            y = torch.squeeze(y, 2)  # [batch_size*sent_length, feature_maps[i], width - kernels[i] + 1]
            y = F.tanh(y)
            y, __ = torch.max(y, 2)  # [batch_size*sent_length, feature_maps[i]]
            feats.append(y)
        return torch.cat(feats, 1)  # [batch_size*sent_length, sum(feature_maps)]


class LSTMCharEmbedding(nn.Module):
    """
    Character-level word embedding with an LSTM.
    :param char_emb_size: the size of the character-level embedding. E.g., if the
        26 characters are each embedded as a 50-dim vector, then the input size is 50.
    """

    def __init__(self, char_emb_size, hidden_size=None):
        super(LSTMCharEmbedding, self).__init__()
        self.hidden_size = char_emb_size if hidden_size is None else hidden_size

        self.lstm = nn.LSTM(input_size=char_emb_size,
                            hidden_size=self.hidden_size,
                            num_layers=1,
                            bias=True,
                            batch_first=True)

    def forward(self, x):
        """
        :param x: [n_batch*n_word, word_length, char_emb_size]
        :return: [n_batch*n_word, char_emb_size]
        """
        batch_size = x.shape[0]
        h0 = torch.empty(1, batch_size, self.hidden_size)
        h0 = nn.init.orthogonal_(h0)
        c0 = torch.empty(1, batch_size, self.hidden_size)
        c0 = nn.init.orthogonal_(c0)

        _, hidden = self.lstm(x, (h0, c0))
        return hidden[0].squeeze().unsqueeze(2)


if __name__ == "__main__":
    batch_size = 128
    char_emb = 100
    word_length = 1
    x = torch.randn(batch_size, char_emb, word_length)  # random data, not uninitialized memory
    x = x.transpose(1, 2)

    cce = ConvCharEmbedding(char_emb)
    y = cce(x)
    print("CNN Char Emb input: ", x.shape)
    print("CNN Char Emb output: ", y.shape)  # [128, 100, 1], since sum(feature_maps) == 100

    lce = LSTMCharEmbedding(char_emb)
    o = lce(x)
    print("LSTM Char Emb input: ", x.shape)
    print("LSTM Char Emb size: ", o.shape)
@@ -0,0 +1,422 @@
__author__ = 'max'

import torch
import torch.nn as nn
import torch.nn.functional as F


def MaskedRecurrent(reverse=False):
    def forward(input, hidden, cell, mask, train=True, dropout=0):
        """
        :param input:
        :param hidden:
        :param cell:
        :param mask:
        :param dropout: dropout between steps; masked positions get dropped too,
            which should be fine since they carry no gradient anyway
        :param train: controls dropout behavior; set by StackedRNN's forward
        :return:
        """
        output = []
        steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
        for i in steps:
            if mask is None or mask[i].data.min() > 0.5:  # no mask, or all ones
                hidden = cell(input[i], hidden)
            elif mask[i].data.max() > 0.5:  # masked, but not all zeros
                hidden_next = cell(input[i], hidden)  # feed one batch at a time
                # hack to handle LSTM
                if isinstance(hidden, tuple):  # LSTM outputs a (hidden, cell) tuple; a common hack 😁
                    mask = mask.float()
                    hx, cx = hidden
                    hp1, cp1 = hidden_next
                    # mask[i] is (batch, 1) and broadcasts over the hidden units;
                    # masked positions keep the previous state unchanged
                    hidden = (hx + (hp1 - hx) * mask[i],
                              cx + (cp1 - cx) * mask[i])
                else:
                    hidden = hidden + (hidden_next - hidden) * mask[i]
            # if dropout != 0 and train:  # warning, should i treat masked tensors differently?
            #     if isinstance(hidden, tuple):
            #         hidden = (F.dropout(hidden[0], p=dropout, training=train),
            #                   F.dropout(hidden[1], p=dropout, training=train))
            #     else:
            #         hidden = F.dropout(hidden, p=dropout, training=train)
            # hack to handle LSTM
            output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

        if reverse:
            output.reverse()
        output = torch.cat(output, 0).view(input.size(0), *output[0].size())

        return hidden, output

    return forward
def StackedRNN(inners, num_layers, lstm=False, train=True, step_dropout=0, layer_dropout=0):
    num_directions = len(inners)  # one rec_factory per direction
    total_layers = num_layers * num_directions

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for i in range(num_layers):
            all_output = []
            for j, inner in enumerate(inners):
                l = i * num_directions + j
                # note the argument order: MaskedRecurrent's forward is (..., mask, train, dropout)
                hy, output = inner(input, hidden[l], cells[l], mask, train, step_dropout)
                next_hidden.append(hy)
                all_output.append(output)

            input = torch.cat(all_output, input.dim() - 1)  # input to the next layer

            if layer_dropout != 0 and i < num_layers - 1:
                input = F.dropout(input, p=layer_dropout, training=train, inplace=False)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward
def AutogradMaskedRNN(num_layers=1, batch_first=False, train=True, layer_dropout=0, step_dropout=0,
                      bidirectional=False, lstm=False):
    rec_factory = MaskedRecurrent

    if bidirectional:
        layer = (rec_factory(), rec_factory(reverse=True))
    else:
        # rec_factory builds the per-layer recurrence: MaskedRecurrent does the per-layer
        # computation, and StackedRNN chains the layers together
        layer = (rec_factory(),)

    func = StackedRNN(layer,
                      num_layers,
                      lstm=lstm,
                      layer_dropout=layer_dropout, step_dropout=step_dropout,
                      train=train)

    def forward(input, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            if mask is not None:
                mask = mask.transpose(0, 1)

        nexth, output = func(input, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)

        return output, nexth

    return forward
def MaskedStep():
    def forward(input, hidden, cell, mask):
        if mask is None or mask.data.min() > 0.5:
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            hidden_next = cell(input, hidden)
            # hack to handle LSTM
            if isinstance(hidden, tuple):
                hx, cx = hidden
                hp1, cp1 = hidden_next
                hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
            else:
                hidden = hidden + (hidden_next - hidden) * mask
        # hack to handle LSTM
        output = hidden[0] if isinstance(hidden, tuple) else hidden

        return hidden, output

    return forward


def StackedStep(layer, num_layers, lstm=False, dropout=0, train=True):
    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for l in range(num_layers):
            hy, output = layer(input, hidden[l], cells[l], mask)
            next_hidden.append(hy)
            input = output

            if dropout != 0 and l < num_layers - 1:
                input = F.dropout(input, p=dropout, training=train, inplace=False)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward


def AutogradMaskedStep(num_layers=1, dropout=0, train=True, lstm=False):
    layer = MaskedStep()

    func = StackedStep(layer,
                       num_layers,
                       lstm=lstm,
                       dropout=dropout,
                       train=train)

    def forward(input, cells, hidden, mask):
        nexth, output = func(input, hidden, cells, mask)
        return output, nexth

    return forward
class MaskedRNNBase(nn.Module):
    def __init__(self, Cell, input_size, hidden_size,
                 num_layers=1, bias=True, batch_first=False,
                 layer_dropout=0, step_dropout=0, bidirectional=False, **kwargs):
        """
        :param Cell:
        :param input_size:
        :param hidden_size:
        :param num_layers:
        :param bias:
        :param batch_first:
        :param layer_dropout:
        :param step_dropout:
        :param bidirectional:
        :param kwargs:
        """
        super(MaskedRNNBase, self).__init__()
        self.Cell = Cell
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.layer_dropout = layer_dropout
        self.step_dropout = step_dropout
        self.bidirectional = bidirectional
        num_directions = 2 if bidirectional else 1

        self.all_cells = []
        for layer in range(num_layers):  # initialize all cells
            for direction in range(num_directions):
                layer_input_size = input_size if layer == 0 else hidden_size * num_directions

                cell = self.Cell(layer_input_size, hidden_size, self.bias, **kwargs)
                self.all_cells.append(cell)
                self.add_module('cell%d' % (layer * num_directions + direction), cell)  # Max's code is really clean

    def reset_parameters(self):
        for cell in self.all_cells:
            cell.reset_parameters()

    def forward(self, input, mask=None, hx=None):
        batch_size = input.size(0) if self.batch_first else input.size(1)
        lstm = self.Cell is nn.LSTMCell
        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.autograd.Variable(
                input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_())
            if lstm:
                hx = (hx, hx)

        func = AutogradMaskedRNN(num_layers=self.num_layers,
                                 batch_first=self.batch_first,
                                 step_dropout=self.step_dropout,
                                 layer_dropout=self.layer_dropout,
                                 train=self.training,
                                 bidirectional=self.bidirectional,
                                 lstm=lstm)  # pass all_cells in; the wrapping continues one level down

        # the "+ (1,)" appends a trailing singleton dim so the mask broadcasts over the hidden units
        output, hidden = func(input, self.all_cells, hx,
                              None if mask is None else mask.view(mask.size() + (1,)))
        return output, hidden

    def step(self, input, hx=None, mask=None):
        '''
        Execute one step forward (only for a one-directional RNN).
        Args:
            input (batch, input_size): input tensor of this step.
            hx (num_layers, batch, hidden_size): the hidden state of the last step.
            mask (batch): the mask tensor of this step.
        Returns:
            output (batch, hidden_size): tensor containing the output of this step from the last layer of the RNN.
            hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
        '''
        assert not self.bidirectional, "step() cannot be applied to a bidirectional RNN."
        batch_size = input.size(0)
        lstm = self.Cell is nn.LSTMCell
        if hx is None:
            hx = torch.autograd.Variable(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_())
            if lstm:
                hx = (hx, hx)

        func = AutogradMaskedStep(num_layers=self.num_layers,
                                  dropout=self.step_dropout,
                                  train=self.training,
                                  lstm=lstm)

        output, hidden = func(input, self.all_cells, hx, mask)
        return output, hidden
class MaskedRNN(MaskedRNNBase):
    r"""Applies a multi-layer Elman RNN with a customized non-linearity to an
    input sequence.
    For each element in the input sequence, each layer computes the following
    function:
    .. math::
        h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})
    where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is
    the hidden state of the previous layer at time `t` or :math:`input_t`
    for the first layer. If nonlinearity='relu', then `ReLU` is used instead
    of `tanh`.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh'
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False
    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor
          containing the initial hidden state for each element in the batch.
    Outputs: output, h_n
        - **output** (seq_len, batch, hidden_size * num_directions): tensor
          containing the output features (h_k) from the last layer of the RNN,
          for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
          been given as the input, the output will also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
          containing the hidden state for k=seq_len.
    """

    def __init__(self, *args, **kwargs):
        super(MaskedRNN, self).__init__(nn.RNNCell, *args, **kwargs)
class MaskedLSTM(MaskedRNNBase):
    r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input
    sequence.
    For each element in the input sequence, each layer computes the following
    function:
    .. math::
        \begin{array}{ll}
        i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
        f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
        g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\
        o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
        c_t = f_t * c_{(t-1)} + i_t * g_t \\
        h_t = o_t * \tanh(c_t)
        \end{array}
    where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell
    state at time `t`, :math:`x_t` is the hidden state of the previous layer at
    time `t` or :math:`input_t` for the first layer, and :math:`i_t`,
    :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell,
    and out gates, respectively.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False
    Inputs: input, mask, (h_0, c_0)
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers \* num_directions, batch, hidden_size): tensor
          containing the initial hidden state for each element in the batch.
        - **c_0** (num_layers \* num_directions, batch, hidden_size): tensor
          containing the initial cell state for each element in the batch.
    Outputs: output, (h_n, c_n)
        - **output** (seq_len, batch, hidden_size * num_directions): tensor
          containing the output features `(h_t)` from the last layer of the RNN,
          for each t. If a :class:`torch.nn.utils.rnn.PackedSequence` has been
          given as the input, the output will also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
          containing the hidden state for t=seq_len
        - **c_n** (num_layers * num_directions, batch, hidden_size): tensor
          containing the cell state for t=seq_len
    """

    def __init__(self, *args, **kwargs):
        super(MaskedLSTM, self).__init__(nn.LSTMCell, *args, **kwargs)
class MaskedGRU(MaskedRNNBase):
    r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence.
    For each element in the input sequence, each layer computes the following
    function:
    .. math::
        \begin{array}{ll}
        r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\
        z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\
        n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)} + b_{hn})) \\
        h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\
        \end{array}
    where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden
    state of the previous layer at time `t` or :math:`input_t` for the first
    layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input,
    and new gates, respectively.
    Args:
        input_size: The number of expected features in the input x
        hidden_size: The number of features in the hidden state h
        num_layers: Number of recurrent layers.
        bias: If False, then the layer does not use bias weights b_ih and b_hh.
            Default: True
        batch_first: If True, then the input and output tensors are provided
            as (batch, seq, feature)
        dropout: If non-zero, introduces a dropout layer on the outputs of each
            RNN layer except the last layer
        bidirectional: If True, becomes a bidirectional RNN. Default: False
    Inputs: input, mask, h_0
        - **input** (seq_len, batch, input_size): tensor containing the features
          of the input sequence.
        - **mask** (seq_len, batch): 0-1 tensor containing the mask of the input sequence.
        - **h_0** (num_layers * num_directions, batch, hidden_size): tensor
          containing the initial hidden state for each element in the batch.
    Outputs: output, h_n
        - **output** (seq_len, batch, hidden_size * num_directions): tensor
          containing the output features (h_k) from the last layer of the RNN,
          for each k. If a :class:`torch.nn.utils.rnn.PackedSequence` has
          been given as the input, the output will also be a packed sequence.
        - **h_n** (num_layers * num_directions, batch, hidden_size): tensor
          containing the hidden state for k=seq_len.
    """

    def __init__(self, *args, **kwargs):
        super(MaskedGRU, self).__init__(nn.GRUCell, *args, **kwargs)
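
A small smoke test for the masked LSTM above (shapes and lengths are made up, not from the repo's tests); the mask is 1 for real tokens and 0 for padding, laid out as (seq_len, batch):

    seq_len, batch, d_in, d_hid = 7, 4, 16, 32
    rnn = MaskedLSTM(d_in, d_hid, num_layers=2, bidirectional=True)
    x = torch.randn(seq_len, batch, d_in)
    lengths = torch.LongTensor([7, 5, 3, 2])
    mask = torch.stack([(lengths > t).float() for t in range(seq_len)], 0)  # (seq_len, batch)
    output, (h_n, c_n) = rnn(x, mask=mask)
    print(output.size())  # (7, 4, 64) = (seq_len, batch, hidden_size * num_directions)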
@@ -0,0 +1,384 @@
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn._functions.thnn import rnnFusedPointwise as fusedBackend
from torch.nn.parameter import Parameter


def default_initializer(hidden_size):
    stdv = 1.0 / math.sqrt(hidden_size)

    def forward(tensor):
        nn.init.uniform_(tensor, -stdv, stdv)

    return forward
def VarMaskedRecurrent(reverse=False):
    def forward(input, hidden, cell, mask):
        output = []
        steps = range(input.size(0) - 1, -1, -1) if reverse else range(input.size(0))
        for i in steps:
            if mask is None or mask[i].data.min() > 0.5:
                hidden = cell(input[i], hidden)
            elif mask[i].data.max() > 0.5:
                hidden_next = cell(input[i], hidden)
                # hack to handle LSTM
                if isinstance(hidden, tuple):
                    hx, cx = hidden
                    hp1, cp1 = hidden_next
                    hidden = (hx + (hp1 - hx) * mask[i], cx + (cp1 - cx) * mask[i])
                else:
                    hidden = hidden + (hidden_next - hidden) * mask[i]
            # hack to handle LSTM
            output.append(hidden[0] if isinstance(hidden, tuple) else hidden)

        if reverse:
            output.reverse()
        output = torch.cat(output, 0).view(input.size(0), *output[0].size())

        return hidden, output

    return forward
def StackedRNN(inners, num_layers, lstm=False):
    num_directions = len(inners)
    total_layers = num_layers * num_directions

    def forward(input, hidden, cells, mask):
        assert (len(cells) == total_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for i in range(num_layers):
            all_output = []
            for j, inner in enumerate(inners):
                l = i * num_directions + j
                hy, output = inner(input, hidden[l], cells[l], mask)
                next_hidden.append(hy)
                all_output.append(output)

            input = torch.cat(all_output, input.dim() - 1)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(total_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward


def AutogradVarMaskedRNN(num_layers=1, batch_first=False, bidirectional=False, lstm=False):
    rec_factory = VarMaskedRecurrent

    if bidirectional:
        layer = (rec_factory(), rec_factory(reverse=True))
    else:
        layer = (rec_factory(),)

    func = StackedRNN(layer,
                      num_layers,
                      lstm=lstm)

    def forward(input, cells, hidden, mask):
        if batch_first:
            input = input.transpose(0, 1)
            if mask is not None:
                mask = mask.transpose(0, 1)

        nexth, output = func(input, hidden, cells, mask)

        if batch_first:
            output = output.transpose(0, 1)

        return output, nexth

    return forward
def VarMaskedStep():
    def forward(input, hidden, cell, mask):
        if mask is None or mask.data.min() > 0.5:
            hidden = cell(input, hidden)
        elif mask.data.max() > 0.5:
            hidden_next = cell(input, hidden)
            # hack to handle LSTM
            if isinstance(hidden, tuple):
                hx, cx = hidden
                hp1, cp1 = hidden_next
                hidden = (hx + (hp1 - hx) * mask, cx + (cp1 - cx) * mask)
            else:
                hidden = hidden + (hidden_next - hidden) * mask
        # hack to handle LSTM
        output = hidden[0] if isinstance(hidden, tuple) else hidden

        return hidden, output

    return forward


def StackedStep(layer, num_layers, lstm=False):
    def forward(input, hidden, cells, mask):
        assert (len(cells) == num_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for l in range(num_layers):
            hy, output = layer(input, hidden[l], cells[l], mask)
            next_hidden.append(hy)
            input = output

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(num_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(num_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(num_layers, *next_hidden[0].size())

        return next_hidden, input

    return forward


def AutogradVarMaskedStep(num_layers=1, lstm=False):
    layer = VarMaskedStep()

    func = StackedStep(layer,
                       num_layers,
                       lstm=lstm)

    def forward(input, cells, hidden, mask):
        nexth, output = func(input, hidden, cells, mask)
        return output, nexth

    return forward
class VarMaskedRNNBase(nn.Module):
    def __init__(self, Cell, input_size, hidden_size,
                 num_layers=1, bias=True, batch_first=False,
                 dropout=(0, 0), bidirectional=False, initializer=None, **kwargs):
        super(VarMaskedRNNBase, self).__init__()
        self.Cell = Cell
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.batch_first = batch_first
        self.bidirectional = bidirectional
        self.lstm = False
        num_directions = 2 if bidirectional else 1

        self.all_cells = []
        for layer in range(num_layers):
            for direction in range(num_directions):
                layer_input_size = input_size if layer == 0 else hidden_size * num_directions
                cell = self.Cell(layer_input_size, hidden_size, self.bias, p=dropout, initializer=initializer, **kwargs)
                self.all_cells.append(cell)
                self.add_module('cell%d' % (layer * num_directions + direction), cell)

    def reset_parameters(self):
        for cell in self.all_cells:
            cell.reset_parameters()

    def reset_noise(self, batch_size):
        for cell in self.all_cells:
            cell.reset_noise(batch_size)

    def forward(self, input, mask=None, hx=None):
        batch_size = input.size(0) if self.batch_first else input.size(1)
        if hx is None:
            num_directions = 2 if self.bidirectional else 1
            hx = torch.tensor(input.data.new(self.num_layers * num_directions, batch_size, self.hidden_size).zero_(),
                              requires_grad=True)
            if self.lstm:
                hx = (hx, hx)

        func = AutogradVarMaskedRNN(num_layers=self.num_layers,
                                    batch_first=self.batch_first,
                                    bidirectional=self.bidirectional,
                                    lstm=self.lstm)
        self.reset_noise(batch_size)

        output, hidden = func(input, self.all_cells, hx, None if mask is None else mask.view(mask.size() + (1,)))
        return output, hidden

    def step(self, input, hx=None, mask=None):
        '''
        Execute one step forward (only for a one-directional RNN).
        Args:
            input (batch, input_size): input tensor of this step.
            hx (num_layers, batch, hidden_size): the hidden state of the last step.
            mask (batch): the mask tensor of this step.
        Returns:
            output (batch, hidden_size): tensor containing the output of this step from the last layer of the RNN.
            hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step
        '''
        assert not self.bidirectional, "step() cannot be applied to a bidirectional RNN."
        batch_size = input.size(0)
        if hx is None:
            hx = torch.tensor(input.data.new(self.num_layers, batch_size, self.hidden_size).zero_(), requires_grad=True)
            if self.lstm:
                hx = (hx, hx)

        func = AutogradVarMaskedStep(num_layers=self.num_layers, lstm=self.lstm)

        output, hidden = func(input, self.all_cells, hx, mask)
        return output, hidden


class VarMaskedFastLSTM(VarMaskedRNNBase):
    def __init__(self, *args, **kwargs):
        super(VarMaskedFastLSTM, self).__init__(VarFastLSTMCell, *args, **kwargs)
        self.lstm = True
class VarRNNCellBase(nn.Module):
    def __repr__(self):
        s = '{name}({input_size}, {hidden_size}'
        if 'bias' in self.__dict__ and self.bias is not True:
            s += ', bias={bias}'
        if 'nonlinearity' in self.__dict__ and self.nonlinearity != "tanh":
            s += ', nonlinearity={nonlinearity}'
        s += ')'
        return s.format(name=self.__class__.__name__, **self.__dict__)

    def reset_noise(self, batch_size):
        """
        Should be overridden by all subclasses.
        Args:
            batch_size: (int) batch size of the input.
        """
        raise NotImplementedError
class VarFastLSTMCell(VarRNNCellBase):
    r"""
    A long short-term memory (LSTM) cell with variational dropout.
    .. math::
        \begin{array}{ll}
        i = \mathrm{sigmoid}(W_{ii} x + b_{ii} + W_{hi} h + b_{hi}) \\
        f = \mathrm{sigmoid}(W_{if} x + b_{if} + W_{hf} h + b_{hf}) \\
        g = \tanh(W_{ig} x + b_{ig} + W_{hc} h + b_{hg}) \\
        o = \mathrm{sigmoid}(W_{io} x + b_{io} + W_{ho} h + b_{ho}) \\
        c' = f * c + i * g \\
        h' = o * \tanh(c') \\
        \end{array}
    """

    def __init__(self, input_size, hidden_size, bias=True, p=(0.5, 0.5), initializer=None):
        super(VarFastLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.weight_ih = Parameter(torch.Tensor(4 * hidden_size, input_size))
        self.weight_hh = Parameter(torch.Tensor(4 * hidden_size, hidden_size))
        if bias:
            self.bias_ih = Parameter(torch.Tensor(4 * hidden_size))
            self.bias_hh = Parameter(torch.Tensor(4 * hidden_size))
        else:
            self.register_parameter('bias_ih', None)
            self.register_parameter('bias_hh', None)

        self.initializer = default_initializer(self.hidden_size) if initializer is None else initializer
        self.reset_parameters()
        p_in, p_hidden = p
        if p_in < 0 or p_in > 1:
            raise ValueError("input dropout probability has to be between 0 and 1, "
                             "but got {}".format(p_in))
        if p_hidden < 0 or p_hidden > 1:
            raise ValueError("hidden state dropout probability has to be between 0 and 1, "
                             "but got {}".format(p_hidden))
        self.p_in = p_in
        self.p_hidden = p_hidden
        self.noise_in = None
        self.noise_hidden = None

    def reset_parameters(self):
        for weight in self.parameters():
            if weight.dim() == 1:
                weight.data.zero_()
            else:
                self.initializer(weight.data)

    def reset_noise(self, batch_size):
        if self.training:
            if self.p_in:
                noise = self.weight_ih.data.new(batch_size, self.input_size)
                self.noise_in = torch.tensor(noise.bernoulli_(1.0 - self.p_in) / (1.0 - self.p_in))
            else:
                self.noise_in = None

            if self.p_hidden:
                noise = self.weight_hh.data.new(batch_size, self.hidden_size)
                self.noise_hidden = torch.tensor(noise.bernoulli_(1.0 - self.p_hidden) / (1.0 - self.p_hidden))
            else:
                self.noise_hidden = None
        else:
            self.noise_in = None
            self.noise_hidden = None

    def forward(self, input, hx):
        return self.__forward(
            input, hx,
            self.weight_ih, self.weight_hh,
            self.bias_ih, self.bias_hh,
            self.noise_in, self.noise_hidden,
        )
    @staticmethod
    def __forward(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None, noise_in=None, noise_hidden=None):
        if noise_in is not None:
            if input.is_cuda:
                input = input * noise_in.cuda(input.get_device())
            else:
                input = input * noise_in

        if input.is_cuda:
            w_ih = w_ih.cuda(input.get_device())
            w_hh = w_hh.cuda(input.get_device())
            hidden = [h.cuda(input.get_device()) for h in hidden]
            if b_ih is not None:
                b_ih = b_ih.cuda(input.get_device())
                b_hh = b_hh.cuda(input.get_device())
            igates = F.linear(input, w_ih)
            hgates = F.linear(hidden[0], w_hh) if noise_hidden is None \
                else F.linear(hidden[0] * noise_hidden.cuda(input.get_device()), w_hh)
            # the fused backend computes all four gates in a single kernel
            state = fusedBackend.LSTMFused.apply
            return state(igates, hgates, hidden[1]) if b_ih is None else state(igates, hgates, hidden[1], b_ih, b_hh)

        hx, cx = hidden
        if noise_hidden is not None:
            hx = hx * noise_hidden
        gates = F.linear(input, w_ih, b_ih) + F.linear(hx, w_hh, b_hh)

        ingate, forgetgate, cellgate, outgate = gates.chunk(4, 1)

        ingate = F.sigmoid(ingate)
        forgetgate = F.sigmoid(forgetgate)
        cellgate = F.tanh(cellgate)
        outgate = F.sigmoid(outgate)

        cy = (forgetgate * cx) + (ingate * cellgate)
        hy = outgate * F.tanh(cy)

        return hy, cy
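
A sketch of driving the variational LSTM (shapes assumed): forward calls reset_noise once per pass, so a single dropout mask is sampled per sequence and reused across all time steps, which is the point of variational dropout:

    seq_len, batch, d_in, d_hid = 7, 4, 16, 32
    rnn = VarMaskedFastLSTM(d_in, d_hid, num_layers=1, dropout=(0.25, 0.25))
    x = torch.randn(seq_len, batch, d_in)
    output, (h_n, c_n) = rnn(x)  # mask=None: no padding, just variational dropout
    print(output.size())         # (7, 4, 32)
    rnn.eval()                   # eval mode disables the noise entirely
    output_eval, _ = rnn(x)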
@@ -0,0 +1,489 @@
"""
This is borrowed from FudanParser. Not stable. Do not use !!!
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data
from torch import optim
from torch.autograd import Function, Variable
from torch.nn import Parameter

from .utils import orthogonal
class GroupNorm(nn.Module):
    def __init__(self, num_features, num_groups=20, eps=1e-5):
        super(GroupNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(1, num_features, 1))
        self.bias = nn.Parameter(torch.zeros(1, num_features, 1))
        self.num_groups = num_groups
        self.eps = eps

    def forward(self, x):
        N, C, H = x.size()
        G = self.num_groups
        assert C % G == 0

        x = x.view(N, G, -1)
        mean = x.mean(-1, keepdim=True)
        var = x.var(-1, keepdim=True)
        x = (x - mean) / (var + self.eps).sqrt()
        x = x.view(N, C, H)
        return x * self.weight + self.bias
class LayerNormalization(nn.Module):
    """ Layer normalization module """

    def __init__(self, d_hid, eps=1e-3):
        super(LayerNormalization, self).__init__()
        self.eps = eps
        self.a_2 = nn.Parameter(torch.ones(d_hid), requires_grad=True)
        self.b_2 = nn.Parameter(torch.zeros(d_hid), requires_grad=True)

    def forward(self, z):
        if z.size(1) == 1:
            return z
        mu = torch.mean(z, keepdim=True, dim=-1)
        sigma = torch.std(z, keepdim=True, dim=-1)
        ln_out = (z - mu.expand_as(z)) / (sigma.expand_as(z) + self.eps)
        ln_out = ln_out * self.a_2.expand_as(ln_out) + self.b_2.expand_as(ln_out)
        return ln_out
class OrthEmbedding(nn.Embedding):
    def __init__(self, *args, **kwargs):
        super(OrthEmbedding, self).__init__(*args, **kwargs)

    def reset_parameters(self):
        # nn.Embedding has no bias, so only the weight is (re-)initialized
        self.weight = orthogonal(self.weight)
class BiLinear(nn.Module):
    def __init__(self, n_left, n_right, n_out, bias=True):
        """
        Args:
            n_left: size of the left input
            n_right: size of the right input
            n_out: size of the output
            bias: If set to False, the layer will not learn an additive bias.
                Default: True
        """
        super(BiLinear, self).__init__()
        self.n_left = n_left
        self.n_right = n_right
        self.n_out = n_out

        self.U = Parameter(torch.Tensor(self.n_out, self.n_left, self.n_right))
        self.W_l = Parameter(torch.Tensor(self.n_out, self.n_left))
        self.W_r = Parameter(torch.Tensor(self.n_out, self.n_right))

        if bias:
            self.bias = Parameter(torch.Tensor(n_out))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.W_l)
        nn.init.xavier_uniform_(self.W_r)
        nn.init.xavier_uniform_(self.U)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.)

    def forward(self, input_left, input_right):
        """
        Args:
            input_left: Tensor
                the left input tensor with shape = [batch1, batch2, ..., left_features]
            input_right: Tensor
                the right input tensor with shape = [batch1, batch2, ..., right_features]
        Returns:
        """
        left_size = input_left.size()
        right_size = input_right.size()
        assert left_size[:-1] == right_size[:-1], \
            "batch size of left and right inputs mismatch: (%s, %s)" % (left_size[:-1], right_size[:-1])
        batch = int(np.prod(left_size[:-1]))

        # convert the left and right inputs to matrices [batch, left_features], [batch, right_features]
        input_left = input_left.view(batch, self.n_left)
        input_right = input_right.view(batch, self.n_right)

        # output [batch, out_features]
        output = F.bilinear(input_left, input_right, self.U, self.bias)
        output = output + \
                 F.linear(input_left, self.W_l, None) + \
                 F.linear(input_right, self.W_r, None)
        # convert back to [batch1, batch2, ..., out_features]
        return output.view(left_size[:-1] + (self.n_out,))

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + 'in1_features=' + str(self.n_left) \
               + ', in2_features=' + str(self.n_right) \
               + ', out_features=' + str(self.n_out) + ')'
class BiAffine(nn.Module):
    def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs):
        """
        Args:
            n_enc: int
                the dimension of the encoder input.
            n_dec: int
                the dimension of the decoder input.
            n_labels: int
                the number of labels of the crf layer
            biaffine: bool
                if apply the bi-affine parameter.
            **kwargs:
        """
        super(BiAffine, self).__init__()
        self.n_enc = n_enc
        self.n_dec = n_dec
        self.num_labels = n_labels
        self.biaffine = biaffine

        self.W_d = Parameter(torch.Tensor(self.num_labels, self.n_dec))
        self.W_e = Parameter(torch.Tensor(self.num_labels, self.n_enc))
        self.b = Parameter(torch.Tensor(self.num_labels, 1, 1))
        if self.biaffine:
            self.U = Parameter(torch.Tensor(self.num_labels, self.n_dec, self.n_enc))
        else:
            self.register_parameter('U', None)

        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.W_d)
        nn.init.xavier_uniform_(self.W_e)
        nn.init.constant_(self.b, 0.)
        if self.biaffine:
            nn.init.xavier_uniform_(self.U)

    def forward(self, input_d, input_e, mask_d=None, mask_e=None):
        """
        Args:
            input_d: Tensor
                the decoder input tensor with shape = [batch, length_decoder, input_size]
            input_e: Tensor
                the child input tensor with shape = [batch, length_encoder, input_size]
            mask_d: Tensor or None
                the mask tensor for the decoder with shape = [batch, length_decoder]
            mask_e: Tensor or None
                the mask tensor for the encoder with shape = [batch, length_encoder]
        Returns: Tensor
            the energy tensor with shape = [batch, num_label, length, length]
        """
        assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are required to be equal.'
        batch, length_decoder, _ = input_d.size()
        _, length_encoder, _ = input_e.size()

        # compute the decoder part: [num_label, input_size_decoder] * [batch, input_size_decoder, length_decoder]
        # the output shape is [batch, num_label, length_decoder]
        out_d = torch.matmul(self.W_d, input_d.transpose(1, 2)).unsqueeze(3)
        # compute the encoder part: [num_label, input_size_encoder] * [batch, input_size_encoder, length_encoder]
        # the output shape is [batch, num_label, length_encoder]
        out_e = torch.matmul(self.W_e, input_e.transpose(1, 2)).unsqueeze(2)

        # output shape [batch, num_label, length_decoder, length_encoder]
        if self.biaffine:
            # compute the bi-affine part
            # [batch, 1, length_decoder, input_size_decoder] * [num_labels, input_size_decoder, input_size_encoder]
            # output shape [batch, num_label, length_decoder, input_size_encoder]
            output = torch.matmul(input_d.unsqueeze(1), self.U)
            # [batch, num_label, length_decoder, input_size_encoder] * [batch, 1, input_size_encoder, length_encoder]
            # output shape [batch, num_label, length_decoder, length_encoder]
            output = torch.matmul(output, input_e.unsqueeze(1).transpose(2, 3))
            output = output + out_d + out_e + self.b
        else:
            output = out_d + out_e + self.b

        if mask_d is not None:
            output = output * mask_d.unsqueeze(1).unsqueeze(3) * mask_e.unsqueeze(1).unsqueeze(2)
        return output
class Transpose(nn.Module):
    def __init__(self, x, y):
        super(Transpose, self).__init__()
        self.x = x
        self.y = y

    def forward(self, x):
        return x.transpose(self.x, self.y)


class WordDropout(nn.Module):
    def __init__(self, dropout_rate, drop_to_token):
        super(WordDropout, self).__init__()
        self.dropout_rate = dropout_rate
        self.drop_to_token = drop_to_token

    def forward(self, word_idx):
        if not self.training:
            return word_idx
        drop_mask = torch.rand(word_idx.shape) < self.dropout_rate
        if word_idx.device.type == 'cuda':
            drop_mask = drop_mask.cuda()
        drop_mask = drop_mask.long()
        output = drop_mask * self.drop_to_token + (1 - drop_mask) * word_idx
        return output
class WlossLayer(torch.nn.Module):
    def __init__(self, lam=100, sinkhorn_iter=50):
        super(WlossLayer, self).__init__()
        # cost = matrix M = distance matrix
        # lam = lambda of type float > 0
        # sinkhorn_iter > 0
        # the diagonal of the cost should be 0
        self.lam = lam
        self.sinkhorn_iter = sinkhorn_iter
        # self.register_buffer("K", torch.exp(-self.cost / self.lam).double())
        # self.register_buffer("KM", (self.cost * self.K).double())

    def forward(self, pred, target, cost):
        return WassersteinLossStab.apply(pred, target,
                                         cost, self.lam, self.sinkhorn_iter)


class WassersteinLossStab(Function):
    @staticmethod
    def forward(ctx, pred, target, cost, lam=1e-3, sinkhorn_iter=4):
        """pred: Batch * K: K = # mass points
           target: Batch * L: L = # mass points"""
        eps = 1e-8

        na = pred.size(1)
        nb = target.size(1)

        cost = cost.double()
        pred = pred.double()
        target = target.double()

        cost = cost[:na, :nb].double()
        K = torch.exp(-cost / lam).double()
        KM = (cost * K).double()

        batch_size = pred.size(0)

        log_a, log_b = torch.log(pred + eps), torch.log(target + eps)
        log_u = cost.new(batch_size, na).fill_(-np.log(na))
        log_v = cost.new(batch_size, nb).fill_(-np.log(nb))

        for i in range(int(sinkhorn_iter)):
            log_u_max = torch.max(log_u, dim=1)[0]
            u_stab = torch.exp(log_u - log_u_max.unsqueeze(1) + eps)
            log_v = log_b - torch.log(torch.mm(K.t(), u_stab.t()).t()) - log_u_max.unsqueeze(1)
            log_v_max = torch.max(log_v, dim=1)[0]
            v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
            tmp = log_u
            log_u = log_a - torch.log(torch.mm(K, v_stab.t()).t() + eps) - log_v_max.unsqueeze(1)
            if torch.norm(tmp - log_u) / torch.norm(log_u) < eps:
                break

        log_v_max = torch.max(log_v, dim=1)[0]
        v_stab = torch.exp(log_v - log_v_max.unsqueeze(1))
        logcostpart1 = torch.log(torch.mm(KM, v_stab.t()).t() + eps) + log_v_max.unsqueeze(1)
        wnorm = torch.exp(log_u + logcostpart1).mean(0).sum()  # sum(1) for a per-item-pair loss

        grad_input = log_u * lam
        grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
        grad_input = grad_input - torch.mean(grad_input, dim=1).unsqueeze(1)
        grad_input = grad_input / batch_size

        ctx.save_for_backward(grad_input)
        return pred.new((wnorm,)), grad_input

    @staticmethod
    def backward(ctx, grad_output, _):
        grad_input = ctx.saved_variables
        res = grad_output.clone()
        res.data.resize_(grad_input[0].size()).copy_(grad_input[0].data)
        res = res.mul_(grad_output[0]).float()
        # one gradient per forward input: pred, target, cost, lam, sinkhorn_iter
        return res, None, None, None, None

class Sinkhorn(Function):
    """Stabilized Sinkhorn iterations (the structure follows POT's
    sinkhorn_stabilized). Old-style autograd Function; the body below is
    numpy-based, so it computes the transport plan / cost but is not
    differentiable as written."""

    def __init__(self):
        super(Sinkhorn, self).__init__()

    def forward(ctx, a, b, M, reg, tau, warmstart, numItermax, stop):
        # the updates below are numpy-based, so move everything to numpy up front
        a = a.double().numpy()
        b = b.double().numpy()
        M = M.double().numpy()

        # these were previously undefined free variables; give them defaults
        print_period = 20   # check the error only every 20 iterations
        verbose = False
        log = None          # set to a dict with an 'err' list to record convergence

        # init data
        nbb = b.shape[1] if b.ndim == 2 else 0
        na = len(a)
        nb = len(b)
        cpt = 0
        # we assume that no distances are null except those of the diagonal of
        # distances
        if warmstart is None:
            alpha, beta = np.zeros(na), np.zeros(nb)
        else:
            alpha, beta = warmstart
        if nbb:
            u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
        else:
            u, v = np.ones(na) / na, np.ones(nb) / nb

        def get_K(alpha, beta):
            """log space computation"""
            return np.exp(-(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg)

        def get_Gamma(alpha, beta, u, v):
            """log space gamma computation"""
            return np.exp(
                -(M - alpha.reshape((na, 1)) - beta.reshape((1, nb))) / reg + np.log(u.reshape((na, 1))) + np.log(
                    v.reshape((1, nb))))

        K = get_K(alpha, beta)
        transp = K
        err = 1
        loop = True
        while loop:
            uprev = u
            vprev = v

            # sinkhorn update
            v = b / (np.dot(K.T, u) + 1e-16)
            u = a / (np.dot(K, v) + 1e-16)

            # remove numerical problems by absorbing u, v into alpha, beta
            if np.abs(u).max() > tau or np.abs(v).max() > tau:
                if nbb:
                    alpha, beta = alpha + reg * \
                        np.max(np.log(u), 1), beta + reg * np.max(np.log(v))
                else:
                    alpha, beta = alpha + reg * np.log(u), beta + reg * np.log(v)
                if nbb:
                    u, v = np.ones((na, nbb)) / na, np.ones((nb, nbb)) / nb
                else:
                    u, v = np.ones(na) / na, np.ones(nb) / nb
                K = get_K(alpha, beta)

            if cpt % print_period == 0:
                # we can speed up the process by checking for the error only
                # every print_period iterations
                if nbb:
                    err = np.sum((u - uprev) ** 2) / np.sum((u) ** 2) + \
                        np.sum((v - vprev) ** 2) / np.sum((v) ** 2)
                else:
                    transp = get_Gamma(alpha, beta, u, v)
                    err = np.linalg.norm(np.sum(transp, axis=0) - b) ** 2
                if log:
                    log['err'].append(err)
                if verbose:
                    if cpt % (print_period * 20) == 0:
                        print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                    print('{:5d}|{:8e}|'.format(cpt, err))

            if err <= stop:
                loop = False
            if cpt >= numItermax:
                loop = False
            if np.any(np.isnan(u)) or np.any(np.isnan(v)):
                # we have reached the machine precision
                # come back to previous solution and quit loop
                print('Warning: numerical errors at iteration', cpt)
                u = uprev
                v = vprev
                break
            cpt = cpt + 1

        if log:
            log['logu'] = alpha / reg + np.log(u)
            log['logv'] = beta / reg + np.log(v)
            log['alpha'] = alpha + reg * np.log(u)
            log['beta'] = beta + reg * np.log(v)
            log['warmstart'] = (log['alpha'], log['beta'])
            if nbb:
                res = np.zeros(nbb)
                for i in range(nbb):
                    res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
                return res, log
            else:
                return get_Gamma(alpha, beta, u, v), log
        else:
            if nbb:
                res = np.zeros(nbb)
                for i in range(nbb):
                    res[i] = np.sum(get_Gamma(alpha, beta, u[:, i], v[:, i]) * M)
                return res
            else:
                return get_Gamma(alpha, beta, u, v)
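
# Untested usage sketch: since the forward above is numpy-based, it can be
# exercised directly (bypassing autograd). Histograms and cost are illustrative.
def _demo_sinkhorn():
    a = torch.Tensor(3).fill_(1. / 3)      # source histogram, shape (na,)
    b = torch.Tensor(3, 1).fill_(1. / 3)   # target histogram, shape (nb, 1)
    M = torch.rand(3, 3)                   # ground cost matrix
    res = Sinkhorn().forward(a, b, M, 0.1, 1e3, None, 1000, 1e-9)
    print(res)                             # transport cost per column of b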

if __name__ == "__main__":
    cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1)))  # .cuda()
    mylayer = WlossLayer(cost)  # .cuda()
    inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True)  # .cuda()
    ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]]))  # .cuda()

    res, _ = mylayer(inp, ground_true)
    mylayer.zero_grad()
    res.backward()
    print("inp's gradient is good:")
    print(inp.grad)
    print("convert to gpu:\n", inp.cuda().grad)
    print("=============================================="
          "\nHowever, this does not work in PyTorch when the GPU is enabled")

    cost = (torch.Tensor(2, 2).fill_(1) - torch.diag(torch.Tensor(2).fill_(1))).cuda()
    mylayer = WlossLayer(cost).cuda()
    inp = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]), requires_grad=True).cuda()
    ground_true = Variable(torch.Tensor([[0, 1], [0.5, 0.5]])).cuda()

    opt = optim.SGD([
        {'params': mylayer.parameters()},
    ], lr=1e-2, momentum=0.9)
    res, _ = mylayer(inp, ground_true)
    mylayer.zero_grad()
    res.backward()
    print("input's gradient is None, because .cuda() made it a non-leaf:")
    print(inp.grad)
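
    # Untested sketch: `inp.grad` is None above because `.cuda()` returns a new,
    # non-leaf Variable, and gradients only accumulate on leaf Variables.
    # Creating the leaf on the GPU before wrapping it restores the gradient:
    inp_leaf = Variable(torch.Tensor([[1, 0], [0.5, 0.5]]).cuda(), requires_grad=True)
    res, _ = mylayer(inp_leaf, ground_true)
    mylayer.zero_grad()
    res.backward()
    print("leaf input's gradient on GPU:")
    print(inp_leaf.grad)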

@@ -1,6 +1,3 @@
import torch

def mask_softmax(matrix, mask):
    if mask is None:
        result = torch.nn.functional.softmax(matrix, dim=-1)
@@ -13,3 +10,231 @@ def seq_mask(seq_len, max_len):
    mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
    mask = torch.stack(mask, 1)
    return mask
""" | |||||
Codes from FudanParser. Not tested. Do not use !!! | |||||
""" | |||||
from collections import defaultdict | |||||
import numpy as np | |||||
import torch | |||||

def expand_gt(gt):
    """expand_gt: Expand ground truth to matrix
    Arguments:
        gt: tensor of (n, l)
    Return:
        f: ground truth matrix of (n, l, l), $gt[i][j] = k$ leads to $f[i][j][k] = 1$.
    """
    n, l = gt.shape
    ret = torch.zeros(n, l, l).long()
    for i in range(n):
        ret[i][torch.arange(l).long(), gt[i]] = 1
    return ret
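
# Untested sketch: two sentences of length 3; each row of the result one-hot
# encodes the head index stored in gt.
def _demo_expand_gt():
    gt = torch.LongTensor([[0, 2, 1], [1, 0, 2]])
    mat = expand_gt(gt)                  # shape (2, 3, 3)
    assert mat.shape == (2, 3, 3)
    assert mat[0, 1, 2] == 1             # gt[0][1] == 2  ->  mat[0][1][2] == 1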

def greedy_decoding(arc_f):
    """greedy_decoding
    Arguments:
        arc_f: a tensor in shape of (n, l+1, l+1)
            length of the sentence is l and index 0 is <root>
    Output:
        arc_pred: a tensor in shape of (n, l), indicating the head words
    """
    f_arc = arc_f[:, 1:, :]  # ignore the <root> row
    _, arc_pred = torch.max(f_arc.data, dim=-1, keepdim=False)
    return arc_pred
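
# Untested sketch: greedy decoding just takes a row-wise argmax, so it can yield
# ill-formed trees (multiple roots, cycles); mst_decoding below repairs that.
def _demo_greedy_decoding():
    arc_f = torch.randn(2, 5, 5)   # (n, l+1, l+1) with l = 4
    heads = greedy_decoding(arc_f)
    print(heads.shape)             # torch.Size([2, 4])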

def mst_decoding(arc_f):
    batch_size = arc_f.shape[0]
    length = arc_f.shape[1]
    arc_score = arc_f.data.cpu()
    pred_collection = []
    for i in range(batch_size):
        head = mst(arc_score[i].numpy())
        pred_collection.append(head[1:].reshape((1, length - 1)))
    arc_pred = torch.LongTensor(np.concatenate(pred_collection, axis=0)).type_as(arc_f).long()
    return arc_pred

def outer_product(features):
    """outer_product: Get inter-sequence product of features
    Arguments:
        features: feature vectors of sequence in the shape of (n, l, h)
    Return:
        f: product result in (n, l, l, h) shape
    """
    n, l, c = features.shape
    features = features.contiguous()
    x = features.view(n, l, 1, c)
    x = x.expand(n, l, l, c)
    y = features.view(n, 1, l, c).contiguous()
    y = y.expand(n, l, l, c)
    return x * y

def outer_concat(features):
    """outer_concat: Get inter-sequence concatenation of features
    Arguments:
        features: feature vectors of sequence in the shape of (n, l, h)
    Return:
        f: concatenation result in (n, l, l, 2h) shape
    """
    n, l, c = features.shape
    x = features.contiguous().view(n, l, 1, c)
    x = x.expand(n, l, l, c)
    y = features.view(n, 1, l, c)
    y = y.expand(n, l, l, c)
    return torch.cat((x, y), dim=3)
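
# Untested sketch: shape check for the two pairwise feature builders above.
def _demo_outer_ops():
    feats = torch.randn(2, 5, 8)                        # (n, l, h)
    assert outer_product(feats).shape == (2, 5, 5, 8)   # element-wise product per position pair
    assert outer_concat(feats).shape == (2, 5, 5, 16)   # concatenation doubles the feature dim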

def mst(scores):
    """Greedy head selection with root and cycle fix-ups.
    https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/models/nn.py#L692  # NOQA
    """
    length = scores.shape[0]
    min_score = scores.min() - 1
    eye = np.eye(length)
    scores = scores * (1 - eye) + min_score * eye  # forbid self-loops
    heads = np.argmax(scores, axis=1)
    heads[0] = 0  # <root> points to itself
    tokens = np.arange(1, length)
    roots = np.where(heads[tokens] == 0)[0] + 1
    if len(roots) < 1:
        # no token chose <root>: promote the token whose root edge is cheapest to take
        root_scores = scores[tokens, 0]
        head_scores = scores[tokens, heads[tokens]]
        new_root = tokens[np.argmax(root_scores / head_scores)]
        heads[new_root] = 0
    elif len(roots) > 1:
        # more than one root: keep the best one, re-attach the rest
        root_scores = scores[roots, 0]
        scores[roots, 0] = 0
        new_heads = np.argmax(scores[roots][:, tokens], axis=1) + 1
        new_root = roots[np.argmin(
            scores[roots, new_heads] / root_scores)]
        heads[roots] = new_heads
        heads[new_root] = 0

    edges = defaultdict(set)
    vertices = set((0,))
    for dep, head in enumerate(heads[tokens]):
        vertices.add(dep + 1)
        edges[head].add(dep + 1)
    # break every cycle by re-attaching its cheapest-to-change member
    for cycle in _find_cycle(vertices, edges):
        dependents = set()
        to_visit = set(cycle)
        while len(to_visit) > 0:
            node = to_visit.pop()
            if node not in dependents:
                dependents.add(node)
                to_visit.update(edges[node])
        cycle = np.array(list(cycle))
        old_heads = heads[cycle]
        old_scores = scores[cycle, old_heads]
        non_heads = np.array(list(dependents))
        scores[np.repeat(cycle, len(non_heads)),
               np.repeat([non_heads], len(cycle), axis=0).flatten()] = min_score
        new_heads = np.argmax(scores[cycle][:, tokens], axis=1) + 1
        new_scores = scores[cycle, new_heads] / old_scores
        change = np.argmax(new_scores)
        changed_cycle = cycle[change]
        old_head = old_heads[change]
        new_head = new_heads[change]
        heads[changed_cycle] = new_head
        edges[new_head].add(changed_cycle)
        edges[old_head].remove(changed_cycle)
    return heads
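
# Untested sketch: decode heads for a 3-token sentence (index 0 is <root>);
# scores[dep, head] is the arc score, rows are dependents.
def _demo_mst():
    scores = np.array([[0.0, 0.1, 0.2, 0.1],
                       [0.9, 0.0, 0.3, 0.2],   # token 1 prefers <root>
                       [0.1, 0.8, 0.0, 0.2],   # token 2 prefers token 1
                       [0.1, 0.2, 0.7, 0.0]])  # token 3 prefers token 2
    heads = mst(scores)
    print(heads)  # array([0, 0, 1, 2])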

def _find_cycle(vertices, edges):
    """Tarjan's algorithm for strongly connected components.
    https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm  # NOQA
    https://github.com/tdozat/Parser/blob/0739216129cd39d69997d28cbc4133b360ea3934/lib/etc/tarjan.py  # NOQA
    """
    _index = 0
    _stack = []
    _indices = {}
    _lowlinks = {}
    _onstack = defaultdict(lambda: False)
    _SCCs = []

    def _strongconnect(v):
        nonlocal _index
        _indices[v] = _index
        _lowlinks[v] = _index
        _index += 1
        _stack.append(v)
        _onstack[v] = True

        for w in edges[v]:
            if w not in _indices:
                _strongconnect(w)
                _lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
            elif _onstack[w]:
                _lowlinks[v] = min(_lowlinks[v], _indices[w])

        # v is the root of an SCC: pop the whole component off the stack
        if _lowlinks[v] == _indices[v]:
            SCC = set()
            while True:
                w = _stack.pop()
                _onstack[w] = False
                SCC.add(w)
                if w == v:
                    break
            _SCCs.append(SCC)

    for v in vertices:
        if v not in _indices:
            _strongconnect(v)

    # only components with more than one vertex correspond to cycles
    return [SCC for SCC in _SCCs if len(SCC) > 1]
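
# Untested sketch: 1 -> 2 -> 3 -> 1 forms a cycle; Tarjan finds the SCC {1, 2, 3}.
def _demo_find_cycle():
    vertices = {0, 1, 2, 3}
    edges = defaultdict(set, {0: {1}, 1: {2}, 2: {3}, 3: {1}})
    print(_find_cycle(vertices, edges))  # [{1, 2, 3}]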

# https://github.com/alykhantejani/nninit/blob/master/nninit.py
def orthogonal(tensor, gain=1):
    """Fills the input Tensor or Variable with a (semi) orthogonal matrix.
    The input tensor must have at least 2 dimensions; for tensors with more than
    2 dimensions, the trailing dimensions are flattened, so the tensor is viewed
    as a 2-D matrix with `tensor.size(0)` rows and the product of the remaining
    dimensions as columns.
    Reference: "Exact solutions to the nonlinear dynamics of learning in deep linear neural networks" - Saxe, A. et al.
    Args:
        tensor: an n-dimensional torch.Tensor, where n >= 2
        gain: optional gain to be applied
    Examples:
        >>> w = torch.Tensor(3, 5)
        >>> orthogonal(w)
    """
    if tensor.ndimension() < 2:
        raise ValueError("Only tensors with 2 or more dimensions are supported.")
    flattened_shape = (tensor.size(0), int(np.prod(tensor.detach().numpy().shape[1:])))
    flattened = torch.Tensor(flattened_shape[0], flattened_shape[1]).normal_(0, 1)
    # take the orthogonal factor of a random Gaussian matrix via SVD
    u, s, v = np.linalg.svd(flattened.numpy(), full_matrices=False)
    if u.shape == flattened.detach().numpy().shape:
        tensor.view_as(flattened).copy_(torch.from_numpy(u))
    else:
        tensor.view_as(flattened).copy_(torch.from_numpy(v))
    tensor.mul_(gain)
    with torch.no_grad():
        return tensor
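
# Untested sketch: the rows of the (semi) orthogonal init are orthonormal, so
# W @ W.T should be close to the identity for a wide matrix.
def _demo_orthogonal():
    w = orthogonal(torch.Tensor(3, 5))
    print(torch.mm(w, w.t()))  # approximately the 3x3 identity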

def generate_step_dropout(masks, hidden_dim, step_dropout, training=False):
    # masks: (batch, length), batch first
    batch, length = masks.size()
    if not training:
        # at eval time, return a constant (1 - step_dropout) mask over valid positions
        return torch.ones(batch, length, hidden_dim).fill_(1 - step_dropout).cuda(masks.device) \
            * masks.view(batch, length, 1)
    # sample one Bernoulli(step_dropout) mask per sequence and share it across time steps
    masked = torch.zeros(batch, 1, hidden_dim).fill_(step_dropout)
    masked = torch.bernoulli(masked).repeat(1, length, 1)
    masked = masked.cuda(masks.device) * masks.view(batch, length, 1)
    return masked
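
# Untested sketch (requires a CUDA device, since the function itself calls .cuda()):
def _demo_step_dropout():
    masks = torch.ones(2, 4).cuda()  # (batch, length), every position valid
    drop = generate_step_dropout(masks, hidden_dim=8, step_dropout=0.5, training=True)
    print(drop.shape)                # torch.Size([2, 4, 8])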

@@ -3,12 +3,3 @@ class BaseSaver(object):
    def __init__(self, save_path):
        self.save_path = save_path

    def save_bytes(self):
        raise NotImplementedError

    def save_str(self):
        raise NotImplementedError

    def compress(self):
        raise NotImplementedError

@@ -1,4 +1,6 @@
from saver.base_saver import BaseSaver
import torch

from fastNLP.saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
@@ -6,3 +8,11 @@ class ModelSaver(BaseSaver):
    def __init__(self, save_path):
        super(ModelSaver, self).__init__(save_path)

    def save_pytorch(self, model):
        """
        Save the state dict of a PyTorch model into a .pkl file.
        :param model: a PyTorch model
        :return: None
        """
        torch.save(model.state_dict(), self.save_path)
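
# Untested usage sketch: save a model's state dict, then reload it into an
# identically constructed model with plain torch (mirrors the test script below).
def _demo_round_trip(model, path="./saved_model.pkl"):
    ModelSaver(path).save_pytorch(model)
    model.load_state_dict(torch.load(path))
    return model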

@@ -57,4 +57,13 @@ new_attr = 40
epochs = 20
batch_size = 1
pickle_path = "./data_for_tests/"
validate = true

[POS_test]
save_output = true
validate_in_training = false
save_dev_input = false
save_loss = true
batch_size = 1
pickle_path = "./data_for_tests/"

@@ -1,11 +1,13 @@
import sys

sys.path.append("..")

from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
from fastNLP.action.trainer import POSTrainer
from fastNLP.loader.dataset_loader import POSDatasetLoader
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.saver.model_saver import ModelSaver
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.action.tester import POSTester
from fastNLP.models.sequence_modeling import SeqLabeling

data_name = "people.txt"
@@ -17,8 +19,8 @@ if __name__ == "__main__":
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS": train_args})

    # Data Loader
    pos = POSDatasetLoader(data_name, data_path)
    train_data = pos.load_lines()
    pos_loader = POSDatasetLoader(data_name, data_path)
    train_data = pos_loader.load_lines()

    # Preprocessor
    p = POSPreprocess(train_data, pickle_path)
@@ -37,3 +39,22 @@ if __name__ == "__main__":
    trainer.train(model)
    print("Training finished!")

    saver = ModelSaver("./saved_model.pkl")
    saver.save_pytorch(model)
    print("Model saved!")

    del model, trainer, pos_loader

    model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)
    ModelLoader("xxx", "./saved_model.pkl").load_pytorch(model)
    print("model loaded!")

    test_args = ConfigSection()
    ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
    # test_args = {"save_output": True, "validate_in_training": False, "save_dev_input": False,
    #              "save_loss": True, "batch_size": 1, "pickle_path": pickle_path}

    tester = POSTester(test_args)
    tester.test(model)
    print("model tested!")