@@ -8,8 +8,7 @@
Models implemented in FastNLP include:
1. Get To The Point: Summarization with Pointer-Generator Networks (See et al. 2017)
2. Extractive Summarization with SWAP-NET: Sentences and Words from Alternating Pointer Networks (Jadhav et al. 2018)
3. Searching for Effective Neural Extractive Summarization: What Works and What's Next (Zhong et al. 2019)
2. Searching for Effective Neural Extractive Summarization: What Works and What's Next (Zhong et al. 2019)
@@ -32,8 +31,8 @@ Models implemented in FastNLP include:
Download links for the preprocessed public datasets (CNN/DailyMail, Newsroom, arXiv, PubMed):
- [Baidu Cloud](https://pan.baidu.com)
- [Google Drive](https://drive.google.com)
- [Baidu Cloud](https://pan.baidu.com/s/11qWnDjK9lb33mFZ9vuYlzA) (extraction code: h1px)
- [Google Drive](https://drive.google.com/file/d/1uzeSdcLk5ilHaUTeJRNrf-_j59CQGe6r/view?usp=drivesdk)

Preprocessing scripts for the non-public datasets (NYT, NYT50, DUC) are placed in the data folder.
@@ -53,5 +52,10 @@ Models implemented in FastNLP include:

### Performance and Hyperparameters

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Paper |
| :--- | :---: | :---: | :---: | :--- |
See

## Abstractive Summarization
Still in Progress...
@@ -158,11 +158,11 @@ class SummarizationLoader(JsonLoader):
        logger.info("[INFO] Load existing vocab from %s!" % vocab_path)
        word_list = []
        with open(vocab_path, 'r', encoding='utf8') as vocab_f:
            cnt = 0
            cnt = 2  # pad and unk
            for line in vocab_f:
                cnt += 1
                pieces = line.split("\t")
                word_list.append(pieces[0])
                cnt += 1
                if cnt > vocab_size:
                    break
        vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK)
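The change above fixes an off-by-two: fastNLP's `Vocabulary` reserves two entries for the padding and unknown tokens, so the word counter has to start at 2 for the final vocabulary to stay within `vocab_size`. A minimal sketch of the corrected loop, assuming one `word<TAB>count` pair per line of the vocab file:

```python
# Sketch of the corrected vocab loading; the file format (word<TAB>count per
# line) is assumed from the split("\t") in the hunk above.
def load_word_list(vocab_path, vocab_size):
    word_list = []
    cnt = 2  # <pad> and <unk> already occupy two of the vocab_size slots
    with open(vocab_path, 'r', encoding='utf8') as vocab_f:
        for line in vocab_f:
            word_list.append(line.split("\t")[0])
            cnt += 1
            if cnt > vocab_size:
                break
    return word_list
```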
@@ -0,0 +1,136 @@ | |||
import numpy as np | |||
import torch | |||
import torch.nn as nn | |||
import torch.nn.init as init | |||
import torch.nn.functional as F | |||
from torch.autograd import Variable | |||
from torch.distributions import Bernoulli | |||
class DeepLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, recurrent_dropout,
                 use_orthnormal_init=True, fix_mask=True, use_cuda=True):
        super(DeepLSTM, self).__init__()
        self.fix_mask = fix_mask
        self.use_cuda = use_cuda
        self.input_size = input_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.recurrent_dropout = recurrent_dropout
        self.lstms = nn.ModuleList([None] * self.num_layers)
        self.highway_gate_input = nn.ModuleList([None] * self.num_layers)
        # Build one Linear per layer: a plain [module] * num_layers would reuse
        # a single module (and its weights) across all layers.
        self.highway_gate_state = nn.ModuleList([nn.Linear(hidden_size, hidden_size) for _ in range(self.num_layers)])
        self.highway_linear_input = nn.ModuleList([None] * self.num_layers)
        # self._input_w = nn.Parameter(torch.Tensor(input_size, hidden_size))
        # init.xavier_normal_(self._input_w)
        for l in range(self.num_layers):
            input_dim = input_size if l == 0 else hidden_size
            self.lstms[l] = nn.LSTMCell(input_size=input_dim, hidden_size=hidden_size)
            self.highway_gate_input[l] = nn.Linear(input_dim, hidden_size)
            self.highway_linear_input[l] = nn.Linear(input_dim, hidden_size, bias=False)
# logger.info("[INFO] Initing W for LSTM .......") | |||
for l in range(self.num_layers): | |||
if use_orthnormal_init: | |||
# logger.info("[INFO] Initing W using orthnormal init .......") | |||
init.orthogonal_(self.lstms[l].weight_ih) | |||
init.orthogonal_(self.lstms[l].weight_hh) | |||
init.orthogonal_(self.highway_gate_input[l].weight.data) | |||
init.orthogonal_(self.highway_gate_state[l].weight.data) | |||
init.orthogonal_(self.highway_linear_input[l].weight.data) | |||
else: | |||
# logger.info("[INFO] Initing W using xavier_normal .......") | |||
init_weight_value = 6.0 | |||
init.xavier_normal_(self.lstms[l].weight_ih, gain=np.sqrt(init_weight_value)) | |||
init.xavier_normal_(self.lstms[l].weight_hh, gain=np.sqrt(init_weight_value)) | |||
init.xavier_normal_(self.highway_gate_input[l].weight.data, gain=np.sqrt(init_weight_value)) | |||
init.xavier_normal_(self.highway_gate_state[l].weight.data, gain=np.sqrt(init_weight_value)) | |||
init.xavier_normal_(self.highway_linear_input[l].weight.data, gain=np.sqrt(init_weight_value)) | |||
    def init_hidden(self, batch_size, hidden_size):
        # Returns (h, c): the initial hidden state h and cell state c.
        if self.use_cuda:
            return (torch.zeros(batch_size, hidden_size).cuda(),
                    torch.zeros(batch_size, hidden_size).cuda())
        else:
            return (torch.zeros(batch_size, hidden_size),
                    torch.zeros(batch_size, hidden_size))
    def forward(self, inputs, input_masks, Train):
        '''
        inputs: [[seq_len, batch, Co * kernel_sizes], n_layer * [None]] (list)
        input_masks: [[seq_len, batch, 1], n_layer * [None]] (list)
        Train: True to apply recurrent dropout
        '''
        batch_size, seq_len = inputs[0].size(1), inputs[0].size(0)
        # inputs[0] = torch.matmul(inputs[0], self._input_w)
        # input_masks[0] = input_masks[0].unsqueeze(-1).expand(seq_len, batch_size, self.hidden_size)
        self.inputs = inputs
        self.input_masks = input_masks
        if self.fix_mask:
            # Variational ("fixed-mask") recurrent dropout: sample one mask per
            # layer and reuse it at every time step.
            self.output_dropout_layers = [None] * self.num_layers
            for l in range(self.num_layers):
                binary_mask = torch.rand((batch_size, self.hidden_size)) > self.recurrent_dropout
                # Scaling keeps the expected value of the masked tensor equal to
                # that of the unmasked tensor (cf. allennlp.nn.util).
                self.output_dropout_layers[l] = binary_mask.float().div(1.0 - self.recurrent_dropout)
                if self.use_cuda:
                    self.output_dropout_layers[l] = self.output_dropout_layers[l].cuda()
        for l in range(self.num_layers):
            h, c = self.init_hidden(batch_size, self.hidden_size)
            outputs_list = []
            for t in range(len(self.inputs[l])):
                x = self.inputs[l][t]
                m = self.input_masks[l][t].float()
                h_temp, c_temp = self.lstms[l](x, (h, c))  # [batch, hidden_size]
                # Highway connection: gate r mixes the LSTM output with a
                # linear transform of the input.
                r = torch.sigmoid(self.highway_gate_input[l](x) + self.highway_gate_state[l](h))
                lx = self.highway_linear_input[l](x)  # [batch, hidden_size]
                h_temp = r * h_temp + (1 - r) * lx
                if Train:
                    if self.fix_mask:
                        h_temp = self.output_dropout_layers[l] * h_temp
                    else:
                        h_temp = F.dropout(h_temp, p=self.recurrent_dropout)
                # Keep the previous state wherever the position is padding (m == 0).
                h = m * h_temp + (1 - m) * h
                c = m * c_temp + (1 - m) * c
                outputs_list.append(h)
            outputs = torch.stack(outputs_list, 0)  # [seq_len, batch, hidden_size]
            # Reverse the sequence between layers, so successive layers read in
            # alternating directions.
            self.inputs[l + 1] = DeepLSTM.flip(outputs, 0)  # [seq_len, batch, hidden_size]
            self.input_masks[l + 1] = DeepLSTM.flip(self.input_masks[l], 0)
        self.output_state = self.inputs  # num_layers * [seq_len, batch, hidden_size]
        # flip -2 layer
        # self.output_state[-2] = DeepLSTM.flip(self.output_state[-2], 0)
        # concat last two layers
        # self.output_state = torch.cat([self.output_state[-1], self.output_state[-2]], dim=-1).transpose(0, 1)
        self.output_state = self.output_state[-1].transpose(0, 1)
        assert self.output_state.size() == (batch_size, seq_len, self.hidden_size)
        return self.output_state
    @staticmethod
    def flip(x, dim):
        # Reverse x along the given dimension (negative dims normalized first).
        dim = x.dim() + dim if dim < 0 else dim
        return torch.flip(x, [dim])
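A minimal smoke-test sketch of the calling convention: the list-of-layers inputs and the output shape mirror how `SummarizationModel.forward` below drives `DeepLSTM`, but all sizes here are made up:

```python
# Illustrative sizes only; DeepLSTM expects one slot per layer in the input lists.
import torch

seq_len, batch, emb, hidden, n_layers = 7, 4, 32, 64, 2
lstm = DeepLSTM(emb, hidden, n_layers, recurrent_dropout=0.1, use_cuda=False)

inputs = [None] * (n_layers + 1)        # slot 0 holds the embeddings,
input_masks = [None] * (n_layers + 1)   # later slots are filled layer by layer
inputs[0] = torch.randn(seq_len, batch, emb)
input_masks[0] = torch.ones(seq_len, batch, 1)  # 1 = real position, 0 = padding

out = lstm(inputs, input_masks, Train=False)
assert out.size() == (batch, seq_len, hidden)
```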
@@ -0,0 +1,103 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import torch
import torch.nn as nn

from .Encoder import Encoder
from .DeepLSTM import DeepLSTM
from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
class SummarizationModel(nn.Module):
    def __init__(self, hps, embed):
        """
        :param hps: hyperparameters for the model
        :param embed: word embedding module
        """
        super(SummarizationModel, self).__init__()
        self._hps = hps

        # sentence encoder
        self.encoder = Encoder(hps, embed)

        # multi-layer highway LSTM
        self.num_layers = hps.n_layers
        self.sent_embedding_size = (hps.max_kernel_size - hps.min_kernel_size + 1) * hps.output_channel
        self.lstm_hidden_size = hps.lstm_hidden_size
        self.recurrent_dropout = hps.recurrent_dropout_prob
        self.deep_lstm = DeepLSTM(self.sent_embedding_size, self.lstm_hidden_size, self.num_layers,
                                  self.recurrent_dropout, hps.use_orthnormal_init, hps.fix_mask, hps.cuda)

        # multi-head attention
        self.n_head = hps.n_head
        self.d_v = self.d_k = int(self.lstm_hidden_size / hps.n_head)
        self.d_inner = hps.ffn_inner_hidden_size
        self.slf_attn = MultiHeadAttention(hps.n_head, self.lstm_hidden_size, self.d_k, self.d_v,
                                           dropout=hps.atten_dropout_prob)
        self.pos_ffn = PositionwiseFeedForward(self.d_v, self.d_inner, dropout=hps.ffn_dropout_prob)
        self.wh = nn.Linear(self.d_v, 2)
    def forward(self, input, input_len, Train):
        """
        :param input: [batch_size, N, seq_len], word idx long tensor
        :param input_len: [batch_size, N], 1 for a real sentence and 0 for padding
        :param Train: True for train, False for eval and test
        :return: dict with
            p_sent: [batch_size, N, 2]
            prediction: [batch_size, N]
            pred_idx: [batch_size, m], or None when m == 0
            (the multi-head attention weights [n_head, batch_size, N, N] are
            kept in self.output_slf_attn)
        """
        # -- Sentence encoder
        self.sent_embedding = self.encoder(input)  # [batch, N, Co * kernel_sizes]

        # -- Multi-layer highway LSTM
        input_len = input_len.float()  # [batch, N]
        self.inputs = [None] * (self.num_layers + 1)
        self.input_masks = [None] * (self.num_layers + 1)
        self.inputs[0] = self.sent_embedding.permute(1, 0, 2)  # [N, batch, Co * kernel_sizes]
        self.input_masks[0] = input_len.permute(1, 0).unsqueeze(2)
        self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train)  # [batch, N, hidden_size]

        # -- Prepare attention masks
        batch_size, N = input_len.size()
        slf_attn_mask = input_len.eq(0.0)  # [batch, N], 1 for padding
        slf_attn_mask = slf_attn_mask.unsqueeze(1).expand(-1, N, -1)  # [batch, N, N]

        # -- Multi-head attention
        self.atten_output, self.output_slf_attn = self.slf_attn(self.lstm_output_state, self.lstm_output_state,
                                                                self.lstm_output_state, mask=slf_attn_mask)
        self.atten_output *= input_len.unsqueeze(2)  # [batch_size, N, lstm_hidden_size = n_head * d_v]
        self.multi_atten_output = self.atten_output.view(batch_size, N, self.n_head, self.d_v)  # [batch_size, N, n_head, d_v]
        # Fuse the heads with alternating signs: even-indexed heads are added,
        # odd-indexed heads subtracted, yielding a single d_v-sized context.
        self.multi_atten_context = self.multi_atten_output[:, :, 0::2, :].sum(2) \
                                   - self.multi_atten_output[:, :, 1::2, :].sum(2)  # [batch_size, N, d_v]

        # -- Position-wise feed-forward network
        self.output_state = self.pos_ffn(self.multi_atten_context)
        self.output_state = self.output_state * input_len.unsqueeze(2)  # [batch_size, N, d_v]
        p_sent = self.wh(self.output_state)  # [batch, N, 2]
        idx = None
        if self._hps.m == 0:
            # Unconstrained: label each sentence with its argmax class.
            prediction = p_sent.view(-1, 2).max(1)[1]
            prediction = prediction.view(batch_size, -1)
        else:
            # Select the m sentences with the highest positive-class scores.
            mask_output = torch.exp(p_sent[:, :, 1])  # [batch, N]
            mask_output = mask_output.masked_fill(input_len.eq(0), 0)
            topk, idx = torch.topk(mask_output, self._hps.m)
            prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1)
            prediction = prediction.long().view(batch_size, -1)
            if self._hps.cuda:
                prediction = prediction.cuda()
        return {"p_sent": p_sent, "prediction": prediction, "pred_idx": idx}
@@ -50,8 +50,8 @@ class LabelFMetric(MetricBase):
        """
        target = target.data
        pred = pred.data
        logger.debug(pred.size())
        logger.debug(pred[:5, :])
        # logger.debug(pred.size())
        # logger.debug(pred[:5, :])
        batch, N = pred.size()
        self.pred += pred.sum()
        self.true += target.sum()
@@ -83,7 +83,6 @@ class TransformerModel(nn.Module):
        :param input: [batch_size, N, seq_len]
        :param input_len: [batch_size, N]
        :param return_atten: bool
        :return:
        """
        # Sentence Encoder
@@ -125,12 +124,12 @@ class TransformerModel(nn.Module):
        p_sent = self.wh(self.dec_output_state)  # [batch, N, 2]
        idx = None
        if self._hps == 0:
        if self._hps.m == 0:
            prediction = p_sent.view(-1, 2).max(1)[1]
            prediction = prediction.view(batch_size, -1)
        else:
            mask_output = torch.exp(p_sent[:, :, 1])  # [batch, N]
            mask_output = mask_output * input_len.float()
            mask_output = mask_output.masked_fill(input_len.eq(0), 0)
            topk, idx = torch.topk(mask_output, self._hps.m)
            prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1)
            prediction = prediction.long().view(batch_size, -1)
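Both models select sentences with the same `topk` + `scatter_` pattern; a standalone sketch with made-up scores:

```python
# Demonstrates the topk + scatter_ pattern used for sentence selection above.
import torch

scores = torch.tensor([[0.1, 0.9, 0.3, 0.7],
                       [0.8, 0.2, 0.6, 0.4]])  # [batch=2, N=4]
m = 2
topk, idx = torch.topk(scores, m)              # idx: [batch, m]
prediction = torch.zeros_like(scores).scatter_(1, idx, 1).long()
print(prediction)  # tensor([[0, 1, 0, 1], [1, 0, 1, 0]])
```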