@@ -8,8 +8,7 @@ | |||||
FastNLP中实现的模型包括:

1. Get To The Point: Summarization with Pointer-Generator Networks (See et al. 2017)
2. Searching for Effective Neural Extractive Summarization: What Works and What's Next (Zhong et al. 2019)
@@ -32,8 +31,8 @@ FastNLP中实现的模型包括: | |||||
其中公开数据集(CNN/DailyMail, Newsroom, arXiv, PubMed)预处理之后的下载地址:

- [百度云盘](https://pan.baidu.com/s/11qWnDjK9lb33mFZ9vuYlzA) (提取码:h1px)
- [Google Drive](https://drive.google.com/file/d/1uzeSdcLk5ilHaUTeJRNrf-_j59CQGe6r/view?usp=drivesdk)

未公开数据集(NYT, NYT50, DUC)数据处理部分脚本放置于data文件夹
@@ -53,5 +52,10 @@ FastNLP中实现的模型包括: | |||||
### Performance and Hyperparameters

| Model | ROUGE-1 | ROUGE-2 | ROUGE-L | Paper |
| ----- | ------- | ------- | ------- | ----- |
<!-- TODO: result rows were truncated in the source ("See ..."); restore the per-model numbers -->

## Abstractive Summarization
Still in Progress...
@@ -158,11 +158,11 @@ class SummarizationLoader(JsonLoader): | |||||
logger.info("[INFO] Load existing vocab from %s!" % vocab_path) | logger.info("[INFO] Load existing vocab from %s!" % vocab_path) | ||||
word_list = [] | word_list = [] | ||||
with open(vocab_path, 'r', encoding='utf8') as vocab_f: | with open(vocab_path, 'r', encoding='utf8') as vocab_f: | ||||
cnt = 0 | |||||
cnt = 2 # pad and unk | |||||
for line in vocab_f: | for line in vocab_f: | ||||
cnt += 1 | |||||
pieces = line.split("\t") | pieces = line.split("\t") | ||||
word_list.append(pieces[0]) | word_list.append(pieces[0]) | ||||
cnt += 1 | |||||
if cnt > vocab_size: | if cnt > vocab_size: | ||||
break | break | ||||
vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) | vocabs = Vocabulary(max_size=vocab_size, padding=WORD_PAD, unknown=WORD_UNK) | ||||
@@ -0,0 +1,136 @@ | |||||
import numpy as np | |||||
import torch | |||||
import torch.nn as nn | |||||
import torch.nn.init as init | |||||
import torch.nn.functional as F | |||||
from torch.autograd import Variable | |||||
from torch.distributions import Bernoulli | |||||
class DeepLSTM(nn.Module):
    """Multi-layer LSTM with highway connections between layers.

    Each layer runs an ``nn.LSTMCell`` over time; a highway gate mixes the
    cell's new hidden state with a linear projection of the layer input.
    The sequence is time-reversed between layers, so successive layers read
    the input in alternating directions.
    """

    def __init__(self, input_size, hidden_size, num_layers, recurrent_dropout,
                 use_orthnormal_init=True, fix_mask=True, use_cuda=True):
        """
        :param input_size: feature size of the first layer's input
        :param hidden_size: hidden size of every LSTM layer
        :param num_layers: number of stacked layers
        :param recurrent_dropout: dropout probability applied to each layer's
            hidden output during training
        :param use_orthnormal_init: orthogonal init if True, else xavier-normal
        :param fix_mask: if True, sample ONE dropout mask per layer and reuse
            it at every time step (variational dropout) instead of i.i.d.
            dropout per step
        :param use_cuda: place hidden states (and dropout masks) on GPU
        """
        super(DeepLSTM, self).__init__()
        self.fix_mask = fix_mask
        self.use_cuda = use_cuda
        self.input_size = input_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.recurrent_dropout = recurrent_dropout

        self.lstms = nn.ModuleList([None] * self.num_layers)
        self.highway_gate_input = nn.ModuleList([None] * self.num_layers)
        # BUGFIX: the original used `[nn.Linear(h, h)] * num_layers`, which
        # replicates ONE module object, so every layer shared the same gate
        # parameters. Build a distinct Linear per layer instead.
        self.highway_gate_state = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(self.num_layers)])
        self.highway_linear_input = nn.ModuleList([None] * self.num_layers)

        for l in range(self.num_layers):
            # First layer consumes the raw input; later layers consume the
            # previous layer's (reversed) hidden states.
            input_dim = input_size if l == 0 else hidden_size
            self.lstms[l] = nn.LSTMCell(input_size=input_dim, hidden_size=hidden_size)
            self.highway_gate_input[l] = nn.Linear(input_dim, hidden_size)
            self.highway_linear_input[l] = nn.Linear(input_dim, hidden_size, bias=False)

        for l in range(self.num_layers):
            if use_orthnormal_init:
                init.orthogonal_(self.lstms[l].weight_ih)
                init.orthogonal_(self.lstms[l].weight_hh)
                init.orthogonal_(self.highway_gate_input[l].weight.data)
                init.orthogonal_(self.highway_gate_state[l].weight.data)
                init.orthogonal_(self.highway_linear_input[l].weight.data)
            else:
                init_weight_value = 6.0
                gain = np.sqrt(init_weight_value)
                init.xavier_normal_(self.lstms[l].weight_ih, gain=gain)
                init.xavier_normal_(self.lstms[l].weight_hh, gain=gain)
                init.xavier_normal_(self.highway_gate_input[l].weight.data, gain=gain)
                init.xavier_normal_(self.highway_gate_state[l].weight.data, gain=gain)
                init.xavier_normal_(self.highway_linear_input[l].weight.data, gain=gain)

    def init_hidden(self, batch_size, hidden_size):
        """Return zero-initialized ``(h, c)`` for one layer."""
        h = torch.zeros(batch_size, hidden_size)
        c = torch.zeros(batch_size, hidden_size)
        if self.use_cuda:
            return h.cuda(), c.cuda()
        return h, c

    def forward(self, inputs, input_masks, Train):
        '''
        Run the stacked highway LSTM.

        :param inputs: list of length num_layers + 1; inputs[0] is a
            [seq_len, batch, feature] tensor, the remaining slots are None
            placeholders that get filled with each layer's time-reversed output.
        :param input_masks: list of the same length; input_masks[0] is a
            [seq_len, batch, 1] float mask (1 for real positions, 0 for padding).
        :param Train: apply recurrent dropout when True
        :return: [batch, seq_len, hidden_size] output of the last layer
        '''
        batch_size, seq_len = inputs[0].size(1), inputs[0].size(0)

        self.inputs = inputs
        self.input_masks = input_masks

        if self.fix_mask:
            # One fixed dropout mask per layer, reused at every time step.
            # Scaling by 1/(1-p) keeps the expected activation unchanged
            # (same trick as allennlp.nn.util).
            self.output_dropout_layers = [None] * self.num_layers
            for l in range(self.num_layers):
                binary_mask = torch.rand((batch_size, self.hidden_size)) > self.recurrent_dropout
                self.output_dropout_layers[l] = binary_mask.float().div(1.0 - self.recurrent_dropout)
                if self.use_cuda:
                    self.output_dropout_layers[l] = self.output_dropout_layers[l].cuda()

        for l in range(self.num_layers):
            h, c = self.init_hidden(batch_size, self.hidden_size)
            outputs_list = []
            for t in range(len(self.inputs[l])):
                x = self.inputs[l][t]
                m = self.input_masks[l][t].float()
                h_temp, c_temp = self.lstms[l].forward(x, (h, c))  # [batch, hidden_size]
                # Highway connection: gate between the new LSTM state and a
                # linear projection of the layer input.
                r = torch.sigmoid(self.highway_gate_input[l](x) + self.highway_gate_state[l](h))
                lx = self.highway_linear_input[l](x)  # [batch, hidden_size]
                h_temp = r * h_temp + (1 - r) * lx
                if Train:
                    if self.fix_mask:
                        h_temp = self.output_dropout_layers[l] * h_temp
                    else:
                        h_temp = F.dropout(h_temp, p=self.recurrent_dropout)
                # Padded positions carry the previous state forward unchanged.
                h = m * h_temp + (1 - m) * h
                c = m * c_temp + (1 - m) * c
                outputs_list.append(h)
            outputs = torch.stack(outputs_list, 0)  # [seq_len, batch, hidden_size]
            # Reverse in time so the next layer reads the opposite direction.
            self.inputs[l + 1] = DeepLSTM.flip(outputs, 0)
            self.input_masks[l + 1] = DeepLSTM.flip(self.input_masks[l], 0)

        self.output_state = self.inputs[-1].transpose(0, 1)  # [batch, seq_len, hidden_size]
        assert self.output_state.size() == (batch_size, seq_len, self.hidden_size)
        return self.output_state

    @staticmethod
    def flip(x, dim):
        """Reverse tensor ``x`` along dimension ``dim`` (like ``torch.flip(x, [dim])``)."""
        xsize = x.size()
        dim = x.dim() + dim if dim < 0 else dim
        x = x.contiguous().view(-1, *xsize[dim:])
        x = x.view(x.size(0), x.size(1), -1)
        rev_idx = torch.arange(x.size(1) - 1, -1, -1, device=x.device).long()
        x = x[:, rev_idx, :]
        return x.view(xsize)
@@ -0,0 +1,103 @@ | |||||
from __future__ import absolute_import | |||||
from __future__ import division | |||||
from __future__ import print_function | |||||
import torch | |||||
import torch.nn as nn | |||||
from torch.autograd import * | |||||
from torch.distributions import * | |||||
from .Encoder import Encoder | |||||
from .DeepLSTM import DeepLSTM | |||||
from transformer.SubLayers import MultiHeadAttention,PositionwiseFeedForward | |||||
class SummarizationModel(nn.Module):
    """Extractive summarization model.

    Pipeline: CNN sentence Encoder -> multi-layer highway DeepLSTM ->
    multi-head self-attention -> position-wise FFN -> per-sentence
    2-way (select / skip) classifier.
    """

    def __init__(self, hps, embed):
        """
        :param hps: hyperparameters for the model (n_layers, lstm_hidden_size,
            n_head, kernel sizes, dropout probs, m, cuda, ...)
        :param embed: word embedding handed through to the sentence Encoder
        """
        super(SummarizationModel, self).__init__()

        self._hps = hps

        # sentence encoder
        self.encoder = Encoder(hps, embed)

        # Multi-layer highway lstm
        self.num_layers = hps.n_layers
        # sentence embedding size = (number of CNN kernel widths) * output channels
        self.sent_embedding_size = (hps.max_kernel_size - hps.min_kernel_size + 1) * hps.output_channel
        self.lstm_hidden_size = hps.lstm_hidden_size
        self.recurrent_dropout = hps.recurrent_dropout_prob
        self.deep_lstm = DeepLSTM(self.sent_embedding_size, self.lstm_hidden_size, self.num_layers, self.recurrent_dropout,
                                  hps.use_orthnormal_init, hps.fix_mask, hps.cuda)

        # Multi-head attention; d_k = d_v = lstm_hidden_size / n_head
        self.n_head = hps.n_head
        self.d_v = self.d_k = int(self.lstm_hidden_size / hps.n_head)
        self.d_inner = hps.ffn_inner_hidden_size
        self.slf_attn = MultiHeadAttention(hps.n_head, self.lstm_hidden_size , self.d_k, self.d_v, dropout=hps.atten_dropout_prob)
        self.pos_ffn = PositionwiseFeedForward(self.d_v, self.d_inner, dropout = hps.ffn_dropout_prob)

        # per-sentence binary classification head
        self.wh = nn.Linear(self.d_v, 2)

    def forward(self, input, input_len, Train):
        """
        :param input: [batch_size, N, seq_len], word idx long tensor
        :param input_len: [batch_size, N], 1 for sentence and 0 for padding
        :param Train: True for train and False for eval and test
        :return: dict with
            p_sent: [batch_size, N, 2] per-sentence scores
            prediction: [batch_size, N] 0/1 selection mask
            pred_idx: indices of the top-m sentences (None when hps.m == 0)
        """
        # -- Sentence Encoder
        self.sent_embedding = self.encoder(input)  # [batch, N, Co * kernel_sizes]

        # -- Multi-layer highway lstm
        input_len = input_len.float()  # [batch, N]
        # DeepLSTM expects lists of length num_layers + 1: slot 0 holds the
        # time-major input; the remaining slots are filled layer by layer.
        self.inputs = [None] * (self.num_layers + 1)
        self.input_masks = [None] * (self.num_layers + 1)
        self.inputs[0] = self.sent_embedding.permute(1, 0, 2)  # [N, batch, Co * kernel_sizes]
        self.input_masks[0] = input_len.permute(1, 0).unsqueeze(2)

        self.lstm_output_state = self.deep_lstm(self.inputs, self.input_masks, Train)  # [batch, N, hidden_size]

        # -- Prepare masks
        batch_size, N = input_len.size()
        slf_attn_mask = input_len.eq(0.0)  # [batch, N], 1 for padding
        slf_attn_mask = slf_attn_mask.unsqueeze(1).expand(-1, N, -1)  # [batch, N, N]

        # -- Multi-head attention (self-attention over sentence states)
        self.atten_output, self.output_slf_attn = self.slf_attn(self.lstm_output_state, self.lstm_output_state, self.lstm_output_state, mask=slf_attn_mask)
        # zero out padded sentences
        self.atten_output *= input_len.unsqueeze(2)  # [batch_size, N, lstm_hidden_size = (n_head * d_v)]
        self.multi_atten_output = self.atten_output.view(batch_size, N, self.n_head, self.d_v)  # [batch_size, N, n_head, d_v]
        # Combine heads with alternating sign: even-indexed heads are added,
        # odd-indexed heads subtracted. NOTE(review): unusual head reduction --
        # presumably intentional (contrastive mixing of heads); confirm.
        self.multi_atten_context = self.multi_atten_output[:, :, 0::2, :].sum(2) - self.multi_atten_output[:, :, 1::2, :].sum(2)  # [batch_size, N, d_v]

        # -- Position-wise Feed-Forward Networks
        self.output_state = self.pos_ffn(self.multi_atten_context)
        self.output_state = self.output_state * input_len.unsqueeze(2)  # [batch_size, N, d_v]

        p_sent = self.wh(self.output_state)  # [batch, N, 2]

        idx = None
        if self._hps.m == 0:
            # m == 0: label every sentence independently by argmax over 2 classes
            prediction = p_sent.view(-1, 2).max(1)[1]
            prediction = prediction.view(batch_size, -1)
        else:
            # m > 0: select exactly the top-m scoring sentences per document
            mask_output = torch.exp(p_sent[:, :, 1])  # [batch, N]
            mask_output = mask_output.masked_fill(input_len.eq(0), 0)  # never pick padding
            topk, idx = torch.topk(mask_output, self._hps.m)
            prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1)
            prediction = prediction.long().view(batch_size, -1)
            if self._hps.cuda:
                prediction = prediction.cuda()

        return {"p_sent": p_sent, "prediction": prediction, "pred_idx": idx}
@@ -50,8 +50,8 @@ class LabelFMetric(MetricBase): | |||||
""" | """ | ||||
target = target.data | target = target.data | ||||
pred = pred.data | pred = pred.data | ||||
logger.debug(pred.size()) | |||||
logger.debug(pred[:5,:]) | |||||
# logger.debug(pred.size()) | |||||
# logger.debug(pred[:5,:]) | |||||
batch, N = pred.size() | batch, N = pred.size() | ||||
self.pred += pred.sum() | self.pred += pred.sum() | ||||
self.true += target.sum() | self.true += target.sum() | ||||
@@ -83,7 +83,6 @@ class TransformerModel(nn.Module): | |||||
:param input: [batch_size, N, seq_len] | :param input: [batch_size, N, seq_len] | ||||
:param input_len: [batch_size, N] | :param input_len: [batch_size, N] | ||||
:param return_atten: bool | |||||
:return: | :return: | ||||
""" | """ | ||||
# Sentence Encoder | # Sentence Encoder | ||||
@@ -125,12 +124,12 @@ class TransformerModel(nn.Module): | |||||
p_sent = self.wh(self.dec_output_state) # [batch, N, 2] | p_sent = self.wh(self.dec_output_state) # [batch, N, 2] | ||||
idx = None | idx = None | ||||
if self._hps == 0: | |||||
if self._hps.m == 0: | |||||
prediction = p_sent.view(-1, 2).max(1)[1] | prediction = p_sent.view(-1, 2).max(1)[1] | ||||
prediction = prediction.view(batch_size, -1) | prediction = prediction.view(batch_size, -1) | ||||
else: | else: | ||||
mask_output = torch.exp(p_sent[:, :, 1]) # # [batch, N] | mask_output = torch.exp(p_sent[:, :, 1]) # # [batch, N] | ||||
mask_output = mask_output * input_len.float() | |||||
mask_output = mask_output.masked_fill(input_len.eq(0), 0) | |||||
topk, idx = torch.topk(mask_output, self._hps.m) | topk, idx = torch.topk(mask_output, self._hps.m) | ||||
prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1) | prediction = torch.zeros(batch_size, N).scatter_(1, idx.data.cpu(), 1) | ||||
prediction = prediction.long().view(batch_size, -1) | prediction = prediction.long().view(batch_size, -1) | ||||