@@ -4,7 +4,7 @@
 __all__ = [
     "SeqLabeling",
     "AdvSeqLabel",
-    # "BiLSTMCRF"
+    "BiLSTMCRF"
 ]
 
 import torch
@@ -14,7 +14,6 @@ import torch.nn.functional as F
 from .base_model import BaseModel
 from ..core.const import Const as C
 from ..core.utils import seq_len_to_mask
-from ..embeddings import embedding
 from ..embeddings import get_embeddings
 from ..modules import ConditionalRandomField
 from ..modules import LSTM
@@ -24,18 +23,15 @@ from ..modules.decoder.crf import allowed_transitions
 
 class BiLSTMCRF(BaseModel):
     """
-    The architecture is BiLSTM + FC + Dropout + CRF.
+    The architecture is embedding + BiLSTM + FC + Dropout + CRF.
 
-    .. todo::
-        Documentation to be completed
-
-    :param embed: tuple:
-    :param num_classes:
-    :param num_layers:
-    :param hidden_size:
-    :param dropout:
-    :param target_vocab:
-    :param encoding_type:
+    :param embed: supports (1) any of fastNLP's Embedding classes, or (2) a tuple giving num_embedding and dimension, e.g. (1000, 100)
+    :param num_classes: total number of classes
+    :param num_layers: number of BiLSTM layers
+    :param hidden_size: hidden_size of the BiLSTM; the effective hidden size is twice this value (forward plus backward)
+    :param dropout: dropout probability; 0 means no dropout
+    :param target_vocab: a Vocabulary object holding the target-to-index mapping
+    :param encoding_type: the encoding scheme; 'bioes', 'bmes', 'bio', 'bmeso', etc. are supported
     """
     def __init__(self, embed, num_classes, num_layers=1, hidden_size=100, dropout=0.5,
                  target_vocab=None, encoding_type=None):
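
A minimal construction sketch for the newly exported BiLSTMCRF, matching the signature in this hunk; the import path is inferred from the __all__ export above, and the vocabulary size and dimensions are illustrative assumptions, not part of the patch:

    import torch
    from fastNLP.models import BiLSTMCRF  # import path assumed from the __all__ export

    # embed as a (num_embeddings, dimension) tuple, one of the two forms the new
    # docstring documents; 1000 and 100 are arbitrary illustrative values
    model = BiLSTMCRF(embed=(1000, 100), num_classes=5, num_layers=1,
                      hidden_size=100, dropout=0.5)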
@@ -86,21 +82,20 @@ class SeqLabeling(BaseModel):
     """
-    A basic sequence labeling model.
+    A base class for sequence labeling. The architecture consists of one Embedding layer, one (unidirectional, single-layer) LSTM, one FC layer, and one CRF layer.
 
-    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: the Embedding's size (pass a tuple(int, int)
-        whose first int is vocab_size and second int is embed_dim); if a Tensor, Embedding, ndarray, etc. is given, it is used directly to initialize the Embedding
+    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: the Embedding's size (pass a tuple(int, int)
+        whose first int is vocab_size and second int is embed_dim); if a Tensor, Embedding, ndarray, etc. is given, it is used directly to initialize the Embedding
     :param int hidden_size: size of the LSTM hidden layer
     :param int num_classes: total number of classes
     """
 
-    def __init__(self, init_embed, hidden_size, num_classes):
+    def __init__(self, embed, hidden_size, num_classes):
         super(SeqLabeling, self).__init__()
 
-        self.Embedding = embedding.Embedding(init_embed)
-        self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size)
-        self.Linear = nn.Linear(hidden_size, num_classes)
-        self.Crf = decoder.ConditionalRandomField(num_classes)
-        self.mask = None
+        self.embedding = get_embeddings(embed)
+        self.rnn = encoder.LSTM(self.embedding.embedding_dim, hidden_size)
+        self.fc = nn.Linear(hidden_size, num_classes)
+        self.crf = decoder.ConditionalRandomField(num_classes)
 
     def forward(self, words, seq_len, target):
         """
         :param torch.LongTensor words: [batch_size, max_len], token indices of the sequence
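
The hunk above swaps embedding.Embedding(init_embed) for get_embeddings(embed), which is what lets embed be a tuple, a prebuilt embedding module, or raw weights. Roughly, the helper dispatches as in this simplified sketch (an approximation; the real implementation lives in fastNLP.embeddings and handles more cases):

    import numpy as np
    import torch
    from torch import nn

    def get_embeddings_sketch(init_embed):
        # (num_embeddings, dimension) tuple -> freshly initialized lookup table
        if isinstance(init_embed, tuple):
            return nn.Embedding(num_embeddings=init_embed[0], embedding_dim=init_embed[1])
        # an existing embedding module (e.g. any fastNLP Embedding) is used as-is
        if isinstance(init_embed, nn.Module):
            return init_embed
        # an ndarray of pretrained weights is converted, then loaded directly
        if isinstance(init_embed, np.ndarray):
            init_embed = torch.tensor(init_embed, dtype=torch.float32)
        return nn.Embedding.from_pretrained(init_embed, freeze=False)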
@@ -109,17 +104,14 @@ class SeqLabeling(BaseModel):
         :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
             If truth is not None, return loss, a scalar. Used in training.
         """
-        assert words.shape[0] == seq_len.shape[0]
-        assert target.shape == words.shape
-        self.mask = self._make_mask(words, seq_len)
-
-        x = self.Embedding(words)
+        mask = seq_len_to_mask(seq_len, max_len=words.size(1))
+        x = self.embedding(words)
         # [batch_size, max_len, word_emb_dim]
-        x, _ = self.Rnn(x, seq_len)
+        x, _ = self.rnn(x, seq_len)
         # [batch_size, max_len, hidden_size * direction]
-        x = self.Linear(x)
+        x = self.fc(x)
         # [batch_size, max_len, num_classes]
-        return {C.LOSS: self._internal_loss(x, target)}
+        return {C.LOSS: self._internal_loss(x, target, mask)}
 
     def predict(self, words, seq_len):
         """
@@ -129,18 +121,18 @@ class SeqLabeling(BaseModel):
         :param torch.LongTensor seq_len: [batch_size,]
         :return: {'pred': xx}, [batch_size, max_len]
         """
-        self.mask = self._make_mask(words, seq_len)
+        mask = seq_len_to_mask(seq_len, max_len=words.size(1))
 
-        x = self.Embedding(words)
+        x = self.embedding(words)
         # [batch_size, max_len, word_emb_dim]
-        x, _ = self.Rnn(x, seq_len)
+        x, _ = self.rnn(x, seq_len)
         # [batch_size, max_len, hidden_size * direction]
-        x = self.Linear(x)
+        x = self.fc(x)
         # [batch_size, max_len, num_classes]
-        pred = self._decode(x)
+        pred = self._decode(x, mask)
         return {C.OUTPUT: pred}
 
-    def _internal_loss(self, x, y):
+    def _internal_loss(self, x, y, mask):
         """
         Negative log likelihood loss.
         :param x: Tensor, [batch_size, max_len, tag_size]
@@ -152,22 +144,15 @@ class SeqLabeling(BaseModel):
         """
         x = x.float()
         y = y.long()
         assert x.shape[:2] == y.shape
-        assert y.shape == self.mask.shape
-        total_loss = self.Crf(x, y, self.mask)
+        total_loss = self.crf(x, y, mask)
         return torch.mean(total_loss)
 
-    def _make_mask(self, x, seq_len):
-        batch_size, max_len = x.size(0), x.size(1)
-        mask = seq_len_to_mask(seq_len)
-        mask = mask.view(batch_size, max_len)
-        mask = mask.to(x).float()
-        return mask
-
-    def _decode(self, x):
+    def _decode(self, x, mask):
         """
         :param torch.FloatTensor x: [batch_size, max_len, tag_size]
         :return prediction: [batch_size, max_len]
         """
-        tag_seq, _ = self.Crf.viterbi_decode(x, self.mask)
+        tag_seq, _ = self.crf.viterbi_decode(x, mask)
         return tag_seq
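
The deleted _make_mask was only a thin wrapper: seq_len_to_mask already produces the [batch_size, max_len] mask the CRF consumes, and the max_len keyword used at the new call sites pads it to the width of the words tensor. For example (output dtype is uint8 or bool depending on the PyTorch/fastNLP version):

    import torch
    from fastNLP.core.utils import seq_len_to_mask

    seq_len = torch.tensor([3, 1, 2])
    mask = seq_len_to_mask(seq_len, max_len=4)
    # [[1, 1, 1, 0],
    #  [1, 0, 0, 0],
    #  [1, 1, 0, 0]]  -- one row per sequence, padding positions zeroed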
@@ -177,7 +162,7 @@ class AdvSeqLabel(nn.Module):
 
     A more sophisticated sequence labelling model. The architecture is Embedding, LayerNorm, bidirectional LSTM (two layers), FC, LayerNorm, Dropout, FC, CRF.
 
-    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: the Embedding's size (pass a tuple(int, int)
+    :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray embed: the Embedding's size (pass a tuple(int, int)
         whose first int is vocab_size and second int is embed_dim); if a Tensor, Embedding, ndarray, etc. is given, it is used directly to initialize the Embedding
     :param int hidden_size: size of the LSTM hidden layer
     :param int num_classes: number of classes
@@ -188,11 +173,11 @@ class AdvSeqLabel(nn.Module):
     :param str encoding_type: supports "BIO", "BMES", "BEMSO"; only meaningful when id2words is not None.
     """
 
-    def __init__(self, init_embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'):
+    def __init__(self, embed, hidden_size, num_classes, dropout=0.3, id2words=None, encoding_type='bmes'):
 
         super().__init__()
 
-        self.Embedding = embedding.Embedding(init_embed)
+        self.Embedding = get_embeddings(embed)
         self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim)
         self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2,
                                 dropout=dropout,
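
And a matching construction sketch for AdvSeqLabel under the renamed embed parameter. The tag mapping is illustrative; passing id2words presumably feeds allowed_transitions (imported above) to constrain the CRF, per the encoding_type docstring:

    from fastNLP.models import AdvSeqLabel

    # illustrative index -> tag mapping for a BMES scheme
    id2words = {0: 'B', 1: 'M', 2: 'E', 3: 'S'}
    model = AdvSeqLabel(embed=(1000, 100), hidden_size=200, num_classes=4,
                        dropout=0.3, id2words=id2words, encoding_type='bmes')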
|