diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py
index 7b0237dc..0e16d48c 100644
--- a/fastNLP/modules/__init__.py
+++ b/fastNLP/modules/__init__.py
@@ -17,22 +17,30 @@
 """
 __all__ = [
-    # "BertModel",
+    "BertModel",
+
     "ConvolutionCharEncoder",
     "LSTMCharEncoder",
+
     "ConvMaxpool",
+
     "LSTM",
+
     "StarTransformer",
+
     "TransformerEncoder",
+
     "VarRNN",
     "VarLSTM",
     "VarGRU",
-    
+
     "MaxPool",
     "MaxPoolWithMask",
     "AvgPool",
+    "AvgPoolWithMask",
+
     "MultiHeadAttention",
-    
+
     "MLP",
     "ConditionalRandomField",
     "viterbi_decode",
diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py
index 051a0c01..4946a70f 100644
--- a/fastNLP/modules/encoder/__init__.py
+++ b/fastNLP/modules/encoder/__init__.py
@@ -1,17 +1,17 @@
 __all__ = [
     "BertModel",
-    
+
     "ConvolutionCharEncoder",
     "LSTMCharEncoder",
-    
+
     "ConvMaxpool",
-    
+
     "LSTM",
-    
+
     "StarTransformer",
-    
+
     "TransformerEncoder",
-    
+
     "VarRNN",
     "VarLSTM",
     "VarGRU",
diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py
index 0a42d889..fe3f7fd8 100644
--- a/fastNLP/modules/encoder/attention.py
+++ b/fastNLP/modules/encoder/attention.py
@@ -8,8 +8,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 
-from fastNLP.modules.dropout import TimestepDropout
-
 from fastNLP.modules.utils import initial_parameter
 
 
@@ -18,7 +16,7 @@ class DotAttention(nn.Module):
     .. todo::
         Fill in the documentation.
     """
-    
+
     def __init__(self, key_size, value_size, dropout=0.0):
         super(DotAttention, self).__init__()
         self.key_size = key_size
@@ -26,7 +24,7 @@ class DotAttention(nn.Module):
         self.scale = math.sqrt(key_size)
         self.drop = nn.Dropout(dropout)
         self.softmax = nn.Softmax(dim=2)
-    
+
     def forward(self, Q, K, V, mask_out=None):
         """
 
@@ -45,7 +43,7 @@ class DotAttention(nn.Module):
 
 class MultiHeadAttention(nn.Module):
     """
-    Alias: :class:`fastNLP.modules.MultiHeadAttention`   :class:`fastNLP.modules.encoder.attention.MultiHeadAttention`
+    Alias: :class:`fastNLP.modules.MultiHeadAttention`   :class:`fastNLP.modules.encoder.MultiHeadAttention`
 
     :param input_size: int, the input dimension; this is also the output dimension.
     :param key_size: int, the dimension of each head.
@@ -53,14 +51,14 @@ class MultiHeadAttention(nn.Module):
     :param num_head: int, the number of heads.
     :param dropout: float.
     """
-    
+
     def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
         super(MultiHeadAttention, self).__init__()
         self.input_size = input_size
         self.key_size = key_size
         self.value_size = value_size
         self.num_head = num_head
-        
+
         in_size = key_size * num_head
         self.q_in = nn.Linear(input_size, in_size)
         self.k_in = nn.Linear(input_size, in_size)
@@ -69,14 +67,14 @@ class MultiHeadAttention(nn.Module):
         self.v_in = nn.Linear(input_size, int(value_size * num_head))
         self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout)
         self.out = nn.Linear(value_size * num_head, input_size)
         self.reset_parameters()
-    
+
     def reset_parameters(self):
         sqrt = math.sqrt
         nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
         nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
         nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size)))
         nn.init.xavier_normal_(self.out.weight)
-    
+
     def forward(self, Q, K, V, atte_mask_out=None):
         """
 
@@ -92,7 +90,7 @@ class MultiHeadAttention(nn.Module):
         q = self.q_in(Q).view(batch, sq, n_head, d_k)
         k = self.k_in(K).view(batch, sk, n_head, d_k)
         v = self.v_in(V).view(batch, sk, n_head, d_v)
-        
+
         # transpose q, k and v to do batch attention
         q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k)
         k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k)
@@ -100,7 +98,7 @@ class MultiHeadAttention(nn.Module):
         if atte_mask_out is not None:
             atte_mask_out = atte_mask_out.repeat(n_head, 1, 1)
         atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v)
-        
+
         # concat all heads, do output linear
         atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1)
         output = self.out(atte)
@@ -124,11 +122,11 @@ class BiAttention(nn.Module):
         \end{array}
 
     """
-    
+
     def __init__(self):
         super(BiAttention, self).__init__()
         self.inf = 10e12
-    
+
     def forward(self, in_x1, in_x2, x1_len, x2_len):
         """
         :param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] feature representation of the first sentence
@@ -139,36 +137,36 @@ class BiAttention(nn.Module):
             torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] attended feature representation of the second sentence
 
         """
-        
+
         assert in_x1.size()[0] == in_x2.size()[0]
         assert in_x1.size()[2] == in_x2.size()[2]
         # The batch size and hidden size must be equal.
         assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1]
         # The seq len in in_x and x_len must be equal.
         assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0]
-        
+
         batch_size = in_x1.size()[0]
         x1_max_len = in_x1.size()[1]
         x2_max_len = in_x2.size()[1]
-        
+
         in_x2_t = torch.transpose(in_x2, 1, 2)  # [batch_size, hidden_size, x2_seq_len]
-        
+
         attention_matrix = torch.bmm(in_x1, in_x2_t)  # [batch_size, x1_seq_len, x2_seq_len]
-        
+
         a_mask = x1_len.le(0.5).float() * -self.inf  # [batch_size, x1_seq_len]
         a_mask = a_mask.view(batch_size, x1_max_len, -1)
         a_mask = a_mask.expand(-1, -1, x2_max_len)  # [batch_size, x1_seq_len, x2_seq_len]
         b_mask = x2_len.le(0.5).float() * -self.inf
         b_mask = b_mask.view(batch_size, -1, x2_max_len)
         b_mask = b_mask.expand(-1, x1_max_len, -1)  # [batch_size, x1_seq_len, x2_seq_len]
-        
+
         attention_a = F.softmax(attention_matrix + a_mask, dim=2)  # [batch_size, x1_seq_len, x2_seq_len]
         attention_b = F.softmax(attention_matrix + b_mask, dim=1)  # [batch_size, x1_seq_len, x2_seq_len]
-        
+
         out_x1 = torch.bmm(attention_a, in_x2)  # [batch_size, x1_seq_len, hidden_size]
         attention_b_t = torch.transpose(attention_b, 1, 2)
         out_x2 = torch.bmm(attention_b_t, in_x1)  # [batch_size, x2_seq_len, hidden_size]
-        
+
         return out_x1, out_x2
@@ -182,10 +180,10 @@ class SelfAttention(nn.Module):
     :param float drop: dropout probability, 0.5 by default
     :param str initial_method: method used to initialize parameters
     """
-    
+
    def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ):
         super(SelfAttention, self).__init__()
-        
+
         self.attention_hops = attention_hops
         self.ws1 = nn.Linear(input_size, attention_unit, bias=False)
         self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False)
@@ -194,7 +192,7 @@ class SelfAttention(nn.Module):
         self.drop = nn.Dropout(drop)
         self.tanh = nn.Tanh()
         initial_parameter(self, initial_method)
-    
+
     def _penalization(self, attention):
         """
         compute the penalization term for attention module
@@ -208,7 +206,7 @@ class SelfAttention(nn.Module):
         mat = torch.bmm(attention, attention_t) - self.I[:attention.size(0)]
         ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5
         return torch.sum(ret) / size[0]
-    
+
     def forward(self, input, input_origin):
         """
         :param torch.Tensor input: [baz, senLen, h_dim] the matrix to apply attention to
@@ -218,14 +216,14 @@ class SelfAttention(nn.Module):
         """
         input = input.contiguous()
         size = input.size()  # [bsz, len, nhid]
-        
+
         input_origin = input_origin.expand(self.attention_hops, -1, -1)  # [hops,baz, len]
         input_origin = input_origin.transpose(0, 1).contiguous()  # [baz, hops,len]
-        
+
         y1 = self.tanh(self.ws1(self.drop(input)))  # [baz,len,dim] -->[bsz,len, attention-unit]
         attention = self.ws2(y1).transpose(1, 2).contiguous()  # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len]
-        
+
         attention = attention + (-999999 * (input_origin == 0).float())  # remove the weight on padding token.
         attention = F.softmax(attention, 2)  # [baz ,hop, len]
         return torch.bmm(attention, input), self._penalization(attention)  # output1 --> [baz ,hop ,nhid]
diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py
index 6d32ae74..ddd22ed0 100644
--- a/fastNLP/modules/encoder/bert.py
+++ b/fastNLP/modules/encoder/bert.py
@@ -1,11 +1,11 @@
-
-
-
 """
 The code on this page draws heavily on (and copy-pastes from) https://github.com/huggingface/pytorch-pretrained-BERT;
 if you find this code useful, please cite them as well.
 """
+__all__ = [
+    "BertModel"
+]
 
 import collections
@@ -26,6 +26,7 @@ CONFIG_FILE = 'bert_config.json'
 class BertConfig(object):
     """Configuration class to store the configuration of a `BertModel`.
     """
+
     def __init__(self,
                  vocab_size_or_config_json_file,
                  hidden_size=768,
@@ -65,7 +66,7 @@ class BertConfig(object):
             layer_norm_eps: The epsilon used by LayerNorm.
         """
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
+                                                               and isinstance(vocab_size_or_config_json_file, unicode)):
             with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
@@ -150,6 +151,7 @@ class BertLayerNorm(nn.Module):
 class BertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings.
     """
+
     def __init__(self, config):
         super(BertEmbeddings, self).__init__()
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
@@ -331,7 +333,10 @@ class BertPooler(nn.Module):
 
 class BertModel(nn.Module):
-    """BERT(Bidirectional Embedding Representations from Transformers).
+    """
+    Alias: :class:`fastNLP.modules.BertModel`   :class:`fastNLP.modules.encoder.BertModel`
+
+    BERT(Bidirectional Embedding Representations from Transformers).
 
     If you want to use pretrained weights, download them from the following sources.
     sources::
@@ -449,9 +454,9 @@ class BertModel(nn.Module):
         model = cls(config, *inputs, **kwargs)
         if state_dict is None:
             files = glob.glob(os.path.join(pretrained_model_dir, '*.bin'))
-            if len(files)==0:
+            if len(files) == 0:
                 raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}")
-            elif len(files)>1:
+            elif len(files) > 1:
                 raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}")
             weights_path = files[0]
             state_dict = torch.load(weights_path, map_location='cpu')
@@ -580,6 +585,7 @@ def load_vocab(vocab_file):
         index += 1
     return vocab
 
+
 class BasicTokenizer(object):
     """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
@@ -765,8 +771,8 @@ class BertTokenizer(object):
             [(ids, tok) for tok, ids in self.vocab.items()])
         self.do_basic_tokenize = do_basic_tokenize
         if do_basic_tokenize:
-            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 
-                                              never_split=never_split)
+            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                  never_split=never_split)
         self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
         self.max_len = max_len if max_len is not None else int(1e12)
@@ -821,7 +827,7 @@ class BertTokenizer(object):
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:
                     print("Saving vocabulary to {}: vocabulary indices are not consecutive."
- " Please check that the vocabulary is not corrupted!".format(vocab_file)) + " Please check that the vocabulary is not corrupted!".format(vocab_file)) index = token_index writer.write(token + u'\n') index += 1 @@ -841,6 +847,7 @@ class BertTokenizer(object): tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) return tokenizer + VOCAB_NAME = 'vocab.txt' @@ -849,7 +856,8 @@ class _WordPieceBertModel(nn.Module): 这个模块用于直接计算word_piece的结果. """ - def __init__(self, model_dir:str, layers:str='-1'): + + def __init__(self, model_dir: str, layers: str = '-1'): super().__init__() self.tokenzier = BertTokenizer.from_pretrained(model_dir) @@ -858,11 +866,11 @@ class _WordPieceBertModel(nn.Module): encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ f"a bert model with {encoder_layer_number} layers." else: - assert layer