Browse Source

增加fastNLP.embeddings模块并修改对应的现有代码以适配fastNLP.embeddings

tags/v0.4.10
xuyige 5 years ago
parent
commit
570b214dfb
52 changed files with 2589 additions and 2556 deletions
  1. +21
    -0
      fastNLP/embeddings/__init__.py
  2. +321
    -0
      fastNLP/embeddings/bert_embedding.py
  3. +280
    -0
      fastNLP/embeddings/char_embedding.py
  4. +100
    -0
      fastNLP/embeddings/contextual_embedding.py
  5. +326
    -0
      fastNLP/embeddings/elmo_embedding.py
  6. +180
    -0
      fastNLP/embeddings/embedding.py
  7. +92
    -0
      fastNLP/embeddings/stack_embedding.py
  8. +217
    -0
      fastNLP/embeddings/static_embedding.py
  9. +47
    -0
      fastNLP/embeddings/utils.py
  10. +1
    -1
      fastNLP/io/data_loader/matching.py
  11. +1
    -1
      fastNLP/models/bert.py
  12. +1
    -1
      fastNLP/models/biaffine_parser.py
  13. +3
    -2
      fastNLP/models/cnn_text_classification.py
  14. +5
    -4
      fastNLP/models/sequence_labeling.py
  15. +4
    -5
      fastNLP/models/snli.py
  16. +1
    -1
      fastNLP/models/star_transformer.py
  17. +0
    -2
      fastNLP/modules/__init__.py
  18. +3
    -13
      fastNLP/modules/encoder/__init__.py
  19. +0
    -1069
      fastNLP/modules/encoder/_bert.py
  20. +1
    -191
      fastNLP/modules/encoder/_elmo.py
  21. +887
    -47
      fastNLP/modules/encoder/bert.py
  22. +0
    -1083
      fastNLP/modules/encoder/embedding.py
  23. +0
    -28
      fastNLP/modules/utils.py
  24. +2
    -0
      reproduction/LSTM+self_attention_sentiment_analysis/README.md
  25. +9
    -5
      reproduction/LSTM+self_attention_sentiment_analysis/main.py
  26. +2
    -2
      reproduction/Star_transformer/train.py
  27. +1
    -1
      reproduction/Summarization/BertSum/model.py
  28. +0
    -0
      reproduction/joint_cws_parse/README.md
  29. +1
    -1
      reproduction/joint_cws_parse/models/CharParser.py
  30. +4
    -4
      reproduction/joint_cws_parse/train.py
  31. +4
    -0
      reproduction/matching/data/MatchingDataLoader.py
  32. +1
    -2
      reproduction/matching/matching_bert.py
  33. +2
    -3
      reproduction/matching/matching_cntn.py
  34. +5
    -6
      reproduction/matching/matching_esim.py
  35. +5
    -11
      reproduction/matching/matching_mwan.py
  36. +1
    -1
      reproduction/matching/model/bert.py
  37. +1
    -1
      reproduction/matching/model/cntn.py
  38. +1
    -2
      reproduction/matching/model/esim.py
  39. +1
    -1
      reproduction/seqence_labelling/cws/model/model.py
  40. +2
    -4
      reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
  41. +5
    -8
      reproduction/seqence_labelling/ner/train_idcnn.py
  42. +2
    -3
      reproduction/seqence_labelling/ner/train_ontonote.py
  43. +1
    -1
      reproduction/text_classification/model/HAN.py
  44. +1
    -1
      reproduction/text_classification/model/dpcnn.py
  45. +3
    -5
      reproduction/text_classification/train_HAN.py
  46. +2
    -9
      reproduction/text_classification/train_awdlstm.py
  47. +4
    -10
      reproduction/text_classification/train_char_cnn.py
  48. +5
    -6
      reproduction/text_classification/train_dpcnn.py
  49. +3
    -10
      reproduction/text_classification/train_lstm.py
  50. +3
    -10
      reproduction/text_classification/train_lstm_att.py
  51. +26
    -0
      test/embeddings/test_char_embedding.py
  52. +1
    -1
      test/modules/encoder/test_bert.py

+ 21
- 0
fastNLP/embeddings/__init__.py View File

@@ -0,0 +1,21 @@
"""
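The embeddings module implements the following embedding classes: Embedding, StaticEmbedding,
ElmoEmbedding, BertEmbedding, StackEmbedding, LSTMCharEmbedding and CNNCharEmbedding.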
"""

# Public API of this package: every name listed here is re-exported by the
# relative imports that follow the list.
__all__ = [
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
]


# Re-export each embedding class from the submodule that defines it.
from .embedding import Embedding
from .static_embedding import StaticEmbedding
from .elmo_embedding import ElmoEmbedding
from .bert_embedding import BertEmbedding
from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding
from .stack_embedding import StackEmbedding

+ 321
- 0
fastNLP/embeddings/bert_embedding.py View File

@@ -0,0 +1,321 @@

import os
import collections

from torch import nn
import torch
import numpy as np
from itertools import chain

from ..core.vocabulary import Vocabulary
from ..io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer
from .contextual_embedding import ContextualEmbedding


class BertEmbedding(ContextualEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding`

    Embedding that encodes words with BERT. It is recommended to keep the input length within 450 words
    rather than 512: the pre-trained BERT model is limited to 512 tokens, and because the input words have
    not been split into word pieces yet, the length after splitting may exceed that limit.

    Example::

        >>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1')


    :param fastNLP.Vocabulary vocab: the vocabulary.
    :param str model_dir_or_name: directory containing the model, or the name of a known pre-trained model
        (matched case-insensitively). Defaults to ``en-base-uncased``.
    :param str layers: which layers make up the final representation. Layer indices are separated by ',';
        negative numbers index from the last layer.
    :param str pool_method: each word may be split into several word pieces; this selects how the word
        representation is computed from its word pieces. Supports ``last``, ``first``, ``avg``, ``max``.
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as
        regularization.
    :param float dropout: dropout probability applied to the embedding output. 0.1 zeroes 10% of the values.
    :param bool include_cls_sep: whether to keep the representations of [CLS] and [SEP] in the result. If
        True, the output is two tokens longer than the input, which may cause problems with
        :class:`StackEmbedding`.
    :param bool requires_grad: whether the parameters require gradient.
    """
    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
                 pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False,
                 include_cls_sep: bool=False):
        super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

        # Resolve model_dir_or_name: either a known pre-trained model name (downloaded/cached)
        # or an existing local directory.
        lowered_name = model_dir_or_name.lower()
        if lowered_name in PRETRAINED_BERT_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('bert')
            # Index with the same lowered key used for the membership test, otherwise a
            # mixed-case name passes the test and then raises KeyError.
            model_name = PRETRAINED_BERT_MODEL_DIR[lowered_name]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
        else:
            # expanduser must run before abspath, otherwise "~" is treated as a literal
            # directory name relative to the current working directory.
            local_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
            if os.path.isdir(local_dir):
                model_dir = local_dir
            else:
                raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers,
                                    pool_method=pool_method, include_cls_sep=include_cls_sep)

        self.requires_grad = requires_grad
        self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size

    def _delete_model_weights(self):
        """Drop the wrapped BERT model (used after sentence representations have been cached)."""
        del self.model

    def forward(self, words):
        """
        Compute the BERT embedding of ``words``. [CLS] is added at the start and [SEP] at the end of every
        sentence before encoding; ``include_cls_sep`` decides whether their representations are kept.

        :param torch.LongTensor words: [batch_size, max_len]
        :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
        """
        words = self.drop_word(words)
        outputs = self._get_sent_reprs(words)
        if outputs is not None:
            # Cached sentence representations are available: apply dropout to them,
            # not to the integer word indices.
            return self.dropout(outputs)
        outputs = self.model(words)
        # Concatenate the selected layers along the hidden dimension.
        outputs = torch.cat([*outputs], dim=-1)

        return self.dropout(outputs)

    @property
    def requires_grad(self):
        """
        Whether the embedding parameters may be optimized. True: all parameters are trainable;
        False: none are trainable; None: some are trainable and some are not.

        :return:
        """
        # 'word_pieces_lengths' is a bookkeeping parameter and is excluded from the check.
        requires_grads = {param.requires_grad for name, param in self.named_parameters()
                          if 'word_pieces_lengths' not in name}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        return None

    @requires_grad.setter
    def requires_grad(self, value):
        for name, param in self.named_parameters():
            if 'word_pieces_lengths' in name:  # this one must never be switched to trainable
                continue
            param.requires_grad = value


class BertWordPieceEncoder(nn.Module):
    """
    Loads a BERT model. After loading, call :meth:`index_datasets` to generate the word_pieces column
    in the datasets.

    :param str model_dir_or_name: directory containing the model, or the name of a model.
        Defaults to ``en-base-uncased``.
    :param str layers: which layers make up the final representation. Layer indices are separated by ',';
        negative numbers index from the last layer.
    :param bool requires_grad: whether the parameters require gradient.
    """
    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1',
                 requires_grad: bool=False):
        super().__init__()
        base_url = _get_base_url('bert')

        if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
            # Known pre-trained model name: resolve it through the download cache.
            resolved_dir = cached_path(base_url + PRETRAINED_BERT_MODEL_DIR[model_dir_or_name])
        elif os.path.isdir(model_dir_or_name):
            # Otherwise it has to be an existing local directory.
            resolved_dir = model_dir_or_name
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        self.model = _WordPieceBertModel(model_dir=resolved_dir, layers=layers)
        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
        self.requires_grad = requires_grad

    @property
    def requires_grad(self):
        """
        Whether the parameters may be optimized. True: all parameters are trainable;
        False: none are trainable; None: some are trainable and some are not.

        :return:
        """
        flags = {weight.requires_grad for _, weight in self.named_parameters()}
        return flags.pop() if len(flags) == 1 else None

    @requires_grad.setter
    def requires_grad(self, value):
        for _, weight in self.named_parameters():
            weight.requires_grad = value

    @property
    def embed_size(self):
        """Size of the last dimension of the output of :meth:`forward`."""
        return self._embed_size

    def index_datasets(self, *datasets, field_name):
        """
        Use the BERT tokenizer to generate a new word_pieces column in the datasets and set it as input.
        If the sequence does not start/end with [CLS]/[SEP], they are added, and the pad value of the
        word_pieces column is set to BERT's pad value.

        :param datasets: DataSet objects
        :param field_name: the column from which word_pieces is generated. Each entry of this column
            should be a List[str].
        :return:
        """
        self.model.index_dataset(*datasets, field_name=field_name)

    def forward(self, word_pieces, token_type_ids=None):
        """
        Compute the BERT representation of the word pieces. The input is expected to already contain
        the [CLS] and [SEP] tags.

        :param word_pieces: batch_size x max_len
        :param token_type_ids: batch_size x max_len, distinguishes the first sentence from the second
        :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers))
        """
        layer_outputs = self.model(word_pieces, token_type_ids)
        # Join the selected layers along the hidden dimension.
        return torch.cat([*layer_outputs], dim=-1)


class _WordBertModel(nn.Module):
    """
    Word-level wrapper around a BERT encoder: maps every word of ``vocab`` to its word pieces once at
    construction time, and in :meth:`forward` pools word-piece hidden states back to one vector per word.

    :param str model_dir: directory passed to ``BertTokenizer.from_pretrained`` / ``BertModel.from_pretrained``.
    :param Vocabulary vocab: word vocabulary whose entries are converted to word pieces.
    :param str layers: comma separated layer indices to return; negative values index from the end.
    :param str pool_method: one of ``avg``, ``max``, ``first``, ``last``.
    :param bool include_cls_sep: whether the [CLS]/[SEP] representations are kept in the output.
    """
    def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False):
        super().__init__()

        self.tokenzier = BertTokenizer.from_pretrained(model_dir)
        self.encoder = BertModel.from_pretrained(model_dir)
        # Check that every requested layer index is valid for this encoder.
        encoder_layer_number = len(self.encoder.encoder.layer)
        self.layers = list(map(int, layers.split(',')))
        for layer in self.layers:
            if layer<0:
                assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                     f"a bert model with {encoder_layer_number} layers."
            else:
                assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a bert model with {encoder_layer_number} layers."

        assert pool_method in ('avg', 'max', 'first', 'last')
        self.pool_method = pool_method
        self.include_cls_sep = include_cls_sep

        # Compute the word pieces of every word in vocab; [CLS] and [SEP] need to be considered as well.
        print("Start to generating word pieces for word.")
        # Step 1: collect the word pieces that are needed, then build a new embedding matrix and a new
        # word-piece vocabulary and fill in the values.
        word_piece_dict = {'[CLS]':1, '[SEP]':1}  # word pieces in use, plus newly added entries
        found_count = 0
        for word, index in vocab:
            if index == vocab.padding_idx:  # pad is a special symbol
                word = '[PAD]'
            elif index == vocab.unknown_idx:
                word = '[UNK]'
            word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
            if len(word_pieces)==1:
                if not vocab._is_word_no_create_entry(word):  # a word seen in training that was not found
                    if index!=vocab.unknown_idx and word_pieces[0]=='[UNK]':  # not in the original word pieces
                        word_piece_dict[word] = 1  # add it as a new entry
                        continue
            for word_piece in word_pieces:
                word_piece_dict[word_piece] = 1
            found_count += 1
        original_embed = self.encoder.embeddings.word_embeddings.weight.data
        # Special tokens get fixed positions at the front of the new vocabulary.
        embed = nn.Embedding(len(word_piece_dict), original_embed.size(1))  # the new embedding
        new_word_piece_vocab = collections.OrderedDict()
        for index, token in enumerate(['[PAD]', '[UNK]']):
            word_piece_dict.pop(token, None)
            embed.weight.data[index] = original_embed[self.tokenzier.vocab[token]]
            new_word_piece_vocab[token] = index
        for token in word_piece_dict.keys():
            if token in self.tokenzier.vocab:
                embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab[token]]
            else:
                # Newly added words start from the [UNK] vector.
                embed.weight.data[len(new_word_piece_vocab)] = original_embed[self.tokenzier.vocab['[UNK]']]
            new_word_piece_vocab[token] = len(new_word_piece_vocab)
        self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab)
        self.encoder.embeddings.word_embeddings = embed

        # Step 2: with the reduced vocabulary in place, record the word-piece ids of every word.
        word_to_wordpieces = []
        word_pieces_lengths = []
        for word, index in vocab:
            if index == vocab.padding_idx:  # pad is a special symbol
                word = '[PAD]'
            elif index == vocab.unknown_idx:
                word = '[UNK]'
            word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
            word_to_wordpieces.append(word_pieces)
            word_pieces_lengths.append(len(word_pieces))
        print("Found(Or seg into word pieces) {} words out of {}.".format(found_count, len(vocab)))
        self._cls_index = self.tokenzier.vocab['[CLS]']
        self._sep_index = self.tokenzier.vocab['[SEP]']
        self._pad_index = vocab.padding_idx
        self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed when building word_pieces
        self.word_to_wordpieces = np.array(word_to_wordpieces)
        self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
        print("Successfully generate word pieces.")

    def forward(self, words):
        """
        Encode a batch of word indices and pool word-piece states back to word level.

        :param words: torch.LongTensor, batch_size x max_len
        :return: num_layers x batch_size x max_len x hidden_size, or
            num_layers x batch_size x (max_len+2) x hidden_size when ``include_cls_sep`` is set
        """
        batch_size, max_word_len = words.size()
        seq_len = words.ne(self._pad_index).sum(dim=-1)
        batch_word_pieces_length = self.word_pieces_lengths[words]  # batch_size x max_len
        word_pieces_lengths = batch_word_pieces_length.sum(dim=-1)
        max_word_piece_length = word_pieces_lengths.max().item()
        # +2 because [CLS] and [SEP] are added
        word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index)
        word_pieces[:, 0].fill_(self._cls_index)
        batch_indexes = torch.arange(batch_size).to(words)
        word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index
        attn_masks = torch.zeros_like(word_pieces)
        # 1. Get the word-piece ids of the words and the span each word covers.
        word_indexes = words.tolist()
        for i in range(batch_size):
            word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]]))
            word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i)
            attn_masks[i, :len(word_pieces_i)+2].fill_(1)
        # TODO truncate the part that exceeds the maximum length.
        # 2. Get the hidden states and pool them according to the word pieces.
        # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...]
        bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks,
                                       output_all_encoded_layers=True)
        # output_layers = [self.layers]  # len(self.layers) x batch_size x max_word_piece_length x hidden_size

        if self.include_cls_sep:
            outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2,
                                                 bert_outputs[-1].size(-1))
            s_shift = 1
        else:
            outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len,
                                                 bert_outputs[-1].size(-1))
            s_shift = 0
        # Cumulative word-piece counts: column j is the start offset of word j.
        batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1)
        batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1)  # batch_size x max_len
        for l_index, l in enumerate(self.layers):
            output_layer = bert_outputs[l]
            # Collapse from word pieces to word representations.
            truncate_output_layer = output_layer[:, 1:-1]  # drop [CLS] and [SEP]: batch_size x len x hidden_size
            outputs_seq_len = seq_len + s_shift
            if self.pool_method == 'first':
                for i in range(batch_size):
                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]]  # start of each word
                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]  # num_layer x batch_size x len x hidden_size
            elif self.pool_method == 'last':
                for i in range(batch_size):
                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1  # end of each word
                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
            elif self.pool_method == 'max':
                for i in range(batch_size):
                    for j in range(seq_len[i]):
                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
                        outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
            else:
                for i in range(batch_size):
                    for j in range(seq_len[i]):
                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
                        outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
            if self.include_cls_sep:
                outputs[l_index, :, 0] = output_layer[:, 0]
                # [SEP] was written at word-piece position word_pieces_lengths + 1, so its hidden state
                # must be read from there. Indexing with the word-level seq_len would pick an ordinary
                # word piece whenever some word is split into more than one piece.
                outputs[l_index, batch_indexes, seq_len + s_shift] = \
                    output_layer[batch_indexes, word_pieces_lengths + s_shift]
        # 3. The final embedding result.
        return outputs


+ 280
- 0
fastNLP/embeddings/char_embedding.py View File

@@ -0,0 +1,280 @@

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import List

from ..modules.encoder.lstm import LSTM
from ..core.vocabulary import Vocabulary
from .embedding import TokenEmbedding
from .utils import _construct_char_vocab_from_vocab


class CNNCharEmbedding(TokenEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding`

    Character embedding built with a CNN. The pipeline is
    embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout.
    The outputs of the filters with different kernel sizes are concatenated.

    Example::

        >>> cnn_char_embed = CNNCharEmbedding(vocab)


    :param vocab: the word vocabulary.
    :param embed_size: size of the resulting word embedding. Defaults to 50.
    :param char_emb_size: size of the character embedding. Characters are collected from vocab. Defaults to 50.
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as
        regularization.
    :param float dropout: dropout probability.
    :param filter_nums: number of filters; must have the same length as kernel_sizes. Defaults to [40, 30, 20].
    :param kernel_sizes: kernel sizes (odd numbers only). Defaults to [5, 3, 1].
    :param pool_method: pooling used to merge character representations into one; supports 'avg', 'max'.
    :param activation: activation applied after the CNN; supports 'relu', 'sigmoid', 'tanh', None
        (identity) or a custom callable.
    :param min_char_freq: minimum number of occurrences of a character. Defaults to 2.
    """
    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
                 dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1),
                 pool_method: str='max', activation='relu', min_char_freq: int=2):
        super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

        for kernel in kernel_sizes:
            assert kernel % 2 == 1, "Only odd kernel is allowed."
        # The fc layer is sized by sum(filter_nums) while one conv is built per kernel size, so a length
        # mismatch would otherwise only surface as a shape error (or IndexError) later.
        assert len(filter_nums) == len(kernel_sizes), "filter_nums and kernel_sizes must have the same length."

        assert pool_method in ('max', 'avg')
        self.dropout = nn.Dropout(dropout)
        self.pool_method = pool_method
        # activation function
        undefined_activation_msg = \
            "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
        if isinstance(activation, str):
            activation_name = activation.lower()
            if activation_name == 'relu':
                self.activation = F.relu
            elif activation_name == 'sigmoid':
                self.activation = torch.sigmoid  # F.sigmoid is deprecated in favour of torch.sigmoid
            elif activation_name == 'tanh':
                self.activation = torch.tanh  # F.tanh is deprecated in favour of torch.tanh
            else:
                # Previously an unknown name left self.activation unset and failed only in forward().
                raise Exception(undefined_activation_msg)
        elif activation is None:
            self.activation = lambda x: x
        elif callable(activation):
            self.activation = activation
        else:
            raise Exception(undefined_activation_msg)

        print("Start constructing character vocabulary.")
        # Build the character vocabulary.
        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
        self.char_pad_index = self.char_vocab.padding_idx
        print(f"In total, there are {len(self.char_vocab)} distinct characters.")
        # Index every word of vocab as a row of character ids, padded to the longest word.
        max_word_len = max(map(lambda x: len(x[0]), vocab))
        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len),
                                                                fill_value=self.char_pad_index, dtype=torch.long),
                                                     requires_grad=False)
        self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
        for word, index in vocab:
            # Padding is deliberately not special-cased, so every <pad> shares the same embedding.
            self.words_to_chars_embedding[index, :len(word)] = \
                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
            self.word_lengths[index] = len(word)
        self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)

        self.convs = nn.ModuleList([nn.Conv1d(
            char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2)
            for i in range(len(kernel_sizes))])
        self._embed_size = embed_size
        self.fc = nn.Linear(sum(filter_nums), embed_size)
        self.init_param()

    def forward(self, words):
        """
        Given word indices, produce the corresponding word representations.

        :param words: [batch_size, max_len]
        :return: [batch_size, max_len, embed_size]
        """
        words = self.drop_word(words)
        batch_size, max_len = words.size()
        chars = self.words_to_chars_embedding[words]  # batch_size x max_len x max_word_len
        word_lengths = self.word_lengths[words]  # batch_size x max_len
        max_word_len = word_lengths.max()
        chars = chars[:, :, :max_word_len]
        # True marks character padding positions.
        chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len
        chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
        chars = self.dropout(chars)
        reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
        reshaped_chars = reshaped_chars.transpose(1, 2)  # B' x E x M
        conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
                      for conv in self.convs]
        conv_chars = torch.cat(conv_chars, dim=-1).contiguous()  # B x max_len x max_word_len x sum(filters)
        conv_chars = self.activation(conv_chars)
        if self.pool_method == 'max':
            # Padding positions must never win the max.
            conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
            chars, _ = torch.max(conv_chars, dim=-2)  # batch_size x max_len x sum(filters)
        else:
            # Average over the real (non-padding) characters only.
            conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
            chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
        chars = self.fc(chars)
        return self.dropout(chars)

    @property
    def requires_grad(self):
        """
        Whether the embedding parameters may be optimized. True: all parameters are trainable;
        False: none are trainable; None: some are trainable and some are not.

        :return:
        """
        # The two lookup tables are bookkeeping parameters and are excluded from the check.
        requires_grads = {param.requires_grad for name, param in self.named_parameters()
                          if 'words_to_chars_embedding' not in name and 'word_lengths' not in name}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        return None

    @requires_grad.setter
    def requires_grad(self, value):
        for name, param in self.named_parameters():
            if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # never made trainable
                continue
            param.requires_grad = value

    def init_param(self):
        """Re-initialize all trainable parameters (Xavier for matrices, uniform(-1, 1) for vectors)."""
        for name, param in self.named_parameters():
            if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # lookup tables must not be reset
                continue
            if param.data.dim()>1:
                nn.init.xavier_uniform_(param, 1)
            else:
                nn.init.uniform_(param, -1, 1)


class LSTMCharEmbedding(TokenEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.LSTMCharEmbedding` :class:`fastNLP.embeddings.char_embedding.LSTMCharEmbedding`

    Encodes characters with an LSTM: embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool

    Example::

        >>> lstm_char_embed = LSTMCharEmbedding(vocab)

    :param vocab: the word vocabulary.
    :param embed_size: size of the embedding. Defaults to 50.
    :param char_emb_size: size of the character embedding. Defaults to 50.
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as
        regularization.
    :param dropout: dropout probability.
    :param hidden_size: hidden size of the LSTM; halved per direction when bidirectional. Defaults to 50.
    :param pool_method: supports 'max', 'avg'.
    :param activation: activation function; supports 'relu', 'sigmoid', 'tanh', None (identity) or a
        custom callable.
    :param min_char_freq: minimum number of occurrences of a character. Defaults to 2.
    :param bidirectional: whether to encode with a bidirectional LSTM. Defaults to True.
    """
    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0,
                 dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2,
                 bidirectional=True):
        # Forward word_dropout/dropout to the base class (as CNNCharEmbedding does); without this the
        # word_dropout argument was silently ignored.
        super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

        assert hidden_size % 2 == 0, "Only even hidden_size is allowed."

        assert pool_method in ('max', 'avg')
        self.pool_method = pool_method
        self.dropout = nn.Dropout(dropout)
        # activation function
        undefined_activation_msg = \
            "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]"
        if isinstance(activation, str):
            activation_name = activation.lower()
            if activation_name == 'relu':
                self.activation = F.relu
            elif activation_name == 'sigmoid':
                self.activation = torch.sigmoid  # F.sigmoid is deprecated in favour of torch.sigmoid
            elif activation_name == 'tanh':
                self.activation = torch.tanh  # F.tanh is deprecated in favour of torch.tanh
            else:
                # Previously an unknown name left self.activation unset and failed only in forward().
                raise Exception(undefined_activation_msg)
        elif activation is None:
            self.activation = lambda x: x
        elif callable(activation):
            self.activation = activation
        else:
            raise Exception(undefined_activation_msg)

        print("Start constructing character vocabulary.")
        # Build the character vocabulary.
        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
        self.char_pad_index = self.char_vocab.padding_idx
        print(f"In total, there are {len(self.char_vocab)} distinct characters.")
        # Index every word of vocab as a row of character ids, padded to the longest word.
        self.max_word_len = max(map(lambda x: len(x[0]), vocab))
        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len),
                                                                fill_value=self.char_pad_index, dtype=torch.long),
                                                     requires_grad=False)
        self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
        for word, index in vocab:
            # Padding is deliberately not special-cased.
            self.words_to_chars_embedding[index, :len(word)] = \
                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
            self.word_lengths[index] = len(word)
        self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)

        # fc consumes the full hidden size; each LSTM direction gets half of it when bidirectional.
        self.fc = nn.Linear(hidden_size, embed_size)
        hidden_size = hidden_size // 2 if bidirectional else hidden_size

        self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
        self._embed_size = embed_size
        self.bidirectional = bidirectional

    def forward(self, words):
        """
        Given word indices, produce the corresponding word representations.

        :param words: [batch_size, max_len]
        :return: [batch_size, max_len, embed_size]
        """
        words = self.drop_word(words)
        batch_size, max_len = words.size()
        chars = self.words_to_chars_embedding[words]  # batch_size x max_len x max_word_len
        word_lengths = self.word_lengths[words]  # batch_size x max_len
        max_word_len = word_lengths.max()
        chars = chars[:, :, :max_word_len]
        # True marks character padding positions.
        chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len
        chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x embed_size
        chars = self.dropout(chars)
        reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
        char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
        lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
        # B x M x M x H

        lstm_chars = self.activation(lstm_chars)
        if self.pool_method == 'max':
            # Padding positions must never win the max.
            lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
            chars, _ = torch.max(lstm_chars, dim=-2)  # batch_size x max_len x H
        else:
            # Average over the real (non-padding) characters only.
            lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
            chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()

        chars = self.fc(chars)

        return self.dropout(chars)

    @property
    def requires_grad(self):
        """
        Whether the embedding parameters may be optimized. True: all parameters are trainable;
        False: none are trainable; None: some are trainable and some are not.

        :return:
        """
        # Collect the boolean flags (not the Parameter objects themselves); the two lookup tables
        # are bookkeeping parameters and are excluded from the check.
        requires_grads = {param.requires_grad for name, param in self.named_parameters()
                          if 'words_to_chars_embedding' not in name and 'word_lengths' not in name}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        return None

    @requires_grad.setter
    def requires_grad(self, value):
        for name, param in self.named_parameters():
            if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # never made trainable
                continue
            param.requires_grad = value

+ 100
- 0
fastNLP/embeddings/contextual_embedding.py View File

@@ -0,0 +1,100 @@

from abc import abstractmethod
import torch

from ..core.vocabulary import Vocabulary
from ..core.dataset import DataSet
from ..core.batch import DataSetIter
from ..core.sampler import SequentialSampler
from ..core.utils import _move_model_to_device, _get_model_device
from .embedding import TokenEmbedding


class ContextualEmbedding(TokenEmbedding):
    """
    Base class for embeddings whose per-sentence output can be pre-computed and cached
    (see :meth:`add_sentence_cache`). Subclasses implement :meth:`_delete_model_weights`.
    """
    def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0):
        super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True):
        """
        Computing contextual embeddings is slow, so the embedding of every sentence can be cached once
        and looked up afterwards instead of being recomputed.

        :param datasets: DataSet objects
        :param batch_size: int, batch size used while computing the cached sentence representations
        :param device: see the ``device`` argument of :class:`fastNLP.Trainer`
        :param delete_weights: whether to delete the model weights after the cache has been built. When the
            contextual model is not fine-tuned, deleting the weights saves a lot of memory.
        :return:
        """
        # Validate every dataset up front, before any computation starts.
        for position, data in enumerate(datasets):
            try:
                assert isinstance(data, DataSet), "Only fastNLP.DataSet object is allowed."
                assert 'words' in data.get_input_name(), "`words` field has to be set as input."
            except Exception as e:
                print(f"Exception happens at {position} dataset.")
                raise e

        cache = {}
        _move_model_to_device(self, device=device)
        device = _get_model_device(self)
        pad_index = self._word_vocab.padding_idx
        print("Start to calculate sentence representations.")
        with torch.no_grad():
            for position, data in enumerate(datasets):
                try:
                    batches = DataSetIter(data, batch_size=batch_size, sampler=SequentialSampler())
                    for batch_x, batch_y in batches:
                        words = batch_x['words'].to(device)
                        token_ids = words.tolist()
                        lengths = words.ne(pad_index).sum(dim=-1)
                        # The output may contain CLS/SEP, so the padding is counted from the end,
                        # which is safe in either case.
                        trailing_pads = (words.size(1) - lengths).tolist()
                        embeds = self(words).detach().cpu().numpy()
                        for row, tokens in enumerate(token_ids):
                            n_pad = trailing_pads[row]
                            # No padding -> keep the whole row; otherwise cut the padded tail.
                            stop = -n_pad if n_pad != 0 else None
                            cache[tuple(tokens[:lengths[row]])] = embeds[row, :stop]
                except Exception as e:
                    print(f"Exception happens at {position} dataset.")
                    raise e
        print("Finish calculating sentence representations.")
        self.sent_embeds = cache
        if delete_weights:
            self._delete_model_weights()

    def _get_sent_reprs(self, words):
        """
        Return the cached sentence representations if a cache exists; return None otherwise.

        :param words: torch.LongTensor
        :return:
        """
        if not hasattr(self, 'sent_embeds'):
            return None
        token_ids = words.tolist()
        lengths = words.ne(self._word_pad_index).sum(dim=-1)
        # Look every sentence up by its (unpadded) tuple of word indices.
        cached = [self.sent_embeds[tuple(token_ids[row][:lengths[row]])]
                  for row in range(len(words))]
        longest = max(map(len, cached))
        padded = words.new_zeros(len(cached), longest, self.embed_size, dtype=torch.float,
                                 device=words.device)
        for row, sent in enumerate(cached):
            padded[row, :len(sent)] = torch.FloatTensor(sent).to(words.device)
        return padded

    @abstractmethod
    def _delete_model_weights(self):
        """Delete the model that computes the representations, to save resources."""
        raise NotImplementedError

    def remove_sentence_cache(self):
        """
        Delete the cached sentence representations. If the model weights have not been deleted,
        representations are computed dynamically again afterwards.

        :return:
        """
        del self.sent_embeds

+ 326
- 0
fastNLP/embeddings/elmo_embedding.py View File

@@ -0,0 +1,326 @@

import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import json
import codecs

from ..core.vocabulary import Vocabulary
from ..io.file_utils import cached_path, _get_base_url, PRETRAINED_ELMO_MODEL_DIR
from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder
from .contextual_embedding import ContextualEmbedding


class ElmoEmbedding(ContextualEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.ElmoEmbedding` :class:`fastNLP.embeddings.elmo_embedding.ElmoEmbedding`

    Embedding that uses ELMo. Once initialized, passing in words is all that is needed to get their embeddings.
    The pretrained ELMo models we provide come from https://github.com/HIT-SCIR/ELMoForManyLangs

    Example::

        >>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True)

    :param vocab: the vocabulary
    :param model_dir_or_name: a pretrained ELMo model can be selected in two ways: by passing the directory that
        holds the ELMo weights, or by passing the name of an ELMo version. Currently supported names are
        {`en` : English ELMo, `cn` : Chinese ELMo}. Names are matched case-insensitively; for a name, the cache is
        checked and the model is downloaded automatically when it is missing.
    :param layers: str, which layers to return, separated by ','. Use '2' for the second layer, '1,2' for the last
        two layers. The results of the chosen layers are concatenated in the given order. Defaults to '2'.
        'mix' combines the layers with learnable weights (whether they are trainable follows requires_grad; the
        initial weights mean-pool the three layers; ElmoEmbedding.set_mix_weights_requires_grad() can be used to
        make only the mix weights learnable).
    :param requires_grad: bool, whether this layer needs gradients. Defaults to False.
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as a
        regularizer.
    :param float dropout: dropout probability applied to the embedding output. 0.1 zeroes 10% of the values.
    :param cache_word_reprs: whether to cache word representations. When True, an embedding is generated for every
        word at initialization and the character encoder is deleted; the cached embedding is used afterwards.
        Defaults to False.
    """

    def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False,
                 word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False):
        super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

        # Resolve model_dir_or_name: a known model name is fetched through the cache, anything else must be
        # an existing directory.
        model_key = model_dir_or_name.lower()
        # Expand '~' before making the path absolute; in the opposite order '~' is never expanded because
        # it is no longer the leading component.
        local_dir = os.path.abspath(os.path.expanduser(model_dir_or_name))
        if model_key in PRETRAINED_ELMO_MODEL_DIR:
            PRETRAIN_URL = _get_base_url('elmo')
            # Index with the same lower-cased key that was tested above, so e.g. 'EN' does not raise KeyError.
            model_name = PRETRAINED_ELMO_MODEL_DIR[model_key]
            model_url = PRETRAIN_URL + model_name
            model_dir = cached_path(model_url)
        elif os.path.isdir(local_dir):
            model_dir = local_dir
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
        self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)

        if layers == 'mix':
            # One weight per output layer (token layer + every LSTM layer) plus a global scale.
            self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1),
                                              requires_grad=requires_grad)
            self.gamma = nn.Parameter(torch.ones(1), requires_grad=requires_grad)
            self._get_outputs = self._get_mixed_outputs
            self._embed_size = self.model.config['lstm']['projection_dim'] * 2
        else:
            layers = list(map(int, layers.split(',')))
            assert len(layers) > 0, "Must choose one output"
            for layer in layers:
                assert 0 <= layer <= 2, "Layer index should be in range [0, 2]."
            self.layers = layers
            self._get_outputs = self._get_layer_outputs
            self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2

        self.requires_grad = requires_grad

    def _get_mixed_outputs(self, outputs):
        """
        Combine all layers into one representation using softmax-normalized layer weights scaled by gamma.

        :param outputs: num_layers x batch_size x max_len x hidden_size
        :return: batch_size x max_len x hidden_size
        """
        weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs)
        outputs = torch.einsum('l,lbij->bij', weights, outputs)
        return self.gamma.to(outputs) * outputs

    def set_mix_weights_requires_grad(self, flag=True):
        """
        When ElmoEmbedding was initialized with layers set to mix, this sets whether the mix weights are
        trainable. If layers is not mix, calling this method has no effect.

        :param bool flag: whether the weights that mix the layer representations are trainable.
        :return:
        """
        if hasattr(self, 'layer_weights'):
            self.layer_weights.requires_grad = flag
            self.gamma.requires_grad = flag

    def _get_layer_outputs(self, outputs):
        """
        Select the configured layers from ``outputs``; several layers are concatenated on the last dimension.

        :param outputs: num_layers x batch_size x max_len x hidden_size
        :return: batch_size x max_len x (hidden_size * len(self.layers))
        """
        if len(self.layers) == 1:
            outputs = outputs[self.layers[0]]
        else:
            outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1)

        return outputs

    def forward(self, words: torch.LongTensor):
        """
        Compute the ELMo embedding of words. According to the ELMo paper there are actually 2L+1 layers of
        results, but to make the result easier to split, the token embedding is repeated once, so that the
        result of layer=0 is [token_embedding;token_embedding] and the result of layer=1 is
        [forward_hiddens;backward_hiddens].

        :param words: batch_size x max_len
        :return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers))
        """
        words = self.drop_word(words)
        outputs = self._get_sent_reprs(words)
        # Cached sentence representations, when available, bypass the model entirely.
        if outputs is not None:
            return self.dropout(outputs)
        outputs = self.model(words)
        outputs = self._get_outputs(outputs)
        return self.dropout(outputs)

    def _delete_model_weights(self):
        """Delete the model and the layer-selection state held by this embedding."""
        for name in ['layers', 'model', 'layer_weights', 'gamma']:
            if hasattr(self, name):
                delattr(self, name)

    @property
    def requires_grad(self):
        """
        Whether the parameters of the embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some may and some may not.

        :return:
        """
        # The index-mapping parameters are lookup tables, not weights, so they are left out.
        requires_grads = {param.requires_grad for name, param in self.named_parameters()
                          if 'words_to_chars_embedding' not in name and 'words_to_words' not in name}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:
            return None

    @requires_grad.setter
    def requires_grad(self, value):
        for name, param in self.named_parameters():
            # These lookup tables must never take part in requires_grad.
            if 'words_to_chars_embedding' in name or 'words_to_words' in name:
                continue
            param.requires_grad = value


class _ElmoModel(nn.Module):
    """
    This module is where all the heavy lifting for ElmoEmbedding happens. Its work includes
        (1) loading the model according to the configuration;
        (2) adapting the character embedding of the model to the vocab and initializing it correctly;
        (3) keeping a word-to-chars mapping that is applied automatically on lookup;
        (4) keeping a token embedding that allows word representations to be cached.

    """

    def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
        """
        :param model_dir: directory holding one ``*.json`` config, one ``*.pkl`` weight file and ``char.dic``.
        :param vocab: word vocabulary the model is adapted to.
        :param cache_word_reprs: when True, precompute one representation per word and delete the character
            encoder afterwards.
        :raises Exception: when the config/weight file is missing or ambiguous.
        """
        super(_ElmoModel, self).__init__()
        self.model_dir = model_dir
        config_file = None
        weight_file = None
        config_count = 0
        weight_count = 0
        for path, dir_list, file_list in os.walk(self.model_dir):
            for file_name in file_list:
                if ".json" in file_name:
                    config_file = file_name
                    config_count += 1
                elif ".pkl" in file_name:
                    weight_file = file_name
                    weight_count += 1
        if config_count > 1 or weight_count > 1:
            raise Exception(f"Multiple config files(*.json) or weight files(*.pkl) detected in {model_dir}.")
        elif config_count == 0 or weight_count == 0:
            raise Exception(f"No config file or weight file found in {model_dir}")

        # Use a context manager so the config file handle is closed deterministically.
        with open(os.path.join(model_dir, config_file), 'r') as config_fp:
            config = json.load(config_fp)
        self.weight_file = os.path.join(model_dir, weight_file)
        self.config = config

        OOV_TAG = '<oov>'
        PAD_TAG = '<pad>'
        BOS_TAG = '<bos>'
        EOS_TAG = '<eos>'
        BOW_TAG = '<bow>'
        EOW_TAG = '<eow>'

        # For the model trained with character-based word encoder.
        char_lexicon = {}
        with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                # A line with a single field is given '\u3000' as its character.
                if len(tokens) == 1:
                    tokens.insert(0, '\u3000')
                token, i = tokens
                char_lexicon[token] = int(i)

        # Some sanity checks.
        for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
            assert special_word in char_lexicon, f"{special_word} not found in char.dic."

        # Build the char vocabulary from the word vocabulary.
        char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
        # <bow> and <eow> must be part of it.
        char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])

        for word, index in vocab:
            char_vocab.add_word_lst(list(word))

        self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx
        # Adjusted to char_lexicon; one extra slot is reserved for word padding (its char representation
        # is all zeros).
        char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']),
                                      padding_idx=len(char_vocab))

        # Load the pretrained weights; elmo_model holds the state_dict of both char_cnn and lstm.
        elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu')

        char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight']

        found_char_count = 0
        for char, index in char_vocab:  # adapt the character embedding
            if char in char_lexicon:
                index_in_pre = char_lexicon.get(char)
                found_char_count += 1
            else:
                index_in_pre = char_lexicon[OOV_TAG]
            char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]

        print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
        # Build the mapping from words to chars.
        max_chars = config['char_cnn']['max_characters_per_token']

        # Two extra rows hold <bos> and <eos>; every slot defaults to the word-padding char index.
        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars),
                                                                fill_value=len(char_vocab),
                                                                dtype=torch.long),
                                                     requires_grad=False)
        for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]:
            # Leave room for the <bow>/<eow> markers.
            if len(word) + 2 > max_chars:
                word = word[:max_chars - 2]
            if index == self._pad_index:
                continue
            elif word == BOS_TAG or word == EOS_TAG:
                # Sentence boundary tags are looked up as a single symbol, not character by character.
                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [
                    char_vocab.to_index(EOW_TAG)]
                char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
            else:
                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [
                    char_vocab.to_index(EOW_TAG)]
                char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
            self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)

        self.char_vocab = char_vocab

        self.token_embedder = ConvTokenEmbedder(
            config, self.weight_file, None, char_emb_layer)
        # Swap in the adapted character embedding before loading the remaining pretrained weights.
        elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight
        self.token_embedder.load_state_dict(elmo_model["char_cnn"])

        self.output_dim = config['lstm']['projection_dim']

        # lstm encoder
        self.encoder = ElmobiLm(config)
        self.encoder.load_state_dict(elmo_model["lstm"])

        if cache_word_reprs:
            if config['char_cnn']['embedding']['dim'] > 0:  # only useful when chars are used
                print("Start to generate cache word representations.")
                batch_size = 320
                # includes bos and eos
                word_size = self.words_to_chars_embedding.size(0)
                num_batches = word_size // batch_size + \
                              int(word_size % batch_size != 0)

                self.cached_word_embedding = nn.Embedding(word_size,
                                                          config['lstm']['projection_dim'])
                with torch.no_grad():
                    for i in range(num_batches):
                        words = torch.arange(i * batch_size,
                                             min((i + 1) * batch_size, word_size)).long()
                        chars = self.words_to_chars_embedding[words].unsqueeze(1)  # batch_size x 1 x max_chars
                        word_reprs = self.token_embedder(words.unsqueeze(1),
                                                         chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
                        self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)

                    print("Finish generating cached word representations. Going to delete the character encoder.")
                del self.token_embedder, self.words_to_chars_embedding
            else:
                print("There is no need to cache word representations, since no character information is used.")

    def forward(self, words):
        """

        :param words: batch_size x max_len
        :return: num_layers x batch_size x max_len x hidden_size
        """
        # Extend every sentence with <bos> and <eos>.
        batch_size, max_len = words.size()
        # Fill with the actual padding index (instead of assuming it is 0) so that the mask below and
        # seq_len agree for any vocabulary.
        expanded_words = words.new_full((batch_size, max_len + 2), self._pad_index)
        seq_len = words.ne(self._pad_index).sum(dim=-1)
        expanded_words[:, 1:-1] = words
        expanded_words[:, 0].fill_(self.bos_index)
        expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index
        seq_len = seq_len + 2
        mask = expanded_words.eq(self._pad_index).unsqueeze(-1)
        if hasattr(self, 'cached_word_embedding'):
            token_embedding = self.cached_word_embedding(expanded_words)
        else:
            if hasattr(self, 'words_to_chars_embedding'):
                chars = self.words_to_chars_embedding[expanded_words]
            else:
                chars = None
            token_embedding = self.token_embedder(expanded_words, chars)  # batch_size x max_len x embed_dim

        encoder_output = self.encoder(token_embedding, seq_len)
        # The encoder may return fewer time steps than were passed in; pad back with zeros.
        if encoder_output.size(2) < max_len + 2:
            num_layers, _, output_len, hidden_size = encoder_output.size()
            dummy_tensor = encoder_output.new_zeros(num_layers, batch_size,
                                                    max_len + 2 - output_len, hidden_size)
            encoder_output = torch.cat((encoder_output, dummy_tensor), 2)
        sz = encoder_output.size()  # 2, batch_size, max_len, hidden_size
        token_embedding = token_embedding.masked_fill(mask, 0)
        # Duplicate the token embedding so layer 0 has the same width as the encoder layers.
        token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3])
        encoder_output = torch.cat((token_embedding, encoder_output), dim=0)

        # Remove <eos> and <bos>. The removal is not exact, but it should not affect the final result.
        encoder_output = encoder_output[:, :, 1:-1]
        return encoder_output

+ 180
- 0
fastNLP/embeddings/embedding.py View File

@@ -0,0 +1,180 @@

import torch.nn as nn
from abc import abstractmethod
import torch

from .utils import get_embeddings


class Embedding(nn.Module):
    """
    Alias: :class:`fastNLP.embeddings.Embedding` :class:`fastNLP.embeddings.embedding.Embedding`

    Embedding component. The vocabulary size is available as self.num_embedding (also through ``len()``) and
    the embedding dimension as self.embedding_dim.
    """
    def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None):
        """

        :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: the size of the Embedding
            (pass a tuple(int, int): the first int is the vocab size, the second int is embed_dim); a Tensor,
            Embedding, ndarray etc. is used directly to initialize the Embedding.
        :param float word_dropout: probability of randomly setting a word to unk_index, so that the unk token
            gets enough training; it also has a regularizing effect on the network.
        :param float dropout: dropout applied to the output of the Embedding.
        :param int unk_index: the index a dropped word is replaced with. The unk_index of a fastNLP Vocabulary
            defaults to 1.
        :raises ValueError: when word_dropout is set for a plain embedding but unk_index is not an int.
        """
        super(Embedding, self).__init__()

        self.embed = get_embeddings(init_embed)
        self.dropout = nn.Dropout(dropout)
        if not isinstance(self.embed, TokenEmbedding):
            self._embed_size = self.embed.weight.size(1)
            if word_dropout>0 and not isinstance(unk_index, int):
                raise ValueError("When drop word is set, you need to pass in the unk_index.")
        else:
            # A TokenEmbedding carries its own vocabulary, so its unknown index overrides the argument.
            self._embed_size = self.embed.embed_size
            unk_index = self.embed.get_word_vocab().unknown_idx
        self.unk_index = unk_index
        self.word_dropout = word_dropout

    def forward(self, x):
        """
        :param torch.LongTensor x: [batch, seq_len]
        :return: torch.Tensor : [batch, seq_len, embed_dim]
        """
        if self.word_dropout>0 and self.training:
            mask = torch.ones_like(x).float() * self.word_dropout
            # The larger word_dropout is, the more positions are 1. `.eq(1)` yields whatever mask dtype the
            # installed torch uses for comparisons, instead of forcing uint8 which newer versions reject.
            mask = torch.bernoulli(mask).eq(1)
            x = x.masked_fill(mask, self.unk_index)
        x = self.embed(x)
        return self.dropout(x)

    @property
    def num_embedding(self)->int:
        """Number of rows of the wrapped embedding (its own ``num_embedding`` when it is not an nn.Embedding)."""
        if isinstance(self.embed, nn.Embedding):
            return self.embed.weight.size(0)
        else:
            return self.embed.num_embedding

    def __len__(self):
        # nn.Embedding does not define __len__, so go through num_embedding which handles both kinds.
        return self.num_embedding

    @property
    def embed_size(self) -> int:
        """Dimension of a single embedding vector."""
        return self._embed_size

    @property
    def embedding_dim(self) -> int:
        """Same value as :attr:`embed_size`."""
        return self._embed_size

    @property
    def requires_grad(self):
        """
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some may and some may not.
        :return:
        """
        if not isinstance(self.embed, TokenEmbedding):
            return self.embed.weight.requires_grad
        else:
            return self.embed.requires_grad

    @requires_grad.setter
    def requires_grad(self, value):
        if not isinstance(self.embed, TokenEmbedding):
            self.embed.weight.requires_grad = value
        else:
            self.embed.requires_grad = value

    @property
    def size(self):
        """Shape of the embedding: taken from the TokenEmbedding, or from the weight matrix otherwise."""
        if isinstance(self.embed, TokenEmbedding):
            return self.embed.size
        else:
            return self.embed.weight.size()


class TokenEmbedding(nn.Module):
    """
    Base class for embeddings that are bound to a word vocabulary.

    It stores the vocabulary together with its padding/unknown indices and provides word-level dropout
    (:meth:`drop_word`) and output dropout (:meth:`dropout`). Subclasses implement :meth:`forward` and are
    expected to set ``self._embed_size``.
    """
    def __init__(self, vocab, word_dropout=0.0, dropout=0.0):
        """
        :param vocab: the word vocabulary; it must have a padding entry.
        :param float word_dropout: probability of replacing a word with unknown; needs an unknown entry.
        :param float dropout: dropout probability applied to the embedding output.
        """
        super(TokenEmbedding, self).__init__()
        assert vocab.padding is not None, "Vocabulary must have a padding entry."
        self._word_vocab = vocab
        self._word_pad_index = vocab.padding_idx
        if word_dropout>0:
            assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word."
        self.word_dropout = word_dropout
        self._word_unk_index = vocab.unknown_idx
        self.dropout_layer = nn.Dropout(dropout)

    def drop_word(self, words):
        """
        Randomly set words to the unknown index according to the configured probability.

        :param torch.LongTensor words: batch_size x max_len
        :return:
        """
        if self.word_dropout > 0 and self.training:
            mask = torch.ones_like(words).float() * self.word_dropout
            # The larger word_dropout is, the more positions are 1. `.eq(1)` yields whatever mask dtype the
            # installed torch uses for comparisons, instead of forcing uint8 which newer versions reject.
            mask = torch.bernoulli(mask).eq(1)
            words = words.masked_fill(mask, self._word_unk_index)
        return words

    def dropout(self, words):
        """
        Apply dropout to the word representations after embedding.

        :param torch.FloatTensor words: batch_size x max_len x embed_size
        :return:
        """
        return self.dropout_layer(words)

    @property
    def requires_grad(self):
        """
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some may and some may not.
        :return:
        """
        requires_grads = {param.requires_grad for param in self.parameters()}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:
            return None

    @requires_grad.setter
    def requires_grad(self, value):
        for param in self.parameters():
            param.requires_grad = value

    def __len__(self):
        return len(self._word_vocab)

    @property
    def embed_size(self) -> int:
        """Dimension of a single embedding vector."""
        return self._embed_size

    @property
    def embedding_dim(self) -> int:
        """Same value as :attr:`embed_size`."""
        return self._embed_size

    @property
    def num_embedding(self) -> int:
        """
        This value may be larger than the size of the actual embedding matrix.
        :return:
        """
        return len(self._word_vocab)

    def get_word_vocab(self):
        """
        Return the vocabulary of the embedding.

        :return: Vocabulary
        """
        return self._word_vocab

    @property
    def size(self):
        """Shape of the embedding as ``torch.Size((num_embedding, embed_size))``."""
        # torch.Size takes a single iterable; passing the two ints as separate arguments raises TypeError.
        return torch.Size((self.num_embedding, self._embed_size))

    @abstractmethod
    def forward(self, *input):
        raise NotImplementedError

+ 92
- 0
fastNLP/embeddings/stack_embedding.py View File

@@ -0,0 +1,92 @@
from typing import List

import torch
from torch import nn as nn

from .embedding import TokenEmbedding


class StackEmbedding(TokenEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.StackEmbedding` :class:`fastNLP.embeddings.stack_embedding.StackEmbedding`

    Combines several embeddings into a single embedding.

    Example::

        >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True)
        >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True)
        >>> embed = StackEmbedding([embed_1, embed_2])


    :param embeds: a non-empty list of TokenEmbedding objects; all of them must share the same vocabulary
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as a
        regularizer. The different embeddings are set to unknown at the same positions. If dropout is set here,
        the component embeddings should not set dropout again.
    :param float dropout: dropout probability applied to the embedding output. 0.1 zeroes 10% of the values.

    """
    def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0):
        # Validate the argument first so that a bad input fails with a clear assertion instead of an
        # IndexError/AttributeError further down.
        assert isinstance(embeds, list)
        assert len(embeds) > 0, "At least one embedding is required."
        for embed in embeds:
            assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported."

        vocabs = [embed.get_word_vocab() for embed in embeds]
        _vocab = vocabs[0]
        for vocab in vocabs[1:]:
            assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary."

        super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout)
        self.embeds = nn.ModuleList(embeds)
        self._embed_size = sum([embed.embed_size for embed in self.embeds])

    def append(self, embed: TokenEmbedding):
        """
        Add an embedding to the end.
        :param embed:
        :return:
        """
        assert isinstance(embed, TokenEmbedding)
        self.embeds.append(embed)
        # Keep the reported size in sync with the concatenated output of forward().
        self._embed_size += embed.embed_size

    def pop(self):
        """
        Remove and return the last embedding.
        :return:
        """
        # Index + delete instead of ModuleList.pop(): that method is not a drop-in for list.pop() on every
        # torch version, while item access and deletion are supported by ModuleList directly.
        embed = self.embeds[-1]
        del self.embeds[-1]
        # Keep the reported size in sync with the concatenated output of forward().
        self._embed_size -= embed.embed_size
        return embed

    @property
    def embed_size(self):
        """Sum of the sizes of the stacked embeddings."""
        return self._embed_size

    @property
    def requires_grad(self):
        """
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some may and some may not.
        :return:
        """
        # self.embeds is a ModuleList: iterate it, do not call it.
        requires_grads = {embed.requires_grad for embed in self.embeds}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:
            return None

    @requires_grad.setter
    def requires_grad(self, value):
        # self.embeds is a ModuleList: iterate it, do not call it.
        for embed in self.embeds:
            embed.requires_grad = value

    def forward(self, words):
        """
        Compute the result of every embedding and concatenate the results in order.

        :param words: batch_size x max_len
        :return: the shape depends on the embeddings that make up this stack embedding
        """
        outputs = []
        # Drop words once up front so every component sees unknown at the same positions.
        words = self.drop_word(words)
        for embed in self.embeds:
            outputs.append(embed(words))
        outputs = self.dropout(torch.cat(outputs, dim=-1))
        return outputs

+ 217
- 0
fastNLP/embeddings/static_embedding.py View File

@@ -0,0 +1,217 @@

import os

import torch
import torch.nn as nn
import numpy as np
import warnings

from ..core.vocabulary import Vocabulary
from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_base_url, cached_path
from .embedding import TokenEmbedding


class StaticEmbedding(TokenEmbedding):
    """
    Alias: :class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding`

    StaticEmbedding component. Given the name of an embedding, the vectors of the words in vocab are extracted
    from it. Afterwards it can be used like a normal embedding.

    Example::

        >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')


    :param vocab: Vocabulary whose words are looked up in the pretrained embedding.
    :param model_dir_or_name: a pretrained static embedding can be selected in two ways: by passing the path of
        the embedding file, or by passing the name of the embedding. Currently supported names include
        {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
        `en-word2vec-300` : GoogleNews-vectors-negative300}. Names are matched case-insensitively; for a name, the
        cache is checked and the embedding is downloaded automatically when it is missing.
    :param bool requires_grad: whether gradients are needed. Defaults to True
    :param callable init_method: how to initialize the vectors that were not found. Any of the torch.nn.init.*
        functions can be used. It is called with a tensor object.
    :param bool lower: whether to lowercase the words in vocab before matching them against the pretrained
        vocabulary. If your vocabulary contains uppercase words, or uppercase words need their own vector,
        set lower to False.
    :param float word_dropout: probability of replacing a word with unk. This both trains unk and acts as a
        regularizer.
    :param float dropout: dropout probability applied to the embedding output. 0.1 zeroes 10% of the values.
    :param bool normalize: whether to normalize the vectors so that every vector has norm 1.
    """
    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None,
                 lower=False, dropout=0, word_dropout=0, normalize=False):
        super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout)

        # Resolve model_dir_or_name: a known name is fetched through the cache, anything else must be a file.
        model_key = model_dir_or_name.lower()
        # Expand '~' before making the path absolute; in the opposite order '~' is never expanded because
        # it is no longer the leading component.
        local_path = os.path.abspath(os.path.expanduser(model_dir_or_name))
        if model_key in PRETRAIN_STATIC_FILES:
            PRETRAIN_URL = _get_base_url('static')
            # Index with the same lower-cased key that was tested above, so e.g. 'EN' does not raise KeyError.
            model_name = PRETRAIN_STATIC_FILES[model_key]
            model_url = PRETRAIN_URL + model_name
            model_path = cached_path(model_url)
        elif os.path.isfile(local_path):
            model_path = local_path
        else:
            raise ValueError(f"Cannot recognize {model_dir_or_name}.")

        # Read the embedding.
        if lower:
            lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown)
            for word, index in vocab:
                if not vocab._is_word_no_create_entry(word):
                    lowered_vocab.add_word(word.lower())  # first add the words that need their own entry
            for word in vocab._no_create_word.keys():  # the words that do not need their own entry
                if word in vocab:
                    lowered_word = word.lower()
                    if lowered_word not in lowered_vocab.word_count:
                        lowered_vocab.add_word(lowered_word)
                        lowered_vocab._no_create_word[lowered_word] += 1
            print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered "
                  f"words.")
            embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method,
                                              normalize=normalize)
            # Adapt the mapping: _load_with_vocab only sets words_to_words in some cases, so fall back to
            # the identity mapping over the lowered vocabulary.
            if not hasattr(self, 'words_to_words'):
                self.words_to_words = torch.arange(len(lowered_vocab, )).long()
            if lowered_vocab.unknown:
                unknown_idx = lowered_vocab.unknown_idx
            else:
                unknown_idx = embedding.size(0) - 1  # otherwise the last row is unknown
            # Maps an index of the original vocab to a row of the loaded embedding.
            words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
                                          requires_grad=False)
            for word, index in vocab:
                if word not in lowered_vocab:
                    word = word.lower()
                    if lowered_vocab._is_word_no_create_entry(word):  # no entry needed, already defaults to unknown
                        continue
                words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)]
            self.words_to_words = words_to_words
        else:
            embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method,
                                              normalize=normalize)
        self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
                                      padding_idx=vocab.padding_idx,
                                      max_norm=None, norm_type=2, scale_grad_by_freq=False,
                                      sparse=False, _weight=embedding)
        self._embed_size = self.embedding.weight.size(1)
        self.requires_grad = requires_grad

    @property
    def requires_grad(self):
        """
        Whether the parameters of the Embedding may be optimized. True: all parameters may be optimized;
        False: no parameter may be optimized; None: some may and some may not.
        :return:
        """
        # words_to_words is an index lookup table, not a weight, so it is left out.
        requires_grads = {param.requires_grad for name, param in self.named_parameters()
                          if 'words_to_words' not in name}
        if len(requires_grads) == 1:
            return requires_grads.pop()
        else:
            return None

    @requires_grad.setter
    def requires_grad(self, value):
        for name, param in self.named_parameters():
            # The index lookup table must never take part in requires_grad.
            if 'words_to_words' in name:
                continue
            param.requires_grad = value

    def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>',
                         normalize=True, error='ignore', init_method=None):
        """
        Extract the embeddings of the words in vocab from the pretrained word vectors at embed_filepath.
        The format is detected automatically: word2vec (the first line has only two elements) or glove.

        :param str embed_filepath: path of the pretrained embedding.
        :param vocab: vocabulary of type :class:`~fastNLP.Vocabulary`; the embeddings of the words that appear
            in vocab are read. A word that needs an entry but is not in the file takes the vector of unknown
            when the file provides one, otherwise it keeps its initialized value (see init_method).
        :param dtype: dtype of the embedding that is read
        :param str padding: the padding token used in the pretrained file
        :param str unknown: the unknown token used in the pretrained file
        :param bool normalize: whether to normalize every vector to norm 1
        :param str error: `ignore` , `strict` ; with `ignore` a line that fails to parse is skipped with a
            warning; with `strict` the error is raised. The likely causes are empty lines or inconsistent
            dimensions in the file.
        :param init_method: how to initialize the vectors that were not found. Any of the torch.nn.init.*
            functions can be used. When not given, a uniform distribution in
            [-sqrt(3/dim), sqrt(3/dim)] is used.
        :return torch.tensor: the embedding matrix; its second dimension is given by the pretrained
            embedding. As a side effect ``self.words_to_words`` is set when the vocabulary has words that do
            not get their own entry.
        """
        assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported."
        if not os.path.exists(embed_filepath):
            raise FileNotFoundError("`{}` does not exist.".format(embed_filepath))
        with open(embed_filepath, 'r', encoding='utf-8') as f:
            line = f.readline().strip()
            parts = line.split()
            start_idx = 0
            if len(parts) == 2:
                # word2vec header line: "<count> <dim>"
                dim = int(parts[1])
                start_idx += 1
            else:
                # glove: the first line is already a vector, so rewind to read it again below
                dim = len(parts) - 1
                f.seek(0)
            matrix = {}
            found_count = 0
            for idx, line in enumerate(f, start_idx):
                try:
                    parts = line.strip().split()
                    # Everything before the last `dim` fields belongs to the word.
                    word = ''.join(parts[:-dim])
                    nums = parts[-dim:]
                    # Align unk and pad with the tokens used by the vocabulary.
                    if word == padding and vocab.padding is not None:
                        word = vocab.padding
                    elif word == unknown and vocab.unknown is not None:
                        word = vocab.unknown
                    if word in vocab:
                        index = vocab.to_index(word)
                        matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim))
                        found_count += 1
                except Exception as e:
                    if error == 'ignore':
                        warnings.warn("Error occurred at the {} line.".format(idx))
                    else:
                        print("Error occurred at the {} line.".format(idx))
                        raise e
            print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab)))
            for word, index in vocab:
                if index not in matrix and not vocab._is_word_no_create_entry(word):
                    if vocab.unknown_idx in matrix:  # if unknown was found, initialize with unknown
                        matrix[index] = matrix[vocab.unknown_idx]
                    else:
                        matrix[index] = None

            vectors = torch.zeros(len(matrix), dim)
            if init_method:
                init_method(vectors)
            else:
                nn.init.uniform_(vectors, -np.sqrt(3/dim), np.sqrt(3/dim))

            if vocab._no_create_word_length>0:
                if vocab.unknown is None:  # create a dedicated unknown row
                    unknown_idx = len(matrix)
                    vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous()
                else:
                    # NOTE(review): this default is a vocabulary index, while the entries written below are
                    # row numbers of `vectors` — confirm the two index spaces coincide for unknown.
                    unknown_idx = vocab.unknown_idx
                words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(),
                                              requires_grad=False)
                # Rows are laid out in insertion order of matrix; words_to_words maps vocab index -> row.
                for order, (index, vec) in enumerate(matrix.items()):
                    if vec is not None:
                        vectors[order] = vec
                    words_to_words[index] = order
                self.words_to_words = words_to_words
            else:
                for index, vec in matrix.items():
                    if vec is not None:
                        vectors[index] = vec

            if normalize:
                vectors /= (torch.norm(vectors, dim=1, keepdim=True) + 1e-12)

            return vectors

    def forward(self, words):
        """
        Look up the embedding for the given word indices.

        :param words: torch.LongTensor, [batch_size, max_len]
        :return: torch.FloatTensor, [batch_size, max_len, embed_size]
        """
        # Drop words while the indices are still vocabulary indices: the unknown index used by drop_word
        # comes from the vocabulary, so it has to go through the words_to_words remapping as well.
        words = self.drop_word(words)
        if hasattr(self, 'words_to_words'):
            words = self.words_to_words[words]
        words = self.embedding(words)
        words = self.dropout(words)
        return words

+ 47
- 0
fastNLP/embeddings/utils.py View File

@@ -0,0 +1,47 @@
import numpy as np
import torch
from torch import nn as nn

from ..core.vocabulary import Vocabulary


def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
    """
    Build a character vocabulary from a word vocabulary.

    :param vocab: the word vocabulary the characters are taken from; words for which
        ``vocab._is_word_no_create_entry`` is true are skipped
    :param min_freq: passed as ``min_freq`` to the new :class:`Vocabulary`
    :return: the character vocabulary
    """
    chars = Vocabulary(min_freq=min_freq)
    for token, _ in vocab:
        if vocab._is_word_no_create_entry(token):
            continue
        chars.add_word_lst(list(token))
    return chars


def get_embeddings(init_embed):
    """
    Create an embedding module from ``init_embed``.

    :param init_embed: one of: a tuple ``(num_embeddings, embedding_dim)`` giving the size of a new,
        uniformly initialized embedding; an nn.Module (e.g. nn.Embedding), which is returned as is; a
        torch.Tensor, whose values initialize the embedding; a np.ndarray, which is converted to float32
        and used to initialize the embedding.
    :return nn.Embedding embeddings:
    :raises TypeError: when ``init_embed`` is of none of the supported types.
    """
    if isinstance(init_embed, tuple):
        layer = nn.Embedding(num_embeddings=init_embed[0], embedding_dim=init_embed[1])
        # Uniform range scaled by the embedding dimension.
        bound = np.sqrt(3/layer.weight.data.size(1))
        nn.init.uniform_(layer.weight.data, a=-bound, b=bound)
        return layer
    if isinstance(init_embed, nn.Module):
        return init_embed
    if isinstance(init_embed, torch.Tensor):
        return nn.Embedding.from_pretrained(init_embed, freeze=False)
    if isinstance(init_embed, np.ndarray):
        weights = torch.tensor(init_embed, dtype=torch.float32)
        return nn.Embedding.from_pretrained(weights, freeze=False)
    raise TypeError(
        'invalid init_embed type: {}'.format((type(init_embed))))

+ 1
- 1
fastNLP/io/data_loader/matching.py View File

@@ -6,7 +6,7 @@ from ...core.const import Const
from ...core.vocabulary import Vocabulary from ...core.vocabulary import Vocabulary
from ..base_loader import DataBundle, DataSetLoader from ..base_loader import DataBundle, DataSetLoader
from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from ...modules.encoder._bert import BertTokenizer
from ...modules.encoder.bert import BertTokenizer




class MatchingLoader(DataSetLoader): class MatchingLoader(DataSetLoader):


+ 1
- 1
fastNLP/models/bert.py View File

@@ -8,7 +8,7 @@ from torch import nn
from .base_model import BaseModel from .base_model import BaseModel
from ..core.const import Const from ..core.const import Const
from ..modules.encoder import BertModel from ..modules.encoder import BertModel
from ..modules.encoder._bert import BertConfig
from ..modules.encoder.bert import BertConfig




class BertForSequenceClassification(BaseModel): class BertForSequenceClassification(BaseModel):


+ 1
- 1
fastNLP/models/biaffine_parser.py View File

@@ -20,7 +20,7 @@ from ..modules.dropout import TimestepDropout
from ..modules.encoder.transformer import TransformerEncoder from ..modules.encoder.transformer import TransformerEncoder
from ..modules.encoder.variational_rnn import VarLSTM from ..modules.encoder.variational_rnn import VarLSTM
from ..modules.utils import initial_parameter from ..modules.utils import initial_parameter
from ..modules.utils import get_embeddings
from ..embeddings.utils import get_embeddings
from .base_model import BaseModel from .base_model import BaseModel
from ..core.utils import seq_len_to_mask from ..core.utils import seq_len_to_mask




+ 3
- 2
fastNLP/models/cnn_text_classification.py View File

@@ -6,8 +6,9 @@ import torch
import torch.nn as nn import torch.nn as nn


from ..core.const import Const as C from ..core.const import Const as C
from ..core.utils import seq_len_to_mask
from ..modules import encoder from ..modules import encoder
from fastNLP import seq_len_to_mask
from ..embeddings import embedding




class CNNText(torch.nn.Module): class CNNText(torch.nn.Module):
@@ -33,7 +34,7 @@ class CNNText(torch.nn.Module):
super(CNNText, self).__init__() super(CNNText, self).__init__()
# no support for pre-trained embedding currently # no support for pre-trained embedding currently
self.embed = encoder.Embedding(init_embed)
self.embed = embedding.Embedding(init_embed)
self.conv_pool = encoder.ConvMaxpool( self.conv_pool = encoder.ConvMaxpool(
in_channels=self.embed.embedding_dim, in_channels=self.embed.embedding_dim,
out_channels=kernel_nums, out_channels=kernel_nums,


+ 5
- 4
fastNLP/models/sequence_labeling.py View File

@@ -10,6 +10,7 @@ import torch
import torch.nn as nn import torch.nn as nn


from .base_model import BaseModel from .base_model import BaseModel
from ..embeddings import embedding
from ..modules import decoder, encoder from ..modules import decoder, encoder
from ..modules.decoder.crf import allowed_transitions from ..modules.decoder.crf import allowed_transitions
from ..core.utils import seq_len_to_mask from ..core.utils import seq_len_to_mask
@@ -32,10 +33,10 @@ class SeqLabeling(BaseModel):
def __init__(self, init_embed, hidden_size, num_classes): def __init__(self, init_embed, hidden_size, num_classes):
super(SeqLabeling, self).__init__() super(SeqLabeling, self).__init__()
self.Embedding = encoder.embedding.Embedding(init_embed)
self.Rnn = encoder.lstm.LSTM(self.Embedding.embedding_dim, hidden_size)
self.Embedding = embedding.Embedding(init_embed)
self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size)
self.Linear = nn.Linear(hidden_size, num_classes) self.Linear = nn.Linear(hidden_size, num_classes)
self.Crf = decoder.crf.ConditionalRandomField(num_classes)
self.Crf = decoder.ConditionalRandomField(num_classes)
self.mask = None self.mask = None
def forward(self, words, seq_len, target): def forward(self, words, seq_len, target):
@@ -129,7 +130,7 @@ class AdvSeqLabel(nn.Module):
super().__init__() super().__init__()
self.Embedding = encoder.embedding.Embedding(init_embed)
self.Embedding = embedding.Embedding(init_embed)
self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim)
self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2,
dropout=dropout, dropout=dropout,


+ 4
- 5
fastNLP/models/snli.py View File

@@ -8,11 +8,10 @@ import torch.nn.functional as F


from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss


from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.core.const import Const
from fastNLP.core.utils import seq_len_to_mask
from .base_model import BaseModel
from ..embeddings.embedding import TokenEmbedding
from ..core.const import Const
from ..core.utils import seq_len_to_mask




class ESIM(BaseModel): class ESIM(BaseModel):


+ 1
- 1
fastNLP/models/star_transformer.py View File

@@ -13,7 +13,7 @@ from torch import nn


from ..modules.encoder.star_transformer import StarTransformer from ..modules.encoder.star_transformer import StarTransformer
from ..core.utils import seq_len_to_mask from ..core.utils import seq_len_to_mask
from ..modules.utils import get_embeddings
from ..embeddings.utils import get_embeddings
from ..core.const import Const from ..core.const import Const






+ 0
- 2
fastNLP/modules/__init__.py View File

@@ -24,7 +24,6 @@ __all__ = [
"ConvolutionCharEncoder", "ConvolutionCharEncoder",
"LSTMCharEncoder", "LSTMCharEncoder",
"ConvMaxpool", "ConvMaxpool",
"Embedding",
"LSTM", "LSTM",
"StarTransformer", "StarTransformer",
"TransformerEncoder", "TransformerEncoder",
@@ -48,4 +47,3 @@ from . import encoder
from .decoder import * from .decoder import *
from .dropout import TimestepDropout from .dropout import TimestepDropout
from .encoder import * from .encoder import *
from .utils import get_embeddings

+ 3
- 13
fastNLP/modules/encoder/__init__.py View File

@@ -1,19 +1,11 @@
__all__ = [ __all__ = [
# "BertModel",
"BertModel",
"ConvolutionCharEncoder", "ConvolutionCharEncoder",
"LSTMCharEncoder", "LSTMCharEncoder",
"ConvMaxpool", "ConvMaxpool",
"Embedding",
"StaticEmbedding",
"ElmoEmbedding",
"BertEmbedding",
"StackEmbedding",
"LSTMCharEmbedding",
"CNNCharEmbedding",
"LSTM", "LSTM",
"StarTransformer", "StarTransformer",
@@ -31,12 +23,10 @@ __all__ = [


"MultiHeadAttention", "MultiHeadAttention",
] ]
from ._bert import BertModel
from .bert import BertWordPieceEncoder
from .bert import BertModel
from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder
from .conv_maxpool import ConvMaxpool from .conv_maxpool import ConvMaxpool
from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \
StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding
from .lstm import LSTM from .lstm import LSTM
from .star_transformer import StarTransformer from .star_transformer import StarTransformer
from .transformer import TransformerEncoder from .transformer import TransformerEncoder


+ 0
- 1069
fastNLP/modules/encoder/_bert.py
File diff suppressed because it is too large
View File


+ 1
- 191
fastNLP/modules/encoder/_elmo.py View File

@@ -4,18 +4,13 @@


from typing import Optional, Tuple, List, Callable from typing import Optional, Tuple, List, Callable


import os

import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence
from ...core.vocabulary import Vocabulary
import json
import pickle


from ..utils import get_dropout_mask from ..utils import get_dropout_mask
import codecs



class LstmCellWithProjection(torch.nn.Module): class LstmCellWithProjection(torch.nn.Module):
""" """
@@ -541,188 +536,3 @@ class Highway(torch.nn.Module):
gate = torch.sigmoid(gate) gate = torch.sigmoid(gate)
current_input = gate * linear_part + (1 - gate) * nonlinear_part current_input = gate * linear_part + (1 - gate) * nonlinear_part
return current_input return current_input


class _ElmoModel(nn.Module):
    """
    The module that does all the heavy lifting for ElmoEmbedding. Its work includes:

    (1) loading the model according to the configuration found in ``model_dir``;
    (2) resizing the model's character embedding to the characters of ``vocab`` and initialising it
        from the pretrained weights;
    (3) keeping a word-index -> character-indices table so that word indices are converted on the fly;
    (4) optionally building an embedding that caches one representation per word.

    :param str model_dir: directory holding exactly one ``*.json`` config, one ``*.pkl`` weight file
        and a ``char.dic`` character lexicon.
    :param Vocabulary vocab: word vocabulary whose entries are mapped to character sequences.
    :param bool cache_word_reprs: if True, pre-compute the character-CNN output of every word and
        delete the character encoder afterwards.
    """

    def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False):
        super(_ElmoModel, self).__init__()
        self.model_dir = model_dir
        config_file = None
        weight_file = None
        config_count = 0
        weight_count = 0
        for _path, _dir_list, file_list in os.walk(self.model_dir):
            for file_name in file_list:
                if ".json" in file_name:
                    config_file = file_name
                    config_count += 1
                elif ".pkl" in file_name:
                    weight_file = file_name
                    weight_count += 1
        if config_count > 1 or weight_count > 1:
            raise Exception(f"Multiple config files(*.json) or weight files(*.pkl) detected in {model_dir}.")
        elif config_count == 0 or weight_count == 0:
            raise Exception(f"No config file or weight file found in {model_dir}")

        # Close the config file deterministically instead of leaking the handle.
        with open(os.path.join(model_dir, config_file), 'r') as config_reader:
            config = json.load(config_reader)
        self.weight_file = os.path.join(model_dir, weight_file)
        self.config = config

        OOV_TAG = '<oov>'
        PAD_TAG = '<pad>'
        BOS_TAG = '<bos>'
        EOS_TAG = '<eos>'
        BOW_TAG = '<bow>'
        EOW_TAG = '<eow>'

        # For the model trained with character-based word encoder.
        char_lexicon = {}
        with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
            for line in fpi:
                tokens = line.strip().split('\t')
                if len(tokens) == 1:
                    # strip() removed the character itself; restore it as an ideographic space
                    tokens.insert(0, '\u3000')
                token, i = tokens
                char_lexicon[token] = int(i)

        # Sanity checks
        for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
            assert special_word in char_lexicon, f"{special_word} not found in char.dic."

        # Build char_vocab from vocab
        char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
        # <bow> and <eow> must be present
        char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG])

        for word, index in vocab:
            char_vocab.add_word_lst(list(word))

        self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx
        # Sized from char_vocab; one extra slot is reserved for word padding
        # (the char representation at that position is all zeros)
        char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']),
                                      padding_idx=len(char_vocab))

        # Load pretrained weights; elmo_model holds the state_dicts of char_cnn and lstm
        elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu')

        char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight']

        found_char_count = 0
        for char, index in char_vocab:  # re-map the character embedding
            if char in char_lexicon:
                index_in_pre = char_lexicon.get(char)
                found_char_count += 1
            else:
                index_in_pre = char_lexicon[OOV_TAG]
            char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre]

        print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
        # Build the words -> chars mapping
        max_chars = config['char_cnn']['max_characters_per_token']

        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars),
                                                                fill_value=len(char_vocab),
                                                                dtype=torch.long),
                                                     requires_grad=False)
        for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]:
            if len(word) + 2 > max_chars:
                word = word[:max_chars - 2]
            if index == self._pad_index:
                continue
            elif word == BOS_TAG or word == EOS_TAG:
                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [
                    char_vocab.to_index(EOW_TAG)]
                char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
            else:
                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [
                    char_vocab.to_index(EOW_TAG)]
                char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids))
            self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)

        self.char_vocab = char_vocab

        self.token_embedder = ConvTokenEmbedder(
            config, self.weight_file, None, char_emb_layer)
        elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight
        self.token_embedder.load_state_dict(elmo_model["char_cnn"])

        self.output_dim = config['lstm']['projection_dim']

        # lstm encoder
        self.encoder = ElmobiLm(config)
        self.encoder.load_state_dict(elmo_model["lstm"])

        if cache_word_reprs:
            if config['char_cnn']['embedding']['dim'] > 0:  # only useful when chars are used
                print("Start to generate cache word representations.")
                batch_size = 320
                # bos eos
                word_size = self.words_to_chars_embedding.size(0)
                num_batches = word_size // batch_size + \
                              int(word_size % batch_size != 0)

                self.cached_word_embedding = nn.Embedding(word_size,
                                                          config['lstm']['projection_dim'])
                with torch.no_grad():
                    for i in range(num_batches):
                        words = torch.arange(i * batch_size,
                                             min((i + 1) * batch_size, word_size)).long()
                        chars = self.words_to_chars_embedding[words].unsqueeze(1)  # batch_size x 1 x max_chars
                        word_reprs = self.token_embedder(words.unsqueeze(1),
                                                         chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
                        self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)

                    print("Finish generating cached word representations. Going to delete the character encoder.")
                del self.token_embedder, self.words_to_chars_embedding
            else:
                print("There is no need to cache word representations, since no character information is used.")

    def forward(self, words):
        """

        :param words: batch_size x max_len
        :return: num_layers x batch_size x max_len x hidden_size
        """
        # Extend each sequence with <bos> and <eos>
        batch_size, max_len = words.size()
        expanded_words = words.new_zeros(batch_size, max_len + 2)  # relies on pad always being 0
        seq_len = words.ne(self._pad_index).sum(dim=-1)
        expanded_words[:, 1:-1] = words
        expanded_words[:, 0].fill_(self.bos_index)
        expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index
        seq_len = seq_len + 2
        zero_tensor = expanded_words.new_zeros(expanded_words.shape)
        mask = (expanded_words == zero_tensor).unsqueeze(-1)
        if hasattr(self, 'cached_word_embedding'):
            token_embedding = self.cached_word_embedding(expanded_words)
        else:
            if hasattr(self, 'words_to_chars_embedding'):
                chars = self.words_to_chars_embedding[expanded_words]
            else:
                chars = None
            token_embedding = self.token_embedder(expanded_words, chars)  # batch_size x max_len x embed_dim

        encoder_output = self.encoder(token_embedding, seq_len)
        if encoder_output.size(2) < max_len + 2:
            # Pad the encoder output back to max_len + 2 along the length dimension
            num_layers, _, output_len, hidden_size = encoder_output.size()
            dummy_tensor = encoder_output.new_zeros(num_layers, batch_size,
                                                    max_len + 2 - output_len, hidden_size)
            encoder_output = torch.cat((encoder_output, dummy_tensor), 2)
        sz = encoder_output.size()  # 2, batch_size, max_len, hidden_size
        token_embedding = token_embedding.masked_fill(mask, 0)
        token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3])
        encoder_output = torch.cat((token_embedding, encoder_output), dim=0)

        # Drop <bos> and <eos>. The removal is not exact, but it should not affect the final result.
        encoder_output = encoder_output[:, :, 1:-1]
        return encoder_output

+ 887
- 47
fastNLP/modules/encoder/bert.py View File

@@ -1,79 +1,919 @@




"""
这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你
有用,也请引用一下他们。
"""


import collections

import unicodedata
import copy
import json
import math
import os import os
from torch import nn
import torch import torch
from ...io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR
from ._bert import _WordPieceBertModel, BertModel
from torch import nn
import glob
import sys


CONFIG_FILE = 'bert_config.json'


class BertWordPieceEncoder(nn.Module):

class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`.
    """

    def __init__(self,
                 vocab_size_or_config_json_file,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 hidden_dropout_prob=0.1,
                 attention_probs_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=2,
                 initializer_range=0.02,
                 layer_norm_eps=1e-12):
        """Constructs BertConfig.

        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`,
                or the path of a JSON config file whose keys become attributes.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probability for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The stddev of the truncated_normal_initializer for
                initializing all weight matrices.
            layer_norm_eps: The epsilon used by LayerNorm.

        Raises:
            ValueError: if the first argument is neither a ``str`` nor an ``int``.
        """
        if isinstance(vocab_size_or_config_json_file, str):
            # A path: every key of the JSON file becomes an attribute. The keyword
            # defaults above are NOT applied on this branch.
            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self.hidden_size = hidden_size
            self.num_hidden_layers = num_hidden_layers
            self.num_attention_heads = num_attention_heads
            self.hidden_act = hidden_act
            self.intermediate_size = intermediate_size
            self.hidden_dropout_prob = hidden_dropout_prob
            self.attention_probs_dropout_prob = attention_probs_dropout_prob
            self.max_position_embeddings = max_position_embeddings
            self.type_vocab_size = type_vocab_size
            self.initializer_range = initializer_range
            self.layer_norm_eps = layer_norm_eps
        else:
            raise ValueError("First argument must be either a vocabulary size (int) "
                             "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a config from a Python dictionary of parameters.

        The instance starts from the keyword defaults (with ``vocab_size=-1``) and is then
        overridden by the dictionary entries. ``cls`` is used so subclasses get their own type.
        """
        config = cls(vocab_size_or_config_json_file=-1)
        for key, value in json_object.items():
            config.__dict__[key] = value
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a config from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary (a deep copy of its attributes)."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

    def to_json_file(self, json_file_path):
        """Save this instance to a json file."""
        with open(json_file_path, "w", encoding='utf-8') as writer:
            writer.write(self.to_json_string())



:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased``
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层
:param bool requires_grad: 是否需要gradient。
def gelu(x):
    """Gaussian Error Linear Unit, computed exactly with the error function."""
    gaussian_cdf_x2 = 1.0 + torch.erf(x / math.sqrt(2.0))
    return x * 0.5 * gaussian_cdf_x2


def swish(x):
    """Swish activation: the input scaled by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
    """Layer normalisation in the TF style (epsilon inside the square root)."""

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        # Learnable per-feature scale and shift, starting as the identity transform.
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        # Statistics are taken over the last dimension only.
        mean = x.mean(-1, keepdim=True)
        variance = (x - mean).pow(2).mean(-1, keepdim=True)
        normalized = (x - mean) / torch.sqrt(variance + self.variance_epsilon)
        return self.weight * normalized + self.bias


class BertEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings.
    """

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.word_embeddings = nn.Embedding(config.vocab_size, hidden, padding_idx=0)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, hidden)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)

        # The attribute is deliberately named ``LayerNorm`` (not snake_case) so that it matches
        # the TensorFlow variable name and TensorFlow checkpoint files can be loaded.
        self.LayerNorm = BertLayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed ``input_ids``; ``token_type_ids`` defaults to all zeros."""
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # Position ids 0..seq_length-1, broadcast to the shape of input_ids.
        positions = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        positions = positions.unsqueeze(0).expand_as(input_ids)

        summed = (self.word_embeddings(input_ids)
                  + self.position_embeddings(positions)
                  + self.token_type_embeddings(token_type_ids))
        return self.dropout(self.LayerNorm(summed))


class BertSelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention."""

    def __init__(self, config):
        super().__init__()
        heads = config.num_attention_heads
        if config.hidden_size % heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention "
                "heads (%d)" % (config.hidden_size, heads))
        self.num_attention_heads = heads
        self.attention_head_size = int(config.hidden_size / heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):
        """Split the last dimension into (heads, head_size) and move heads to dimension 1."""
        split_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(*split_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        """Attend over ``hidden_states``; ``attention_mask`` is added to the raw scores."""
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Raw attention scores: dot product between "query" and "key", scaled by sqrt(head size).
        raw_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        scaled_scores = raw_scores / math.sqrt(self.attention_head_size)
        # The additive mask is precomputed for all layers in BertModel.forward().
        masked_scores = scaled_scores + attention_mask

        # Normalize the attention scores to probabilities.
        probabilities = nn.functional.softmax(masked_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        probabilities = self.dropout(probabilities)

        # Weighted sum of values, then merge the heads back into one feature dimension.
        per_head = torch.matmul(probabilities, value_layer).permute(0, 2, 1, 3).contiguous()
        merged_shape = per_head.size()[:-2] + (self.all_head_size,)
        return per_head.view(*merged_shape)


class BertSelfOutput(nn.Module):
    """Output projection of the attention sub-layer: dense, dropout, residual add, layer norm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        projected = self.dropout(self.dense(hidden_states))
        # Residual connection with the sub-layer input, normalised afterwards.
        return self.LayerNorm(projected + input_tensor)


class BertAttention(nn.Module):
    """Self-attention followed by its output projection (with residual and layer norm)."""

    def __init__(self, config):
        super().__init__()
        # Attribute names ``self`` and ``output`` are part of the parameter names in state dicts.
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        attended = self.self(input_tensor, attention_mask)
        return self.output(attended, input_tensor)


class BertIntermediate(nn.Module):
    """Feed-forward expansion: a linear map to ``intermediate_size`` followed by an activation."""

    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        # ``hidden_act`` is either a key of ACT2FN or an already-callable activation.
        # (The former Python 2 ``unicode`` check was dead code referencing an undefined name.)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BertOutput(nn.Module):
    """Feed-forward contraction back to ``hidden_size``: dense, dropout, residual add, layer norm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        contracted = self.dropout(self.dense(hidden_states))
        # Residual connection with the sub-layer input, normalised afterwards.
        return self.LayerNorm(contracted + input_tensor)


class BertLayer(nn.Module):
    """One Transformer encoder layer: attention block followed by the feed-forward block."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        attended = self.attention(hidden_states, attention_mask)
        expanded = self.intermediate(attended)
        # The attention output is the residual input of the feed-forward block.
        return self.output(expanded, attended)


class BertEncoder(nn.Module):
    """Stack of ``config.num_hidden_layers`` BertLayer modules applied in sequence."""

    def __init__(self, config):
        super().__init__()
        # A single prototype layer is built and deep-copied, so all layers start from the same weights.
        prototype = BertLayer(config)
        self.layer = nn.ModuleList(
            [copy.deepcopy(prototype) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
        """Return a list with every layer's output, or with only the final hidden states."""
        if output_all_encoded_layers:
            per_layer = []
            for block in self.layer:
                hidden_states = block(hidden_states, attention_mask)
                per_layer.append(hidden_states)
            return per_layer

        for block in self.layer:
            hidden_states = block(hidden_states, attention_mask)
        return [hidden_states]


class BertPooler(nn.Module):
    """Pool a sequence by passing the first token's hidden state through dense + tanh."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "Pooling" here simply means taking the hidden state of the first token.
        first_token = hidden_states[:, 0]
        return self.activation(self.dense(first_token))


class BertModel(nn.Module):
    """BERT(Bidirectional Embedding Representations from Transformers).

    If you want to use pretrained weights, download them from the following URLs.
    sources::

        'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin",
        'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin",
        'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin",
        'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin",
        'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin",
        'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin",
        'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin",
        'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin",
        'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin",
        'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin",
        'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin",
        'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin",
        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin"


    Build a BERT model from pretrained weights::

        model = BertModel.from_pretrained("path/to/weights/directory")

    Build a BERT model with randomly initialised weights::

        model = BertModel(BertConfig(vocab_size_or_config_json_file=30522))

    :param BertConfig config: carries all hyper-parameters (vocabulary size, hidden size, number of
        hidden layers and attention heads, intermediate size, activation, dropout probabilities,
        maximum position embeddings, type vocabulary size, initializer range, layer-norm epsilon).
    """

    def __init__(self, config, *inputs, **kwargs):
        # nn.Module is initialised exactly once, before any attribute is assigned.
        super(BertModel, self).__init__()
        if not isinstance(config, BertConfig):
            raise ValueError(
                "Parameter config in `{}(config)` should be an instance of class `BertConfig`. "
                "To create a model from a Google pretrained model use "
                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
                    self.__class__.__name__, self.__class__.__name__
                ))
        self.config = config
        self.hidden_size = self.config.hidden_size
        self.embeddings = BertEmbeddings(config)
        self.encoder = BertEncoder(config)
        self.pooler = BertPooler(config)
        self.apply(self.init_bert_weights)

    def init_bert_weights(self, module):
        """ Initialize the weights.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            # Slightly different from the TF version which uses truncated_normal for initialization
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
        """Encode ``input_ids``.

        :param input_ids: token indices.
        :param token_type_ids: segment indices; defaults to all zeros.
        :param attention_mask: 1 for positions to attend to, 0 for masked positions; defaults to all ones.
        :param bool output_all_encoded_layers: return every layer's output (a list) instead of only
            the last layer's output (a tensor).
        :return: ``(encoded_layers, pooled_output)``
        """
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        embedding_output = self.embeddings(input_ids, token_type_ids)
        encoded_layers = self.encoder(embedding_output,
                                      extended_attention_mask,
                                      output_all_encoded_layers=output_all_encoded_layers)
        sequence_output = encoded_layers[-1]
        pooled_output = self.pooler(sequence_output)
        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]
        return encoded_layers, pooled_output

    @classmethod
    def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs):
        """Build a model from a directory holding ``bert_config.json`` and exactly one ``*.bin`` file.

        Recognised keyword arguments (removed before the rest is forwarded to the constructor):
        ``state_dict`` (use this state dict instead of reading the ``*.bin`` file), and
        ``cache_dir`` / ``from_tf`` (accepted but not used).

        :raises FileNotFoundError: no ``*.bin`` file is present and no ``state_dict`` was given.
        :raises FileExistsError: more than one ``*.bin`` file is present.
        """
        state_dict = kwargs.pop('state_dict', None)
        # Popped only so they are not forwarded to the constructor.
        kwargs.pop('cache_dir', None)
        kwargs.pop('from_tf', None)
        # Load config
        config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
        config = BertConfig.from_json_file(config_file)
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None:
            files = glob.glob(os.path.join(pretrained_model_dir, '*.bin'))
            if len(files) == 0:
                raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}")
            elif len(files) > 1:
                raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}")
            weights_path = files[0]
            state_dict = torch.load(weights_path, map_location='cpu')

        # Rename TF-style layer-norm parameters (gamma/beta) to weight/bias.
        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if 'gamma' in key:
                new_key = key.replace('gamma', 'weight')
            if 'beta' in key:
                new_key = key.replace('beta', 'bias')
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, '_metadata', None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=''):
            # Recursively load each sub-module's parameters under its dotted prefix.
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + '.')

        # Checkpoint keys carry a ``bert.`` prefix unless the model itself has a ``bert`` attribute.
        load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
        if len(missing_keys) > 0:
            print("Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            print("Weights from pretrained model not used in {}: {}".format(
                model.__class__.__name__, unexpected_keys))
        return model


def whitespace_tokenize(text):
    """Strip surrounding whitespace from *text* and split it on whitespace runs."""
    stripped = text.strip()
    return stripped.split() if stripped else []


class WordpieceTokenizer(object):
    """Runs WordPiece tokenization."""

    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word

    def _greedy_pieces(self, letters):
        """Split one word (a list of characters) greedily; return None if some part has no match."""
        pieces = []
        total = len(letters)
        begin = 0
        while begin < total:
            # Try the longest remaining substring first, shrinking from the right.
            for stop in range(total, begin, -1):
                candidate = "".join(letters[begin:stop])
                if begin > 0:
                    # Non-initial pieces carry the continuation marker.
                    candidate = "##" + candidate
                if candidate in self.vocab:
                    pieces.append(candidate)
                    begin = stop
                    break
            else:
                return None
        return pieces

    def tokenize(self, text):
        """Tokenizes a piece of text into its word pieces.

        This uses a greedy longest-match-first algorithm to perform tokenization
        using the given vocabulary.

        For example:
          input = "unaffable"
          output = ["un", "##aff", "##able"]

        Args:
          text: A single token or whitespace separated tokens. This should have
            already been passed through `BasicTokenizer`.

        Returns:
          A list of wordpiece tokens. A word that is longer than
          ``max_input_chars_per_word`` or cannot be fully matched yields ``unk_token``.
        """
        output_tokens = []
        for word in whitespace_tokenize(text):
            letters = list(word)
            if len(letters) > self.max_input_chars_per_word:
                output_tokens.append(self.unk_token)
                continue

            pieces = self._greedy_pieces(letters)
            if pieces is None:
                output_tokens.append(self.unk_token)
            else:
                output_tokens.extend(pieces)
        return output_tokens


def load_vocab(vocab_file):
    """Read a vocabulary file into an ordered ``token -> index`` mapping.

    Each line is stripped of surrounding whitespace and mapped to its
    zero-based line number. A token that appears more than once keeps the
    index of its last occurrence.

    :param vocab_file: path of a UTF-8 text file with one token per line.
    :return: ``collections.OrderedDict`` from token to index.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.strip()] = index
    return vocab

class BasicTokenizer(object):
    """Performs basic tokenization: text cleanup, lower casing, punctuation splitting."""

    def __init__(self,
                 do_lower_case=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Create a BasicTokenizer.

        :param do_lower_case: whether tokens are lower-cased (and stripped of
            accents) before punctuation splitting.
        :param never_split: tokens that are neither lower-cased nor split.
        """
        self.do_lower_case = do_lower_case
        self.never_split = never_split

    def tokenize(self, text):
        """Clean *text* and return its list of basic tokens."""
        # CJK characters are surrounded by spaces so that each one becomes its
        # own token. This is applied to every input; it is harmless for text
        # that contains no CJK characters.
        prepared = self._tokenize_chinese_chars(self._clean_text(text))

        pieces = []
        for word in whitespace_tokenize(prepared):
            if self.do_lower_case and word not in self.never_split:
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Return *text* with combining marks (Unicode category ``Mn``) removed."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text):
        """Split *text* so that every punctuation character is its own piece."""
        if text in self.never_split:
            return [text]

        groups = []
        # True while the most recent group is a word that may still grow.
        extend_last = False
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                extend_last = False
            elif extend_last:
                groups[-1].append(ch)
            else:
                groups.append([ch])
                extend_last = True

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Return *text* with a space inserted on both sides of every CJK character."""
        return "".join(
            " " + ch + " " if self._is_chinese_char(ord(ch)) else ch
            for ch in text
        )

    def _is_chinese_char(self, cp):
        """Tell whether code point *cp* lies in one of the CJK ideograph ranges below.

        Only the CJK Unified Ideographs blocks (and their extensions and
        compatibility forms) are covered. Hangul, Hiragana and Katakana live
        in other blocks and are written with spaces between words, so they
        are deliberately not treated as "chinese characters" here.
        See https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        """
        cjk_ranges = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_ranges)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters; turn every whitespace char into a space."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)


def _is_whitespace(char):
    """Tell whether *char* counts as whitespace.

    Space, tab, newline and carriage return always count (the last three are
    technically control characters but are treated as whitespace here), as
    does any character in Unicode category ``Zs``.
    """
    if char in (" ", "\t", "\n", "\r"):
        return True
    return unicodedata.category(char) == "Zs"


def _is_control(char):
    """Tell whether *char* is a control character.

    Tab, newline and carriage return are reported as non-control because the
    tokenizer counts them as whitespace. Anything else is a control character
    when its Unicode category starts with ``C``.
    """
    if char in ("\t", "\n", "\r"):
        return False
    return unicodedata.category(char).startswith("C")


def _is_punctuation(char):
    """Tell whether *char* is a punctuation character.

    Every ASCII character that is not a letter, digit or space is treated as
    punctuation, including ones such as "^", "$" and "`" that are outside the
    Unicode punctuation classes; this keeps the ASCII handling consistent.
    Beyond ASCII, a character is punctuation when its Unicode category starts
    with ``P``.
    """
    cp = ord(char)
    if 33 <= cp <= 47 or 58 <= cp <= 64 or 91 <= cp <= 96 or 123 <= cp <= 126:
        return True
    return unicodedata.category(char).startswith("P")


class BertTokenizer(object):
    """Runs end-to-end tokenization: punctuation splitting + wordpiece."""

    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
        """Construct a BertTokenizer.

        :param vocab_file: path to a one-wordpiece-per-line vocabulary file.
        :param do_lower_case: whether to lower case the input. Only has an
            effect when ``do_basic_tokenize`` is True.
        :param max_len: length above which ``convert_tokens_to_ids`` prints a
            warning; ``None`` means effectively unlimited (``int(1e12)``).
        :param do_basic_tokenize: whether to run basic tokenization before
            wordpiece tokenization.
        :param never_split: tokens that are never split during basic
            tokenization. Only has an effect when ``do_basic_tokenize`` is True.
        :raises ValueError: if ``vocab_file`` is not an existing file.
        """
        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                                  never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
        self.max_len = max_len if max_len is not None else int(1e12)

    def _reinit_on_new_vocab(self, vocab):
        """Re-initialise everything derived from the vocabulary.

        The vocabulary may be re-arranged after BERT has been loaded; call
        this afterwards so the tokenizer uses the new mapping.

        :param vocab: the new ``token -> index`` mapping.
        :return: None
        """
        self.vocab = vocab
        # Keep the reverse mapping in step with the new vocabulary, otherwise
        # convert_ids_to_tokens would keep answering from the old one.
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        """Tokenize *text* into word pieces.

        With ``do_basic_tokenize`` the text first goes through the basic
        tokenizer and every resulting token is split into word pieces;
        otherwise the wordpiece tokenizer is applied to the text directly.

        :return: list of word-piece strings.
        """
        if not self.do_basic_tokenize:
            return self.wordpiece_tokenizer.tokenize(text)

        split_tokens = []
        for token in self.basic_tokenizer.tokenize(text):
            split_tokens.extend(self.wordpiece_tokenizer.tokenize(token))
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """Convert a sequence of tokens into ids using the vocab.

        Prints a warning (and still returns the ids) when the sequence is
        longer than ``self.max_len``.

        :raises KeyError: if a token is not in the vocabulary.
        """
        ids = [self.vocab[token] for token in tokens]
        if len(ids) > self.max_len:
            print(
                "Token indices sequence length is longer than the specified maximum "
                " sequence length for this BERT model ({} > {}). Running this"
                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
            )
        return ids

    def convert_ids_to_tokens(self, ids):
        """Convert a sequence of ids into wordpiece tokens using the vocab.

        :raises KeyError: if an id is not in the vocabulary.
        """
        return [self.ids_to_tokens[i] for i in ids]

    def save_vocabulary(self, vocab_path):
        """Save the tokenizer vocabulary to a directory or file.

        :param vocab_path: a directory (the file is then named ``VOCAB_NAME``
            inside it) or the path of the file to write.
        :return: path of the file that was written.
        """
        index = 0
        if os.path.isdir(vocab_path):
            vocab_file = os.path.join(vocab_path, VOCAB_NAME)
        else:
            vocab_file = vocab_path
        with open(vocab_file, "w", encoding="utf-8") as writer:
            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    # A gap in the indices means the written line numbers will
                    # no longer match the in-memory ids; warn but keep going.
                    print("Saving vocabulary to {}: vocabulary indices are not consecutive."
                          " Please check that the vocabulary is not corrupted!".format(vocab_file))
                    index = token_index
                writer.write(token + u'\n')
                index += 1
        return vocab_file

    @classmethod
    def from_pretrained(cls, model_dir, *inputs, **kwargs):
        """Build a tokenizer from the ``VOCAB_NAME`` file inside *model_dir*.

        ``max_len`` is capped at 512; a missing or ``None`` ``max_len`` is
        treated as 512.

        :param model_dir: directory containing the vocabulary file.
        :return: the constructed tokenizer.
        """
        pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME)
        print("loading vocabulary file {}".format(pretrained_model_name_or_path))
        max_len = 512
        requested_len = kwargs.get('max_len')
        # An explicit max_len=None used to reach min(None, 512) and raise TypeError.
        kwargs['max_len'] = max_len if requested_len is None else min(requested_len, max_len)
        # Instantiate tokenizer.
        tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs)
        return tokenizer


@requires_grad.setter
def requires_grad(self, value):
for name, param in self.named_parameters():
param.requires_grad = value
VOCAB_NAME = 'vocab.txt'


@property
def embed_size(self):
return self._embed_size


def index_datasets(self, *datasets, field_name):
class _WordPieceBertModel(nn.Module):
    """Runs a BERT encoder directly on word-piece ids and returns the selected layers."""

    def __init__(self, model_dir: str, layers: str = '-1'):
        """Load tokenizer and encoder from *model_dir* and validate *layers*.

        :param model_dir: directory passed to ``BertTokenizer.from_pretrained``
            and ``BertModel.from_pretrained``.
        :param layers: comma-separated layer indices to return from ``forward``;
            negative values index from the end.
        """
        super().__init__()

        # The attribute name ``tokenzier`` (sic) is kept unchanged because code
        # outside this class may read it.
        self.tokenzier = BertTokenizer.from_pretrained(model_dir)
        self.encoder = BertModel.from_pretrained(model_dir)
        # Check that every requested layer index exists in the encoder.
        encoder_layer_number = len(self.encoder.encoder.layer)
        self.layers = list(map(int, layers.split(',')))
        for layer in self.layers:
            if layer < 0:
                assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                       f"a bert model with {encoder_layer_number} layers."
            else:
                assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                     f"a bert model with {encoder_layer_number} layers."

        self._cls_index = self.tokenzier.vocab['[CLS]']
        self._sep_index = self.tokenzier.vocab['[SEP]']
        self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed when generating word pieces

    def index_dataset(self, *datasets, field_name):
        """Add a ``word_pieces`` input field to every dataset.

        The field is produced with the BERT tokenizer. If a sequence does not
        already start with [CLS] / end with [SEP], they are added at the head
        and the tail. The pad value of ``word_pieces`` is set to BERT's pad id.

        :param datasets: DataSet objects to index.
        :param field_name: name of the field the word pieces are generated from.
        :return: None
        """
        def convert_words_to_word_pieces(words):
            word_pieces = []
            for word in words:
                tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word)
                word_pieces.extend(self.tokenzier.convert_tokens_to_ids(tokens))
            # `not word_pieces` covers an empty input, which would otherwise
            # raise IndexError on the [0] / [-1] look-ups.
            if not word_pieces or word_pieces[0] != self._cls_index:
                word_pieces.insert(0, self._cls_index)
            if word_pieces[-1] != self._sep_index:
                # append, not insert(-1, ...): insert(-1) would place [SEP]
                # *before* the last word piece instead of at the end.
                word_pieces.append(self._sep_index)
            return word_pieces

        for index, dataset in enumerate(datasets):
            try:
                dataset.apply_field(convert_words_to_word_pieces, field_name=field_name, new_field_name='word_pieces',
                                    is_input=True)
                dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
            except Exception as e:
                # Report which dataset failed, then let the error propagate.
                print(f"Exception happens when processing the {index} dataset.")
                raise e

    def forward(self, word_pieces, token_type_ids=None):
        """Encode *word_pieces* and stack the requested encoder layers.

        :param word_pieces: torch.LongTensor, batch_size x max_len
        :param token_type_ids: torch.LongTensor, batch_size x max_len
        :return: tensor of size len(self.layers) x batch_size x max_len x hidden_size
        """
        batch_size, max_len = word_pieces.size()

        # Positions holding the pad id are masked out of attention.
        attn_masks = word_pieces.ne(self._wordpiece_pad_index)
        bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
                                       output_all_encoded_layers=True)
        outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1)))
        for l_index, layer in enumerate(self.layers):
            outputs[l_index] = bert_outputs[layer]
        return outputs


+ 0
- 1083
fastNLP/modules/encoder/embedding.py
File diff suppressed because it is too large
View File


+ 0
- 28
fastNLP/modules/utils.py View File

@@ -1,6 +1,5 @@
from functools import reduce from functools import reduce


import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
import torch.nn.init as init import torch.nn.init as init
@@ -70,33 +69,6 @@ def initial_parameter(net, initial_method=None):
net.apply(weights_init) net.apply(weights_init)




def get_embeddings(init_embed):
    """Build an embedding module from *init_embed*.

    :param init_embed: one of

        - a tuple ``(num_embeddings, embedding_dim)``: a new ``nn.Embedding``
          of that size, initialised uniformly in
          ``[-sqrt(3 / embedding_dim), sqrt(3 / embedding_dim)]``;
        - an ``nn.Module``: returned as is;
        - a ``torch.Tensor``: used as the initial, trainable embedding weights;
        - an ``np.ndarray``: converted to a float32 tensor and used the same way.
    :return nn.Embedding embeddings:
    :raises TypeError: if *init_embed* is none of the types above.
    """
    if isinstance(init_embed, nn.Module):
        return init_embed

    if isinstance(init_embed, tuple):
        embedding = nn.Embedding(
            num_embeddings=init_embed[0], embedding_dim=init_embed[1])
        bound = np.sqrt(3 / embedding.weight.data.size(1))
        nn.init.uniform_(embedding.weight.data, a=-bound, b=bound)
        return embedding

    if isinstance(init_embed, np.ndarray):
        init_embed = torch.tensor(init_embed, dtype=torch.float32)
    if isinstance(init_embed, torch.Tensor):
        return nn.Embedding.from_pretrained(init_embed, freeze=False)

    raise TypeError(
        'invalid init_embed type: {}'.format((type(init_embed))))


def summary(model: nn.Module): def summary(model: nn.Module):
""" """
得到模型的总参数量 得到模型的总参数量


+ 2
- 0
reproduction/LSTM+self_attention_sentiment_analysis/README.md View File

@@ -1,5 +1,7 @@
# Prototype # Prototype


这是一个很旧版本的reproduction,待修改

## Word2Idx.py ## Word2Idx.py
A mapping model between words and indexes A mapping model between words and indexes




+ 9
- 5
reproduction/LSTM+self_attention_sentiment_analysis/main.py View File

@@ -1,6 +1,9 @@
# 这是一个很旧版本的代码

"""
import torch.nn.functional as F import torch.nn.functional as F


from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.trainer import Trainer
from fastNLP.core.utils import ClassPreprocess as Preprocess from fastNLP.core.utils import ClassPreprocess as Preprocess
from fastNLP.io.config_io import ConfigLoader from fastNLP.io.config_io import ConfigLoader
from fastNLP.io.config_io import ConfigSection from fastNLP.io.config_io import ConfigSection
@@ -8,7 +11,7 @@ from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loade
from fastNLP.models.base_model import BaseModel from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention from fastNLP.modules.aggregator.self_attention import SelfAttention
from fastNLP.modules.decoder.mlp import MLP from fastNLP.modules.decoder.mlp import MLP
from fastNLP.modules.encoder.embedding import Embedding as Embedding
from fastNLP.embeddings.embedding import Embedding as Embedding
from fastNLP.modules.encoder.lstm import LSTM from fastNLP.modules.encoder.lstm import LSTM


train_data_path = 'small_train_data.txt' train_data_path = 'small_train_data.txt'
@@ -61,12 +64,13 @@ class SELF_ATTENTION_YELP_CLASSIFICATION(BaseModel):


train_args = ConfigSection() train_args = ConfigSection()
ConfigLoader("good path").load_config('config.cfg',{"train": train_args}) ConfigLoader("good path").load_config('config.cfg',{"train": train_args})
train_args['vocab'] = len(word2index)
# train_args['vocab'] = len(word2index)




trainer = ClassificationTrainer(**train_args.data)
trainer = Trainer(**train_args.data)


# for k in train_args.__dict__.keys(): # for k in train_args.__dict__.keys():
# print(k, train_args[k]) # print(k, train_args[k])
model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args) model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args)
trainer.train(model,train_data , dev_data)
trainer.train()
"""

+ 2
- 2
reproduction/Star_transformer/train.py View File

@@ -1,7 +1,7 @@
from util import get_argparser, set_gpu, set_rng_seeds, add_model_args
from reproduction.Star_transformer.util import get_argparser, set_gpu, set_rng_seeds, add_model_args
seed = set_rng_seeds(15360) seed = set_rng_seeds(15360)
print('RNG SEED {}'.format(seed)) print('RNG SEED {}'.format(seed))
from datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN
from reproduction.Star_transformer.datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN
import torch.nn as nn import torch.nn as nn
import torch import torch
import numpy as np import numpy as np


+ 1
- 1
reproduction/Summarization/BertSum/model.py View File

@@ -2,7 +2,7 @@ import torch
from torch import nn from torch import nn
from torch.nn import init from torch.nn import init


from fastNLP.modules.encoder._bert import BertModel
from fastNLP.modules.encoder.bert import BertModel




class Classifier(nn.Module): class Classifier(nn.Module):


reproduction/joint_cws_parse/readme.md → reproduction/joint_cws_parse/README.md View File


+ 1
- 1
reproduction/joint_cws_parse/models/CharParser.py View File

@@ -12,7 +12,7 @@ from torch.nn import functional as F
from fastNLP.modules.dropout import TimestepDropout from fastNLP.modules.dropout import TimestepDropout
from fastNLP.modules.encoder.variational_rnn import VarLSTM from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP import seq_len_to_mask from fastNLP import seq_len_to_mask
from fastNLP.modules import Embedding
from fastNLP.embeddings import Embedding




def drop_input_independent(word_embeddings, dropout_emb): def drop_input_independent(word_embeddings, dropout_emb):


+ 4
- 4
reproduction/joint_cws_parse/train.py View File

@@ -2,15 +2,15 @@ import sys
sys.path.append('../..') sys.path.append('../..')


from reproduction.joint_cws_parse.data.data_loader import CTBxJointLoader from reproduction.joint_cws_parse.data.data_loader import CTBxJointLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from fastNLP.embeddings.static_embedding import StaticEmbedding
from torch import nn from torch import nn
from functools import partial from functools import partial
from reproduction.joint_cws_parse.models.CharParser import CharParser from reproduction.joint_cws_parse.models.CharParser import CharParser
from reproduction.joint_cws_parse.models.metrics import SegAppCharParseF1Metric, CWSMetric from reproduction.joint_cws_parse.models.metrics import SegAppCharParseF1Metric, CWSMetric
from fastNLP import cache_results, BucketSampler, Trainer
from fastNLP import BucketSampler, Trainer
from torch import optim from torch import optim
from reproduction.joint_cws_parse.models.callbacks import DevCallback, OptimizerCallback
from torch.optim.lr_scheduler import LambdaLR, StepLR
from reproduction.joint_cws_parse.models.callbacks import DevCallback
from torch.optim.lr_scheduler import StepLR
from fastNLP import Tester from fastNLP import Tester
from fastNLP import GradientClipCallback, LRScheduler from fastNLP import GradientClipCallback, LRScheduler
import os import os


+ 4
- 0
reproduction/matching/data/MatchingDataLoader.py View File

@@ -1,3 +1,7 @@
"""
这个文件的内容已合并到fastNLP.io.data_loader里,这个文件的内容不再更新
"""



import os import os




+ 1
- 2
reproduction/matching/matching_bert.py View File

@@ -3,9 +3,8 @@ import numpy as np
import torch import torch


from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam
from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader


from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \
MNLILoader, QNLILoader, QuoraLoader
from reproduction.matching.model.bert import BertForNLI from reproduction.matching.model.bert import BertForNLI






+ 2
- 3
reproduction/matching/matching_cntn.py View File

@@ -1,11 +1,10 @@
import argparse import argparse
import torch import torch
import os


from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.modules.encoder.embedding import StaticEmbedding
from fastNLP.embeddings import StaticEmbedding
from fastNLP.io.data_loader import QNLILoader, RTELoader, SNLILoader, MNLILoader


from reproduction.matching.data.MatchingDataLoader import QNLILoader, RTELoader, SNLILoader, MNLILoader
from reproduction.matching.model.cntn import CNTNModel from reproduction.matching.model.cntn import CNTNModel


# define hyper-parameters # define hyper-parameters


+ 5
- 6
reproduction/matching/matching_esim.py View File

@@ -7,11 +7,10 @@ from torch.optim.lr_scheduler import StepLR


from fastNLP.core import Trainer, Tester, AccuracyMetric, Const from fastNLP.core import Trainer, Tester, AccuracyMetric, Const
from fastNLP.core.callback import GradientClipCallback, LRScheduler from fastNLP.core.callback import GradientClipCallback, LRScheduler
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding

from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \
MNLILoader, QNLILoader, QuoraLoader
from reproduction.matching.model.esim import ESIMModel
from fastNLP.embeddings.static_embedding import StaticEmbedding
from fastNLP.embeddings.elmo_embedding import ElmoEmbedding
from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader
from fastNLP.models.snli import ESIM




# define hyper-parameters # define hyper-parameters
@@ -81,7 +80,7 @@ else:
raise RuntimeError(f'NOT support {arg.embedding} embedding yet!') raise RuntimeError(f'NOT support {arg.embedding} embedding yet!')


# define model # define model
model = ESIMModel(embedding, num_labels=len(data_info.vocabs[Const.TARGET]))
model = ESIM(embedding, num_labels=len(data_info.vocabs[Const.TARGET]))


# define optimizer and callback # define optimizer and callback
optimizer = Adamax(lr=arg.lr, params=model.parameters()) optimizer = Adamax(lr=arg.lr, params=model.parameters())


+ 5
- 11
reproduction/matching/matching_mwan.py View File

@@ -1,23 +1,17 @@
import sys

import os
import random import random


import numpy as np import numpy as np
import torch import torch
from torch.optim import Adadelta, SGD
from torch.optim import Adadelta
from torch.optim.lr_scheduler import StepLR from torch.optim.lr_scheduler import StepLR


from tqdm import tqdm

from fastNLP import CrossEntropyLoss from fastNLP import CrossEntropyLoss
from fastNLP import cache_results from fastNLP import cache_results
from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
from fastNLP.core.predictor import Predictor
from fastNLP.core.callback import GradientClipCallback, LRScheduler, FitlogCallback
from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding
from fastNLP.core import Trainer, Tester, AccuracyMetric, Const
from fastNLP.core.callback import LRScheduler, FitlogCallback
from fastNLP.embeddings import StaticEmbedding


from fastNLP.io.data_loader import MNLILoader, QNLILoader, QuoraLoader, SNLILoader, RTELoader
from fastNLP.io.data_loader import MNLILoader, QNLILoader, SNLILoader, RTELoader
from reproduction.matching.model.mwan import MwanModel from reproduction.matching.model.mwan import MwanModel


import fitlog import fitlog


+ 1
- 1
reproduction/matching/model/bert.py View File

@@ -4,7 +4,7 @@ import torch.nn as nn


from fastNLP.core.const import Const from fastNLP.core.const import Const
from fastNLP.models import BaseModel from fastNLP.models import BaseModel
from fastNLP.modules.encoder.bert import BertModel
from fastNLP.embeddings.bert import BertModel




class BertForNLI(BaseModel): class BertForNLI(BaseModel):


+ 1
- 1
reproduction/matching/model/cntn.py View File

@@ -6,7 +6,7 @@ import numpy as np
from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss


from fastNLP.models import BaseModel from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.embeddings.embedding import TokenEmbedding
from fastNLP.core.const import Const from fastNLP.core.const import Const






+ 1
- 2
reproduction/matching/model/esim.py View File

@@ -5,8 +5,7 @@ import torch.nn.functional as F
from torch.nn import CrossEntropyLoss from torch.nn import CrossEntropyLoss


from fastNLP.models import BaseModel from fastNLP.models import BaseModel
from fastNLP.modules.encoder.embedding import TokenEmbedding
from fastNLP.modules.encoder.lstm import LSTM
from fastNLP.embeddings.embedding import TokenEmbedding
from fastNLP.core.const import Const from fastNLP.core.const import Const
from fastNLP.core.utils import seq_len_to_mask from fastNLP.core.utils import seq_len_to_mask




+ 1
- 1
reproduction/seqence_labelling/cws/model/model.py View File

@@ -1,6 +1,6 @@
from torch import nn from torch import nn
import torch import torch
from fastNLP.modules import Embedding
from fastNLP.embeddings import Embedding
import numpy as np import numpy as np
from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay
from fastNLP.modules import LSTM from fastNLP.modules import LSTM


+ 2
- 4
reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py View File

@@ -1,7 +1,7 @@
import sys import sys
sys.path.append('../../..') sys.path.append('../../..')


from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, StackEmbedding
from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.core.vocabulary import VocabularyOption from fastNLP.core.vocabulary import VocabularyOption


from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -9,13 +9,11 @@ from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler from fastNLP import BucketSampler
from fastNLP import Const from fastNLP import Const
from torch.optim import SGD, Adam
from torch.optim import SGD
from fastNLP import GradientClipCallback from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler from fastNLP.core.callback import FitlogCallback, LRScheduler
from torch.optim.lr_scheduler import LambdaLR from torch.optim.lr_scheduler import LambdaLR
from fastNLP.core.optimizer import AdamW
# from reproduction.seqence_labelling.ner.model.swats import SWATS # from reproduction.seqence_labelling.ner.model.swats import SWATS
from reproduction.seqence_labelling.chinese_ner.callbacks import SaveModelCallback
from fastNLP import cache_results from fastNLP import cache_results


import fitlog import fitlog


+ 5
- 8
reproduction/seqence_labelling/ner/train_idcnn.py View File

@@ -1,21 +1,18 @@
from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
from fastNLP.core.callback import FitlogCallback, LRScheduler
from fastNLP.core.callback import LRScheduler
from fastNLP import GradientClipCallback from fastNLP import GradientClipCallback
from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import LambdaLR
from torch.optim import Adam
from fastNLP import Const from fastNLP import Const
from fastNLP import RandomSampler, BucketSampler
from fastNLP import BucketSampler
from fastNLP import SpanFPreRecMetric from fastNLP import SpanFPreRecMetric
from fastNLP import Trainer, Tester from fastNLP import Trainer, Tester
from fastNLP.core.metrics import MetricBase from fastNLP.core.metrics import MetricBase
from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
from fastNLP.core.utils import Option from fastNLP.core.utils import Option
from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.embeddings.embedding import StaticEmbedding
from fastNLP.core.utils import cache_results from fastNLP.core.utils import cache_results
from fastNLP.core.vocabulary import VocabularyOption from fastNLP.core.vocabulary import VocabularyOption
import fitlog
import sys
import torch.cuda import torch.cuda
import os import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'


+ 2
- 3
reproduction/seqence_labelling/ner/train_ontonote.py View File

@@ -2,18 +2,17 @@ import sys


sys.path.append('../../..') sys.path.append('../../..')


from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
from fastNLP.embeddings.embedding import CNNCharEmbedding


from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
from fastNLP import Trainer from fastNLP import Trainer
from fastNLP import SpanFPreRecMetric from fastNLP import SpanFPreRecMetric
from fastNLP import BucketSampler from fastNLP import BucketSampler
from fastNLP import Const from fastNLP import Const
from torch.optim import SGD, Adam
from torch.optim import SGD
from torch.optim.lr_scheduler import LambdaLR from torch.optim.lr_scheduler import LambdaLR
from fastNLP import GradientClipCallback from fastNLP import GradientClipCallback
from fastNLP.core.callback import FitlogCallback, LRScheduler from fastNLP.core.callback import FitlogCallback, LRScheduler
from reproduction.seqence_labelling.ner.model.swats import SWATS


import fitlog import fitlog
fitlog.debug() fitlog.debug()


+ 1
- 1
reproduction/text_classification/model/HAN.py View File

@@ -1,7 +1,7 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from torch.autograd import Variable from torch.autograd import Variable
from fastNLP.modules.utils import get_embeddings
from fastNLP.embeddings.utils import get_embeddings
from fastNLP.core import Const as C from fastNLP.core import Const as C






+ 1
- 1
reproduction/text_classification/model/dpcnn.py View File

@@ -1,6 +1,6 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
from fastNLP.modules.utils import get_embeddings
from fastNLP.embeddings.utils import get_embeddings
from fastNLP.core import Const as C from fastNLP.core import Const as C






+ 3
- 5
reproduction/text_classification/train_HAN.py View File

@@ -9,11 +9,9 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"


from fastNLP.core.const import Const as C from fastNLP.core.const import Const as C
from fastNLP.core import LRScheduler from fastNLP.core import LRScheduler
import torch.nn as nn
from fastNLP.io.dataset_loader import SSTLoader
from reproduction.text_classification.data.yelpLoader import yelpLoader
from fastNLP.io.data_loader import YelpLoader
from reproduction.text_classification.model.HAN import HANCLS from reproduction.text_classification.model.HAN import HANCLS
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP.embeddings import StaticEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer from fastNLP.core.trainer import Trainer
from torch.optim import SGD from torch.optim import SGD
@@ -44,7 +42,7 @@ ops = Config()


##1.task相关信息:利用dataloader载入dataInfo ##1.task相关信息:利用dataloader载入dataInfo


datainfo = yelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
datainfo = YelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
print(len(datainfo.datasets['train'])) print(len(datainfo.datasets['train']))
print(len(datainfo.datasets['test'])) print(len(datainfo.datasets['test']))




+ 2
- 9
reproduction/text_classification/train_awdlstm.py View File

@@ -5,20 +5,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'



import torch.nn as nn

from data.IMDBLoader import IMDBLoader from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from fastNLP.embeddings import StaticEmbedding
from model.awd_lstm import AWDLSTMSentiment from model.awd_lstm import AWDLSTMSentiment


from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from fastNLP import Trainer
from torch.optim import Adam from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse




class Config(): class Config():


+ 4
- 10
reproduction/text_classification/train_char_cnn.py View File

@@ -7,23 +7,17 @@ import sys
sys.path.append('../..') sys.path.append('../..')
from fastNLP.core.const import Const as C from fastNLP.core.const import Const as C
import torch.nn as nn import torch.nn as nn
from data.yelpLoader import yelpLoader
from fastNLP.io.data_loader import YelpLoader
#from data.sstLoader import sst2Loader #from data.sstLoader import sst2Loader
from fastNLP.io.data_loader.sst import SST2Loader
from data.IMDBLoader import IMDBLoader
from model.char_cnn import CharacterLevelCNN from model.char_cnn import CharacterLevelCNN
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.models.cnn_text_classification import CNNText
from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.core.trainer import Trainer from fastNLP.core.trainer import Trainer
from torch.optim import SGD from torch.optim import SGD
from torch.autograd import Variable from torch.autograd import Variable
import torch import torch
from fastNLP import BucketSampler
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
from torch.optim.lr_scheduler import LambdaLR
from fastNLP.core import LRScheduler from fastNLP.core import LRScheduler
from utils.util_init import set_rng_seeds


##hyper ##hyper
#todo 这里加入fastnlp的记录 #todo 这里加入fastnlp的记录
@@ -117,7 +111,7 @@ ops=Config
##1.task相关信息:利用dataloader载入dataInfo ##1.task相关信息:利用dataloader载入dataInfo
#dataloader=SST2Loader() #dataloader=SST2Loader()
#dataloader=IMDBLoader() #dataloader=IMDBLoader()
dataloader=yelpLoader(fine_grained=True)
dataloader=YelpLoader(fine_grained=True)
datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False)
char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"] char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
ops.number_of_characters=len(char_vocab) ops.number_of_characters=len(char_vocab)


+ 5
- 6
reproduction/text_classification/train_dpcnn.py View File

@@ -3,15 +3,14 @@
import torch.cuda import torch.cuda
from fastNLP.core.utils import cache_results from fastNLP.core.utils import cache_results
from torch.optim import SGD from torch.optim import SGD
from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
from torch.optim.lr_scheduler import CosineAnnealingLR
from fastNLP.core.trainer import Trainer from fastNLP.core.trainer import Trainer
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
from fastNLP.embeddings import StaticEmbedding
from reproduction.text_classification.model.dpcnn import DPCNN from reproduction.text_classification.model.dpcnn import DPCNN
from data.yelpLoader import yelpLoader
from fastNLP.io.data_loader import YelpLoader
from fastNLP.core.sampler import BucketSampler from fastNLP.core.sampler import BucketSampler
import torch.nn as nn
from fastNLP.core import LRScheduler, Callback
from fastNLP.core import LRScheduler
from fastNLP.core.const import Const as C from fastNLP.core.const import Const as C
from fastNLP.core.vocabulary import VocabularyOption from fastNLP.core.vocabulary import VocabularyOption
from utils.util_init import set_rng_seeds from utils.util_init import set_rng_seeds
@@ -59,7 +58,7 @@ print('RNG SEED: {}'.format(ops.seed))


@cache_results(ops.model_dir_or_name+'-data-cache') @cache_results(ops.model_dir_or_name+'-data-cache')
def load_data(): def load_data():
datainfo = yelpLoader(fine_grained=True, lower=True).process(
datainfo = YelpLoader(fine_grained=True, lower=True).process(
paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op) paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op)
for ds in datainfo.datasets.values(): for ds in datainfo.datasets.values():
ds.apply_field(len, C.INPUT, C.INPUT_LEN) ds.apply_field(len, C.INPUT, C.INPUT_LEN)


+ 3
- 10
reproduction/text_classification/train_lstm.py View File

@@ -3,20 +3,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'



import torch.nn as nn

from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from fastNLP.io.data_loader import IMDBLoader
from fastNLP.embeddings import StaticEmbedding
from model.lstm import BiLSTMSentiment from model.lstm import BiLSTMSentiment


from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from fastNLP import Trainer
from torch.optim import Adam from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse




class Config(): class Config():


+ 3
- 10
reproduction/text_classification/train_lstm_att.py View File

@@ -3,20 +3,13 @@ import os
os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/' os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches' os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'



import torch.nn as nn

from data.IMDBLoader import IMDBLoader
from fastNLP.modules.encoder.embedding import StaticEmbedding
from fastNLP.io.data_loader import IMDBLoader
from fastNLP.embeddings import StaticEmbedding
from model.lstm_self_attention import BiLSTM_SELF_ATTENTION from model.lstm_self_attention import BiLSTM_SELF_ATTENTION


from fastNLP.core.const import Const as C
from fastNLP import CrossEntropyLoss, AccuracyMetric from fastNLP import CrossEntropyLoss, AccuracyMetric
from fastNLP import Trainer, Tester
from fastNLP import Trainer
from torch.optim import Adam from torch.optim import Adam
from fastNLP.io.model_io import ModelLoader, ModelSaver

import argparse




class Config(): class Config():


+ 26
- 0
test/embeddings/test_char_embedding.py View File

@@ -0,0 +1,26 @@
import unittest

import torch

from fastNLP import Vocabulary, DataSet, Instance
from fastNLP.embeddings.char_embedding import LSTMCharEmbedding, CNNCharEmbedding


class TestCharEmbed(unittest.TestCase):
    """Output-shape checks for the character-level embedding classes."""

    def _assert_embed_shape(self, embed_cls):
        """Embed a fixed batch of word ids with ``embed_cls`` and check the result size.

        A two-instance dataset is turned into a vocabulary (asserted to hold
        5 entries), ``embed_cls`` is built on it with ``embed_size=60``, and
        a (2, 3) batch of ids is expected to come back as (2, 3, 60).
        """
        instances = [Instance(words=['hello', 'world']), Instance(words=['Jack'])]
        dataset = DataSet(instances)
        vocab = Vocabulary().from_dataset(dataset, field_name='words')
        self.assertEqual(len(vocab), 5)

        embed = embed_cls(vocab, embed_size=60)
        word_ids = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
        output = embed(word_ids)
        self.assertEqual(tuple(output.size()), (2, 3, 60))

    def test_case_1(self):
        """LSTMCharEmbedding yields one 60-dim vector per input word id."""
        self._assert_embed_shape(LSTMCharEmbedding)

    def test_case_2(self):
        """CNNCharEmbedding yields one 60-dim vector per input word id."""
        self._assert_embed_shape(CNNCharEmbedding)

+ 1
- 1
test/modules/encoder/test_bert.py View File

@@ -8,7 +8,7 @@ from fastNLP.models.bert import BertModel


class TestBert(unittest.TestCase): class TestBert(unittest.TestCase):
def test_bert_1(self): def test_bert_1(self):
from fastNLP.modules.encoder._bert import BertConfig
from fastNLP.modules.encoder.bert import BertConfig
config = BertConfig(32000) config = BertConfig(32000)
model = BertModel(config) model = BertModel(config)




Loading…
Cancel
Save