diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 1c7d33c5..0b010c02 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -540,7 +540,7 @@ class Vocabulary(object): vocab = Vocabulary() for line in f: - line = line.strip() + line = line.strip('\n') if line: name, value = line.split() if name in ('max_size', 'min_freq'): @@ -557,7 +557,7 @@ class Vocabulary(object): no_create_entry_counter = {} word2idx = {} for line in f: - line = line.strip() + line = line.strip('\n') if line: parts = line.split('\t') word,count,idx,no_create_entry = parts[0], int(parts[1]), int(parts[2]), int(parts[3]) diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 1b3b6e83..dae75995 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -16,6 +16,9 @@ __all__ = [ "RobertaEmbedding", "RobertaWordPieceEncoder", + "TransformersEmbedding", + "TransformersWordPieceEncoder", + "GPT2Embedding", "GPT2WordPieceEncoder", @@ -32,6 +35,7 @@ from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding from .bert_embedding import BertEmbedding, BertWordPieceEncoder from .roberta_embedding import RobertaEmbedding, RobertaWordPieceEncoder +from .transformers_embedding import TransformersEmbedding, TransformersWordPieceEncoder from .gpt2_embedding import GPT2WordPieceEncoder, GPT2Embedding from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding from .stack_embedding import StackEmbedding diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index e3b91934..ec2ba26b 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -294,8 +294,7 @@ class BertWordPieceEncoder(nn.Module): sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len sep_mask_cumsum = sep_mask.long().flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) - if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 - token_type_ids = token_type_ids.eq(0).long() + token_type_ids = token_type_ids[:, :1].__xor__(token_type_ids) # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) @@ -465,8 +464,7 @@ class _BertWordModel(nn.Module): sep_mask = word_pieces.eq(self._sep_index).long() # batch_size x max_len sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) - if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 - token_type_ids = token_type_ids.eq(0).long() + token_type_ids = token_type_ids[:, :1].__xor__(token_type_ids) # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 else: token_type_ids = torch.zeros_like(word_pieces) # 2. 
获取hidden的结果,根据word_pieces进行对应的pool计算 diff --git a/fastNLP/embeddings/roberta_embedding.py b/fastNLP/embeddings/roberta_embedding.py index d81abaf5..90ea1085 100644 --- a/fastNLP/embeddings/roberta_embedding.py +++ b/fastNLP/embeddings/roberta_embedding.py @@ -196,7 +196,7 @@ class _RobertaWordModel(nn.Module): include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name) + self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name) self.encoder = RobertaModel.from_pretrained(model_dir_or_name) # 由于RobertaEmbedding中设置了padding_idx为1, 且使用了非常神奇的position计算方式,所以-2 self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2 @@ -233,14 +233,14 @@ class _RobertaWordModel(nn.Module): word = '' elif vocab.word_count[word]'] - self._sep_index = self.tokenzier.encoder[''] + self._cls_index = self.tokenizer.encoder[''] + self._sep_index = self.tokenizer.encoder[''] self._word_pad_index = vocab.padding_idx - self._wordpiece_pad_index = self.tokenzier.encoder[''] # 需要用于生成word_piece + self._wordpiece_pad_index = self.tokenizer.encoder[''] # 需要用于生成word_piece self.word_to_wordpieces = np.array(word_to_wordpieces) self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths)) logger.debug("Successfully generate word pieces.") @@ -352,20 +352,19 @@ class _RobertaWordModel(nn.Module): return outputs def save(self, folder): - self.tokenzier.save_pretrained(folder) + self.tokenizer.save_pretrained(folder) self.encoder.save_pretrained(folder) class RobertaWordPieceEncoder(nn.Module): r""" - 读取bert模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 + 读取roberta模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 RobertaWordPieceEncoder可以支持自动下载权重,当前支持的模型: en: roberta-base en-large: roberta-large """ - def __init__(self, model_dir_or_name: str = 'en', layers: str = '-1', pooled_cls: bool = False, word_dropout=0, dropout=0, requires_grad: bool = True, **kwargs): r""" @@ -417,11 +416,10 @@ class RobertaWordPieceEncoder(nn.Module): def forward(self, word_pieces, token_type_ids=None): r""" - 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 + 计算words的bert embedding表示。传入的words中应该自行包含>的tag。 :param words: batch_size x max_len - :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入), - 第一个[SEP]及之前为0, 第二个[SEP]及到第一个[SEP]之间为1; 第三个[SEP]及到第二个[SEP]之间为0,依次往后推。 + :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入)。 :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) """ word_pieces = self.drop_word(word_pieces) @@ -484,7 +482,7 @@ class _WordPieceRobertaModel(nn.Module): def __init__(self, model_dir_or_name: str, layers: str = '-1', pooled_cls: bool=False): super().__init__() - self.tokenzier = RobertaTokenizer.from_pretrained(model_dir_or_name) + self.tokenizer = RobertaTokenizer.from_pretrained(model_dir_or_name) self.encoder = RobertaModel.from_pretrained(model_dir_or_name) # 检查encoder_layer_number是否合理 encoder_layer_number = len(self.encoder.encoder.layer) @@ -504,25 +502,25 @@ class _WordPieceRobertaModel(nn.Module): assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ f"a RoBERTa model with {encoder_layer_number} layers." 
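The branch-free `token_type_ids` normalisation introduced in the bert_embedding.py hunks above replaces the old `if token_type_ids[0, 0].item(): ...` (which flipped the whole batch based on the first row only) with a row-wise XOR against each row's first element. A minimal sketch of the behaviour, assuming made-up word-piece ids where 102 stands for the [SEP] index (the real code uses `self._sep_index`):

import torch

word_pieces = torch.tensor([[101,  7,  8, 102,  9, 10, 102],
                            [101,  7,  8,   9, 10, 11, 102]])
sep_mask = word_pieces.eq(102)                                    # batch_size x max_len
sep_mask_cumsum = sep_mask.long().flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
# tensor([[0, 0, 0, 0, 1, 1, 1],
#         [1, 1, 1, 1, 1, 1, 1]])   <- the second row starts with 1 and needs flipping
token_type_ids = token_type_ids[:, :1].__xor__(token_type_ids)
# tensor([[0, 0, 0, 0, 1, 1, 1],
#         [0, 0, 0, 0, 0, 0, 0]])   <- every row now starts its first segment at 0

Because the XOR is applied per row, batches that mix sequences with an odd and an even number of [SEP] tokens are handled correctly, which the old whole-batch flip did not guarantee.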
- self._cls_index = self.tokenzier.encoder[''] - self._sep_index = self.tokenzier.encoder[''] - self._wordpiece_pad_index = self.tokenzier.encoder[''] # 需要用于生成word_piece - self._wordpiece_unknown_index = self.tokenzier.encoder[''] + self._cls_index = self.tokenizer.encoder[''] + self._sep_index = self.tokenizer.encoder[''] + self._wordpiece_pad_index = self.tokenizer.encoder[''] # 需要用于生成word_piece + self._wordpiece_unknown_index = self.tokenizer.encoder[''] self.pooled_cls = pooled_cls def index_datasets(self, *datasets, field_name, add_cls_sep=True, add_prefix_space=True): r""" - 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 - [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 + 使用roberta的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 + 会在首尾额外加入, 且将word_pieces这一列的pad value设置为了bert的pad value。 :param datasets: DataSet对象 - :param field_name: 基于哪一列index + :param field_name: 基于哪一列index, 这一列一般是raw_string :param bool add_cls_sep: 是否在句首句尾添加cls和sep的index :param bool add_prefix_space: 是否在句子开头添加空格,预训练时RoBERTa该值为True :return: """ - encode_func = partial(self.tokenzier.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space) + encode_func = partial(self.tokenizer.encode, add_special_tokens=add_cls_sep, add_prefix_space=add_prefix_space) for index, dataset in enumerate(datasets): try: @@ -555,5 +553,5 @@ class _WordPieceRobertaModel(nn.Module): return outputs def save(self, folder): - self.tokenzier.save_pretrained(folder) + self.tokenizer.save_pretrained(folder) self.encoder.save_pretrained(folder) \ No newline at end of file diff --git a/fastNLP/embeddings/transformers_embedding.py b/fastNLP/embeddings/transformers_embedding.py new file mode 100644 index 00000000..4d84fefd --- /dev/null +++ b/fastNLP/embeddings/transformers_embedding.py @@ -0,0 +1,495 @@ +r""" +将transformers包中的模型封装成fastNLP中的embedding对象 + +""" +import os +from itertools import chain +from functools import partial + +from torch import nn +import numpy as np +import torch + +from .contextual_embedding import ContextualEmbedding +from ..core import logger +from ..core.vocabulary import Vocabulary + + +class TransformersEmbedding(ContextualEmbedding): + r""" + 使用transformers中的模型对words进行编码的Embedding。建议将输入的words长度限制在430以内,而不要使用512(根据预训练模型参数,可能有变化)。这是由于 + 预训练的bert模型长度限制为512个token,而因为输入的word是未进行word piece分割的(word piece的分割由TransformersEmbedding在输入word + 时切分),在分割之后长度可能会超过最大长度限制。 + + Example:: + + >>> import torch + >>> from fastNLP import Vocabulary + >>> from fastNLP.embeddings import TransformersEmbedding + >>> from transformers import ElectraModel, ElectraTokenizer + >>> vocab = Vocabulary().add_word_lst("The whether is good .".split()) + >>> model = ElectraModel.from_pretrained("google/electra-small-generator") + >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-generator") + >>> embed = TransformersEmbedding(vocab, model_dir_or_name='en', requires_grad=False, layers='4,-2,-1') + >>> words = torch.LongTensor([[vocab.to_index(word) for word in "The whether is good .".split()]]) + >>> outputs = embed(words) + >>> outputs.size() + >>> # torch.Size([1, 5, 2304]) + + """ + def __init__(self, vocab, model, tokenizer, layers='-1', + pool_method: str = 'first', word_dropout=0, dropout=0, requires_grad=True, + include_cls_sep: bool = False, auto_truncate=True, **kwargs): + r""" + + :param ~fastNLP.Vocabulary vocab: 词表 + :model model: transformers包中的PreTrainedModel对象 + :param tokenizer: transformers包中的PreTrainedTokenizer对象 + :param str,list layers: 
输出embedding表示来自于哪些层,不同层的结果按照layers中的顺序在最后一维concat起来。以','隔开层数,层的序号是 + 从0开始,可以以负数去索引倒数几层。layer=0为embedding层(包括wordpiece embedding, position embedding) + :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces + 中计算得到它对应的表示。支持 ``last`` , ``first`` , ``avg`` , ``max``。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 + 会使得word embedding的结果比输入的结果长两个token。如果该值为True,则在使用 :class::StackEmbedding 可能会与其它类型的 + embedding长度不匹配。 + :param bool pooled_cls: 返回的是否使用预训练中的BertPool映射一下,仅在include_cls_sep时有效。如果下游任务只取做预测, + 一般该值为True。 + :param bool requires_grad: 是否需要gradient以更新Bert的权重。 + :param bool auto_truncate: 当句子words拆分为word pieces长度超过bert最大允许长度(一般为512), 自动截掉拆分后的超过510个 + word pieces后的内容,并将第512个word piece置为。超过长度的部分的encode结果直接全部置零。一般仅有只使用 + 来进行分类的任务将auto_truncate置为True。 + :param kwargs: + int min_freq: 小于该次数的词会被unk代替, 默认为1 + """ + super().__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + if word_dropout > 0: + assert vocab.unknown is not None, "When word_drop > 0, Vocabulary must contain the unknown token." + + self._word_sep_index = -100 + if tokenizer.sep_token in vocab: + self._word_sep_index = vocab[tokenizer.sep_token] + + self._word_cls_index = -100 + if tokenizer.cls_token in vocab: + self._word_cls_index = vocab[tokenizer.cls_token] + + min_freq = kwargs.get('min_freq', 1) + self._min_freq = min_freq + + self.model = _TransformersWordModel(tokenizer=tokenizer, model=model, vocab=vocab, layers=layers, + pool_method=pool_method, include_cls_sep=include_cls_sep, + auto_truncate=auto_truncate, min_freq=min_freq) + + self.requires_grad = requires_grad + self._embed_size = len(self.model.layers) * model.config.hidden_size + + def forward(self, words): + r""" + 计算words的roberta embedding表示。计算之前会在每句话的开始增加在结束增加, 并根据include_cls_sep判断要不要 + 删除这两个token的表示。 + + :param torch.LongTensor words: [batch_size, max_len] + :return: torch.FloatTensor. 
batch_size x max_len x (768*len(self.layers)) + """ + words = self.drop_word(words) + outputs = self._get_sent_reprs(words) + if outputs is not None: + return self.dropout(outputs) + outputs = self.model(words) + outputs = torch.cat([*outputs], dim=-1) + + return self.dropout(outputs) + + def drop_word(self, words): + r""" + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._word_pad_index) + mask = pad_mask.__and__(mask) # pad的位置不为unk + if self._word_sep_index!=-100: + not_sep_mask = words.ne(self._word_sep_index) + mask = mask.__and__(not_sep_mask) + if self._word_cls_index!=-100: + not_cls_mask = words.ne(self._word_cls_index) + mask = mask.__and__(not_cls_mask) + words = words.masked_fill(mask, self._word_unk_index) + return words + + def save(self, folder): + """ + 保存tokenizer和model到folder文件夹。model保存在`folder/{model_name}`, tokenizer在`folder/{tokenizer_name}`下 + :param str folder: 保存地址 + :return: + """ + os.makedirs(folder, exist_ok=True) + self.model.save(folder) + + +class TransformersWordPieceEncoder(nn.Module): + r""" + 读取roberta模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 + + RobertaWordPieceEncoder可以支持自动下载权重,当前支持的模型: + en: roberta-base + en-large: roberta-large + + """ + def __init__(self, model, tokenizer, layers: str = '-1', + word_dropout=0, dropout=0, requires_grad: bool = True, **kwargs): + r""" + + :param model: transformers的model + :param tokenizer: transformer的tokenizer + :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层。layer=0为embedding层(包括wordpiece embedding, + position embedding) + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool requires_grad: 是否需要gradient。 + """ + super().__init__() + + self.model = _WordPieceTransformersModel(model=model, tokenizer=tokenizer, layers=layers) + self._sep_index = self.model._sep_index + self._cls_index = self.model._cls_index + self._wordpiece_pad_index = self.model._wordpiece_pad_index + self._wordpiece_unk_index = self.model._wordpiece_unknown_index + self._embed_size = len(self.model.layers) * self.model.config.hidden_size + self.requires_grad = requires_grad + self.word_dropout = word_dropout + self.dropout_layer = nn.Dropout(dropout) + + @property + def embed_size(self): + return self._embed_size + + @property + def embedding_dim(self): + return self._embed_size + + @property + def num_embedding(self): + return self.model.encoder.config.vocab_size + + def index_datasets(self, *datasets, field_name, **kwargs): + r""" + 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 + bert的pad value。 + + :param ~fastNLP.DataSet datasets: DataSet对象 + :param str field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是raw_string的形式。 + :param kwargs: 传递给tokenizer的参数 + :return: + """ + self.model.index_datasets(*datasets, field_name=field_name, **kwargs) + + def forward(self, word_pieces, token_type_ids=None): + r""" + 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 + + :param words: batch_size x max_len + :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话. 如果不传入,则自动生成(大部分情况,都不需要输入), + 第一个[SEP]及之前为0, 第二个[SEP]及到第一个[SEP]之间为1; 第三个[SEP]及到第二个[SEP]之间为0,依次往后推。 + :return: torch.FloatTensor. 
batch_size x max_len x (768*len(self.layers)) + """ + word_pieces = self.drop_word(word_pieces) + outputs = self.model(word_pieces) + outputs = torch.cat([*outputs], dim=-1) + + return self.dropout_layer(outputs) + + def drop_word(self, words): + r""" + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + with torch.no_grad(): + not_sep_mask = words.ne(self._sep_index) + not_cls_mask = words.ne(self._cls_index) + replaceable_mask = not_sep_mask.__and__(not_cls_mask) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) + mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._wordpiece_pad_index) + mask = pad_mask.__and__(mask).__and__(replaceable_mask) # pad的位置不为unk + words = words.masked_fill(mask, self._wordpiece_unk_index) + return words + + def save(self, folder): + os.makedirs(folder, exist_ok=True) + self.model.save(os.path.join(folder, folder)) + logger.debug(f"TransformersWordPieceEncoder has been saved in {folder}") + + +class _TransformersWordModel(nn.Module): + def __init__(self, tokenizer, model, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + include_cls_sep: bool = False, auto_truncate: bool = False, min_freq=2): + super().__init__() + + self.tokenizer = tokenizer + self.encoder = model + self.config = model.config + self.only_last_layer = True + if not (isinstance(layers, str) and (layers=='-1' or int(layers)==self.encoder.config.num_hidden_layers)): + assert self.encoder.config.output_hidden_states == True, \ + f"You have to output all hidden states if you want to" \ + f" access the middle output of `{model.__class__.__name__}` " + self.only_last_layer = False + + self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2 + # 检查encoder_layer_number是否合理 + encoder_layer_number = len(self.encoder.encoder.layer) + self.encoder_layer_number = encoder_layer_number + if isinstance(layers, list): + self.layers = [int(l) for l in layers] + elif isinstance(layers, str): + self.layers = list(map(int, layers.split(','))) + else: + raise TypeError("`layers` only supports str or list[int]") + + for layer in self.layers: + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a {model.__class__.__name__} model with {encoder_layer_number} layers." + else: + assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a {model.__class__.__name__} model with {encoder_layer_number} layers." + + assert pool_method in ('avg', 'max', 'first', 'last') + self.pool_method = pool_method + self.include_cls_sep = include_cls_sep + self.auto_truncate = auto_truncate + + word_to_wordpieces = [] + word_pieces_lengths = [] + for word, index in vocab: + if index == vocab.padding_idx: # pad是个特殊的符号 + word = tokenizer.pad_token + elif index == vocab.unknown_idx: + word = tokenizer.unk_token + elif vocab.word_count[word] self._max_position_embeddings: + if self.auto_truncate: + word_pieces_lengths = word_pieces_lengths.masked_fill( + word_pieces_lengths + 2 > self._max_position_embeddings, self._max_position_embeddings - 2) + else: + raise RuntimeError( + "After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert. 
You can set " + f"`auto_truncate=True` for BertEmbedding to automatically truncate overlong input.") + + # +2是由于需要加入 + word_pieces = words.new_full((batch_size, min(max_word_piece_length + 2, self._max_position_embeddings)), + fill_value=self._wordpiece_pad_index) + attn_masks = torch.zeros_like(word_pieces) + # 1. 获取words的word_pieces的id,以及对应的span范围 + word_indexes = words.cpu().numpy() + for i in range(batch_size): + word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) + if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] + word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) + word_pieces[:, 0].fill_(self._cls_index) + batch_indexes = torch.arange(batch_size).to(words) + word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index + token_type_ids = torch.zeros_like(word_pieces) + # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 + # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] + all_outputs = self.encoder(input_ids=word_pieces, token_type_ids=token_type_ids, + attention_mask=attn_masks) + if not self.only_last_layer: + for _ in all_outputs: + if isinstance(_, (tuple, list)) and len(_)==self.encoder_layer_number: + bert_outputs = _ + break + else: + bert_outputs = all_outputs[:1] + # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size + + if self.include_cls_sep: + s_shift = 1 + outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, + bert_outputs[-1].size(-1)) + + else: + s_shift = 0 + outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len, + bert_outputs[-1].size(-1)) + batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1) + batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len + + if self.pool_method == 'first': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, :seq_len.max()] + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(max_word_piece_length), 0) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + elif self.pool_method == 'last': + batch_word_pieces_cum_length = batch_word_pieces_cum_length[:, 1:seq_len.max() + 1] - 1 + batch_word_pieces_cum_length.masked_fill_(batch_word_pieces_cum_length.ge(max_word_piece_length), 0) + _batch_indexes = batch_indexes[:, None].expand((batch_size, batch_word_pieces_cum_length.size(1))) + + for l_index, l in enumerate(self.layers): + output_layer = bert_outputs[l] + real_word_piece_length = output_layer.size(1) - 2 + if max_word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 + paddings = output_layer.new_zeros(batch_size, + max_word_piece_length - real_word_piece_length, + output_layer.size(2)) + output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() + # 从word_piece collapse到word的表示 + truncate_output_layer = output_layer[:, 1:-1] # 删除 batch_size x len x hidden_size + if self.pool_method == 'first': + tmp = truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(False), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1) + s_shift] = tmp + + elif self.pool_method == 'last': + tmp = 
truncate_output_layer[_batch_indexes, batch_word_pieces_cum_length] + tmp = tmp.masked_fill(word_mask[:, :batch_word_pieces_cum_length.size(1), None].eq(False), 0) + outputs[l_index, :, s_shift:batch_word_pieces_cum_length.size(1) + s_shift] = tmp + elif self.pool_method == 'max': + for i in range(batch_size): + for j in range(seq_len[i]): + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) + else: + for i in range(batch_size): + for j in range(seq_len[i]): + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) + if self.include_cls_sep: + outputs[l_index, :, 0] = output_layer[:, 0] + outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, word_pieces_lengths + s_shift] + + # 3. the final embedding result + return outputs + + def save(self, folder): + self.tokenizer.save_pretrained(folder) + self.encoder.save_pretrained(folder) + + +class _WordPieceTransformersModel(nn.Module): + def __init__(self, model, tokenizer, layers: str = '-1'): + super().__init__() + + self.tokenizer = tokenizer + self.encoder = model + self.config = self.encoder.config + # check that the requested layer indices are valid for this encoder + encoder_layer_number = len(self.encoder.encoder.layer) + self.only_last_layer = True + if not (isinstance(layers, str) and (layers=='-1' or int(layers)==self.encoder.config.num_hidden_layers)): + assert self.encoder.config.output_hidden_states == True, \ + f"You have to output all hidden states if you want to" \ + f" access the middle output of `{model.__class__.__name__}` " + self.only_last_layer = False + + if isinstance(layers, list): + self.layers = [int(l) for l in layers] + elif isinstance(layers, str): + self.layers = list(map(int, layers.split(','))) + else: + raise TypeError("`layers` only supports str or list[int]") + + for layer in self.layers: + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a {model.__class__.__name__} model with {encoder_layer_number} layers." + else: + assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a {model.__class__.__name__} model with {encoder_layer_number} layers."
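The `pool_method == 'first'` branch above collapses each word back to the hidden state of its first word piece by indexing with the cumulative word-piece lengths. A toy sketch of that gather, with made-up piece lengths and a hidden size of 4:

import torch

hidden = torch.arange(6 * 4, dtype=torch.float).view(1, 6, 4)    # batch x n_word_pieces x hidden
word_pieces_lengths = torch.tensor([[1, 3, 2]])                  # three words with 1, 3 and 2 pieces
cum_length = torch.zeros(1, 4, dtype=torch.long)
cum_length[:, 1:] = word_pieces_lengths.cumsum(dim=-1)           # tensor([[0, 1, 4, 6]])
first_piece_index = cum_length[:, :-1]                           # word i starts at piece cum_length[:, i]
batch_index = torch.arange(1)[:, None].expand(1, 3)
word_repr = hidden[batch_index, first_piece_index]               # batch x n_words x hidden
# word_repr[0, 0] is hidden[0, 0], word_repr[0, 1] is hidden[0, 1], word_repr[0, 2] is hidden[0, 4]

The 'last' branch does the same gather with `cum_length[:, 1:] - 1`, while 'max' and 'avg' fall back to per-word Python loops over each piece span.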
+ + self._cls_index = self.tokenizer.cls_token_id + self._sep_index = self.tokenizer.sep_token_id + self._wordpiece_pad_index = self.tokenizer.pad_token_id # 需要用于生成word_piece + self._wordpiece_unknown_index = self.tokenizer.unk_token_id + + def index_datasets(self, *datasets, field_name, **kwargs): + r""" + 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input。如果首尾不是 + [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。 + + :param datasets: DataSet对象 + :param field_name: 基于哪一列index + :param kwargs: 传递给tokenizer的参数 + :return: + """ + kwargs['add_special_tokens'] = kwargs.get('add_special_tokens', True) + + encode_func = partial(self.tokenizer.encode, **kwargs) + + for index, dataset in enumerate(datasets): + try: + dataset.apply_field(encode_func, field_name=field_name, new_field_name='word_pieces', + is_input=True) + dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) + except Exception as e: + logger.error(f"Exception happens when processing the {index} dataset.") + raise e + + def forward(self, word_pieces): + r""" + + :param word_pieces: torch.LongTensor, batch_size x max_len + :return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size + """ + batch_size, max_len = word_pieces.size() + + attn_masks = word_pieces.ne(self._wordpiece_pad_index) + all_outputs = self.encoder(word_pieces, token_type_ids=torch.zeros_like(word_pieces), + attention_mask=attn_masks) + if not self.only_last_layer: + for _ in all_outputs: + if isinstance(_, (tuple, list)) and len(_)==self.encoder_layer_number: + roberta_outputs = _ + break + else: + roberta_outputs = all_outputs[:1] + # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size + outputs = roberta_outputs[0].new_zeros((len(self.layers), batch_size, max_len, roberta_outputs[0].size(-1))) + for l_index, l in enumerate(self.layers): + roberta_output = roberta_outputs[l] + outputs[l_index] = roberta_output + return outputs + + def save(self, folder): + self.tokenizer.save_pretrained(folder) + self.encoder.save_pretrained(folder) diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index 10348324..e4df542d 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -71,8 +71,8 @@ class MLP(nn.Module): f"the length of activation function list except {len(size_layer) - 2} but got {len(activation)}!") self.hidden_active = [] for func in activation: - if callable(activation): - self.hidden_active.append(activation) + if callable(func): + self.hidden_active.append(func) elif func.lower() in actives: self.hidden_active.append(actives[func]) else: diff --git a/test/embeddings/test_roberta_embedding.py b/test/embeddings/test_roberta_embedding.py index 4cfc1ca3..7eba2644 100644 --- a/test/embeddings/test_roberta_embedding.py +++ b/test/embeddings/test_roberta_embedding.py @@ -214,16 +214,8 @@ class TestRobertaEmbedding(unittest.TestCase): result = embed(words) self.assertEqual(result.size(), (1, 4, 16)) - embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1, - only_use_pretrain_bpe=True) - embed.eval() - words = torch.LongTensor([[2, 3, 4, 1]]) - result = embed(words) - self.assertEqual(result.size(), (1, 4, 16)) - # 自动截断而不报错 - embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1, - only_use_pretrain_bpe=True, auto_truncate=True) + embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1, auto_truncate=True) 
words = torch.LongTensor([[2, 3, 4, 1]*10, [2, 3]+[0]*38]) result = embed(words) @@ -234,7 +226,7 @@ class TestRobertaEmbedding(unittest.TestCase): Embedding = RobertaEmbedding weight_path = 'test/data_for_tests/embedding/small_roberta' vocab = Vocabulary().add_word_lst("this is a texta and".split()) - embed1 = Embedding(vocab, model_dir_or_name=weight_path,layers=list(range(3)), + embed1 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)), only_use_pretrain_bpe=True, truncate_embed=True, min_freq=1) # embed_bpe_vocab_size = len(vocab)-1 + 2 # 排除NotInBERT, 额外加##a, [CLS] # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab)) diff --git a/test/embeddings/test_transformer_embedding.py b/test/embeddings/test_transformer_embedding.py new file mode 100644 index 00000000..2a3617d9 --- /dev/null +++ b/test/embeddings/test_transformer_embedding.py @@ -0,0 +1,38 @@ +import unittest + +import torch +import os + +from fastNLP import DataSet, Vocabulary +from fastNLP.embeddings.transformers_embedding import TransformersEmbedding, TransformersWordPieceEncoder + + +class TransformersEmbeddingTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_transformers_embedding_1(self): + from transformers import ElectraModel, ElectraTokenizer + weight_path = "google/electra-small-generator" + vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInRoberta".split()) + model = ElectraModel.from_pretrained(weight_path) + tokenizer = ElectraTokenizer.from_pretrained(weight_path) + + embed = TransformersEmbedding(vocab, model, tokenizer, word_dropout=0.1) + + words = torch.LongTensor([[2, 3, 4, 1]]) + result = embed(words) + self.assertEqual(result.size(), (1, 4, model.config.hidden_size)) + + +class TransformersWordPieceEncoderTest(unittest.TestCase): + @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") + def test_transformers_embedding_1(self): + from transformers import ElectraModel, ElectraTokenizer + weight_path = "google/electra-small-generator" + model = ElectraModel.from_pretrained(weight_path) + tokenizer = ElectraTokenizer.from_pretrained(weight_path) + encoder = TransformersWordPieceEncoder(model, tokenizer) + ds = DataSet({'words': ["this is a test . [SEP]".split()]}) + encoder.index_datasets(ds, field_name='words') + self.assertTrue(ds.has_field('word_pieces')) + result = encoder(torch.LongTensor([[1,2,3,4]])) + self.assertEqual(result.size(), (1, 4, model.config.hidden_size))
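The MLP fix in fastNLP/modules/decoder/mlp.py above (appending each `func` rather than the whole `activation` argument) is what lets a per-layer activation list mix registered names with arbitrary callables. A short sketch of the intended usage, assuming toy layer sizes:

import torch
from fastNLP.modules import MLP

# Two hidden layers, so the activation list needs exactly two entries; 'relu' is resolved
# through the `actives` table, torch.tanh goes through the `callable(func)` branch.
mlp = MLP([16, 32, 32, 4], activation=['relu', torch.tanh])
x = torch.randn(2, 16)
print(mlp(x).size())   # expected: torch.Size([2, 4])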