
update to avoid warning

tags/v1.0.0alpha
yh_cc committed 4 years ago · commit eaff7cafc6
4 changed files with 11 additions and 8 deletions:

  1. fastNLP/embeddings/bert_embedding.py  (+1 / -1)
  2. fastNLP/embeddings/gpt2_embedding.py  (+1 / -1)
  3. fastNLP/embeddings/roberta_embedding.py  (+1 / -1)
  4. fastNLP/embeddings/transformers_embedding.py  (+8 / -5)

fastNLP/embeddings/bert_embedding.py  (+1 / -1)

@@ -427,7 +427,7 @@ class _BertWordModel(nn.Module):
         self._sep_index = self.tokenzier.vocab['[SEP]']
         self._word_pad_index = vocab.padding_idx
         self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # needed for generating word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
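
Context for the dtype=object change (the same one-line fix lands in all four files): every word expands to a different number of word pieces, so word_to_wordpieces is a ragged list of lists. NumPy 1.19 and later deprecate building an ndarray from such ragged input without an explicit dtype, which is the warning this commit silences. A minimal reproduction sketch, with made-up word-piece ids:

import numpy as np

# Ragged mapping: each word yields a different number of word-piece ids.
word_to_wordpieces = [[1037, 2311], [1037], [1037, 2003, 1996]]

# Without an explicit dtype, NumPy >= 1.19 warns:
#   VisibleDeprecationWarning: Creating an ndarray from ragged nested
#   sequences ... is deprecated.
# arr = np.array(word_to_wordpieces)

# The commit's fix: request the object array explicitly, which is what was
# being created anyway and avoids the warning.
arr = np.array(word_to_wordpieces, dtype=object)
print(arr[2])  # [1037, 2003, 1996]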



fastNLP/embeddings/gpt2_embedding.py  (+1 / -1)

@@ -423,7 +423,7 @@ class _GPT2Model(nn.Module):
         self._word_pad_index = vocab.padding_idx
         self._endoftext_index = self.tokenzier.encoder.get('<|endoftext|>')
         self._wordpiece_pad_index = self.tokenzier.encoder.get('<|endoftext|>')  # needed for generating word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")



fastNLP/embeddings/roberta_embedding.py  (+1 / -1)

@@ -251,7 +251,7 @@ class _RobertaWordModel(nn.Module):
         self._sep_index = self.tokenizer.encoder['</s>']
         self._word_pad_index = vocab.padding_idx
         self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # needed for generating word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")



fastNLP/embeddings/transformers_embedding.py  (+8 / -5)

@@ -65,6 +65,8 @@ class TransformersEmbedding(ContextualEmbedding):
             for classification tasks, set auto_truncate to True.
         :param kwargs:
             int min_freq: words occurring fewer than this many times are replaced by unk, default 1
+            dict tokenizer_kwargs: extra arguments used when calling the tokenizer's tokenize() method,
+                e.g. RoBERTaTokenizer needs {'add_prefix_space': True}
         """
         super().__init__(vocab, word_dropout=word_dropout, dropout=dropout)

@@ -82,9 +84,10 @@ class TransformersEmbedding(ContextualEmbedding):
         min_freq = kwargs.get('min_freq', 1)
         self._min_freq = min_freq
 
+        tokenizer_kwargs = kwargs.get('tokenizer_kwargs', {})
         self.model = _TransformersWordModel(tokenizer=tokenizer, model=model, vocab=vocab, layers=layers,
-                                            pool_method=pool_method, include_cls_sep=include_cls_sep,
-                                            auto_truncate=auto_truncate, min_freq=min_freq)
+                                            pool_method=pool_method, include_cls_sep=include_cls_sep,
+                                            auto_truncate=auto_truncate, min_freq=min_freq, tokenizer_kwargs=tokenizer_kwargs)
 
         self.requires_grad = requires_grad
         self._embed_size = len(self.model.layers) * model.config.hidden_size
@@ -237,7 +240,7 @@ class TransformersWordPieceEncoder(nn.Module):
 
 class _TransformersWordModel(nn.Module):
     def __init__(self, tokenizer, model, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
-                 include_cls_sep: bool = False, auto_truncate: bool = False, min_freq=2):
+                 include_cls_sep: bool = False, auto_truncate: bool = False, min_freq=2, tokenizer_kwargs={}):
         super().__init__()
 
         self.tokenizer = tokenizer
@@ -283,7 +286,7 @@ class _TransformersWordModel(nn.Module):
                 word = tokenizer.unk_token
             elif vocab.word_count[word]<min_freq:
                 word = tokenizer.unk_token
-            word_pieces = self.tokenizer.tokenize(word, add_prefix_space=True)
+            word_pieces = self.tokenizer.tokenize(word, **tokenizer_kwargs)
             word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
             word_to_wordpieces.append(word_pieces)
            word_pieces_lengths.append(len(word_pieces))
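
Aside on why the hardcoded add_prefix_space=True had to go: it is a byte-level-BPE option (GPT-2/RoBERTa), and other tokenizers do not recognize it, so passing it unconditionally only made sense for RoBERTa. A hedged sketch against the transformers slow tokenizers (behavior assumed from transformers 4.x; exact outputs may vary by version):

from transformers import BertTokenizer, RobertaTokenizer

roberta = RobertaTokenizer.from_pretrained('roberta-base')
bert = BertTokenizer.from_pretrained('bert-base-uncased')

# RoBERTa's BPE marks word-initial tokens with a leading space, so the
# prefix-space hint changes the tokenization: typically ['Ġworld'].
print(roberta.tokenize('world', add_prefix_space=True))

# BERT's WordPiece tokenizer has no such concept; an unrecognized kwarg is
# at best ignored with a warning, so forcing it on every tokenizer was wrong.
print(bert.tokenize('world'))  # ['world']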
@@ -291,7 +294,7 @@ class _TransformersWordModel(nn.Module):
         self._sep_index = self.tokenizer.sep_token_id
         self._word_pad_index = vocab.padding_idx
         self._wordpiece_pad_index = self.tokenizer.pad_token_id  # needed for generating word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
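
With the change in place, callers opt into tokenizer-specific behavior through the new tokenizer_kwargs argument documented above. A usage sketch, assuming fastNLP's public import paths and this argument order (not shown in the diff):

from fastNLP import Vocabulary
from fastNLP.embeddings import TransformersEmbedding
from transformers import RobertaModel, RobertaTokenizer

vocab = Vocabulary().add_word_lst("this is a test".split())
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')

# RoBERTa wants the prefix-space hint; other models can simply omit
# tokenizer_kwargs, which now defaults to {} instead of a hardcoded
# add_prefix_space=True.
embed = TransformersEmbedding(vocab, tokenizer=tokenizer, model=model,
                              tokenizer_kwargs={'add_prefix_space': True})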


