diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py
index c57d2bef..694e6b52 100644
--- a/fastNLP/embeddings/bert_embedding.py
+++ b/fastNLP/embeddings/bert_embedding.py
@@ -427,7 +427,7 @@ class _BertWordModel(nn.Module):
         self._sep_index = self.tokenzier.vocab['[SEP]']
         self._word_pad_index = vocab.padding_idx
         self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]']  # 需要用于生成word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
 
diff --git a/fastNLP/embeddings/gpt2_embedding.py b/fastNLP/embeddings/gpt2_embedding.py
index dc45b26b..a9ce3202 100644
--- a/fastNLP/embeddings/gpt2_embedding.py
+++ b/fastNLP/embeddings/gpt2_embedding.py
@@ -423,7 +423,7 @@ class _GPT2Model(nn.Module):
         self._word_pad_index = vocab.padding_idx
         self._endoftext_index = self.tokenzier.encoder.get('<|endoftext|>')
         self._wordpiece_pad_index = self.tokenzier.encoder.get('<|endoftext|>')  # 需要用于生成word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
 
diff --git a/fastNLP/embeddings/roberta_embedding.py b/fastNLP/embeddings/roberta_embedding.py
index ec95abe2..d982838d 100644
--- a/fastNLP/embeddings/roberta_embedding.py
+++ b/fastNLP/embeddings/roberta_embedding.py
@@ -251,7 +251,7 @@ class _RobertaWordModel(nn.Module):
         self._sep_index = self.tokenizer.encoder['</s>']
         self._word_pad_index = vocab.padding_idx
         self._wordpiece_pad_index = self.tokenizer.encoder['<pad>']  # 需要用于生成word_piece
-        self.word_to_wordpieces = np.array(word_to_wordpieces)
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)
         self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
         logger.debug("Successfully generate word pieces.")
 
diff --git a/fastNLP/embeddings/transformers_embedding.py b/fastNLP/embeddings/transformers_embedding.py
index 46282af7..4b15ea37 100644
--- a/fastNLP/embeddings/transformers_embedding.py
+++ b/fastNLP/embeddings/transformers_embedding.py
@@ -65,6 +65,8 @@ class TransformersEmbedding(ContextualEmbedding):
             来进行分类的任务将auto_truncate置为True。
         :param kwargs:
             int min_freq: 小于该次数的词会被unk代替, 默认为1
+            dict tokenizer_kwargs: 传递给tokenizer在调用tokenize()方法时所额外使用的参数,例如RoBERTaTokenizer需要传入
+                {'add_prefix_space':True}
         """
         super().__init__(vocab, word_dropout=word_dropout, dropout=dropout)
 
@@ -82,9 +84,10 @@ class TransformersEmbedding(ContextualEmbedding):
 
         min_freq = kwargs.get('min_freq', 1)
         self._min_freq = min_freq
+        tokenizer_kwargs = kwargs.get('tokenizer_kwargs', {})
         self.model = _TransformersWordModel(tokenizer=tokenizer, model=model, vocab=vocab, layers=layers,
-                                             pool_method=pool_method, include_cls_sep=include_cls_sep,
-                                             auto_truncate=auto_truncate, min_freq=min_freq)
+                                            pool_method=pool_method, include_cls_sep=include_cls_sep,
+                                            auto_truncate=auto_truncate, min_freq=min_freq, tokenizer_kwargs=tokenizer_kwargs)
 
         self.requires_grad = requires_grad
         self._embed_size = len(self.model.layers) * model.config.hidden_size
@@ -237,7 +240,7 @@ class TransformersWordPieceEncoder(nn.Module):
 
 class _TransformersWordModel(nn.Module):
     def __init__(self, tokenizer, model, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first',
-                 include_cls_sep: bool = False, auto_truncate: bool = False, min_freq=2):
+                 include_cls_sep: bool = False, auto_truncate: bool = False, min_freq=2, tokenizer_kwargs={}):
         super().__init__()
 
         self.tokenizer = tokenizer
@@ -283,7 +286,7 @@ class _TransformersWordModel(nn.Module):
                 word = tokenizer.unk_token
             elif vocab.word_count[word]