
1. Update bert.py and fix a bug in bert_embedding to adapt to torch 1.2.0.
2. Update models/bert.py and add a BertForSentenceMatching model; a BertEmbedding parameter must now be passed to these five models.
3. Create a small BERT version for testing and modify test/models/test_bert.py.
4. Move the small GloVe and word2vec files to the data_for_tests/embedding/small_static_embedding dir and fix the relevant test code.
5. Delete some __init__.py files in the test dir.

tags/v0.4.10
Yige Xu, 5 years ago
commit 4440801dbf
17 changed files with 225 additions and 301 deletions:

1. fastNLP/embeddings/bert_embedding.py (+1, -1)
2. fastNLP/models/bert.py (+118, -255)
3. fastNLP/modules/encoder/bert.py (+2, -2)
4. test/__init__.py (+0, -3)
5. test/data_for_tests/embedding/small_bert/config.json (+13, -0)
6. test/data_for_tests/embedding/small_bert/small_pytorch_model.bin (BIN)
7. test/data_for_tests/embedding/small_bert/vocab.txt (+20, -0)
8. test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt (+0, -0)
9. test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt (+0, -0)
10. test/embeddings/__init__.py (+0, -0)
11. test/embeddings/test_bert_embedding.py (+10, -1)
12. test/embeddings/test_static_embedding.py (+4, -2)
13. test/io/test_embed_loader.py (+4, -4)
14. test/models/__init__.py (+0, -0)
15. test/models/test_bert.py (+53, -33)
16. test/modules/__init__.py (+0, -0)
17. test/modules/decoder/__init__.py (+0, -0)

fastNLP/embeddings/bert_embedding.py (+1, -1)

@@ -393,7 +393,7 @@ class _WordBertModel(nn.Module):
batch_indexes = torch.arange(batch_size).to(words)
word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index
if self._has_sep_in_vocab:  # token_type_ids are only needed when [SEP] actually appears in the vocab
sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len
sep_mask = word_pieces.eq(self._sep_index).long() # batch_size x max_len
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item():  # if the first position is odd, flip the result, since the sequence must start with segment id 0
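The one-line fix above adds `.long()` to the `[SEP]` mask: in torch 1.2.0 comparison ops such as `eq()` return bool tensors, which the subsequent `flip`/`cumsum` apparently cannot handle, so the mask is cast first. A minimal sketch of the segment-id trick, using a made-up two-sentence batch and a hypothetical `sep_index` of 3 rather than the real fastNLP internals:

```python
import torch

# toy word-piece ids: [CLS] w w [SEP] w w [SEP]  /  [CLS] w [SEP] w [SEP] pad pad
word_pieces = torch.LongTensor([[2, 7, 8, 3, 9, 10, 3],
                                [2, 7, 3, 9, 3, 0, 0]])
sep_index = 3  # hypothetical id of [SEP]

# .long() is the fix: eq() yields a bool mask that cumsum cannot be applied to
sep_mask = word_pieces.eq(sep_index).long()                        # batch_size x max_len
sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1])
token_type_ids = sep_mask_cumsum.fmod(2)
if token_type_ids[0, 0].item():            # ensure the first segment is labelled 0
    token_type_ids = token_type_ids.eq(0).long()

print(token_type_ids)
# tensor([[0, 0, 0, 0, 1, 1, 1],
#         [0, 0, 0, 1, 1, 0, 0]])
```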


fastNLP/models/bert.py (+118, -255)

@@ -5,253 +5,145 @@ bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed

__all__ = []

import os
import warnings

import torch
from torch import nn

from .base_model import BaseModel
from ..core.const import Const
from ..core.utils import seq_len_to_mask
from ..core._logger import logger
from ..modules.encoder import BertModel
from ..modules.encoder.bert import BertConfig, CONFIG_FILE
from ..embeddings.bert_embedding import BertEmbedding


class BertForSequenceClassification(BaseModel):
"""BERT model for classification.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(num_labels, config)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, num_labels, config=None, bert_dir=None):
def __init__(self, init_embed: BertEmbedding, num_labels: int=2):
super(BertForSequenceClassification, self).__init__()

self.num_labels = num_labels
if bert_dir is not None:
self.bert = BertModel.from_pretrained(bert_dir)
config = BertConfig(os.path.join(bert_dir, CONFIG_FILE))
else:
if config is None:
config = BertConfig(30522)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)

@classmethod
def from_pretrained(cls, num_labels, pretrained_model_dir):
config = BertConfig(pretrained_model_dir)
model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir)
return model

def forward(self, words, seq_len=None, target=None):
if seq_len is None:
seq_len = torch.ones_like(words, dtype=words.dtype, device=words.device)
if len(seq_len.size()) + 1 == len(words.size()):
seq_len = seq_len_to_mask(seq_len, max_len=words.size(-1))
_, pooled_output = self.bert(words, attention_mask=seq_len, output_all_encoded_layers=False)
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
self.bert = init_embed
self.dropout = nn.Dropout(0.1)
self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)

if not self.bert.model.include_cls_sep:
warn_msg = "Bert for sequence classification expects BertEmbedding `include_cls_sep` True, but got False."
logger.warn(warn_msg)
warnings.warn(warn_msg)

def forward(self, words):
hidden = self.dropout(self.bert(words))
cls_hidden = hidden[:, 0]
logits = self.classifier(cls_hidden)

return {Const.OUTPUT: logits}

def predict(self, words):
logits = self.forward(words)[Const.OUTPUT]
return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForSentenceMatching(BaseModel):

"""BERT model for matching.
"""
def __init__(self, init_embed: BertEmbedding, num_labels: int=2):
super(BertForSentenceMatching, self).__init__()
self.num_labels = num_labels
self.bert = init_embed
self.dropout = nn.Dropout(0.1)
self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)

if not self.bert.model.include_cls_sep:
error_msg = "Bert for sentence matching expects BertEmbedding `include_cls_sep` True, but got False."
logger.error(error_msg)
raise RuntimeError(error_msg)

if target is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(logits, target)
return {Const.OUTPUT: logits, Const.LOSS: loss}
else:
return {Const.OUTPUT: logits}
def forward(self, words):
hidden = self.dropout(self.bert(words))
cls_hidden = hidden[:, 0]
logits = self.classifier(cls_hidden)

def predict(self, words, seq_len=None):
logits = self.forward(words, seq_len=seq_len)[Const.OUTPUT]
return {Const.OUTPUT: logits}

def predict(self, words):
logits = self.forward(words)[Const.OUTPUT]
return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForMultipleChoice(BaseModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_choices`: the number of classes for the classifier. Default = 2.
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the token types indices selected in [0, 1]. Type 0 corresponds to a `sentence A`
and type 1 corresponds to a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, num_choices, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_choices = 2
model = BertForMultipleChoice(num_choices, config, bert_dir)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, num_choices, config=None, bert_dir=None):
def __init__(self, init_embed: BertEmbedding, num_choices=2):
super(BertForMultipleChoice, self).__init__()

self.num_choices = num_choices
if bert_dir is not None:
self.bert = BertModel.from_pretrained(bert_dir)
else:
if config is None:
config = BertConfig(30522)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)

@classmethod
def from_pretrained(cls, num_choices, pretrained_model_dir):
config = BertConfig(pretrained_model_dir)
model = cls(num_choices=num_choices, config=config, bert_dir=pretrained_model_dir)
return model

def forward(self, words, seq_len1=None, seq_len2=None, target=None):
input_ids, token_type_ids, attention_mask = words, seq_len1, seq_len2
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1))
_, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, output_all_encoded_layers=False)
self.bert = init_embed
self.dropout = nn.Dropout(0.1)
self.classifier = nn.Linear(self.bert.embedding_dim, 1)
self.include_cls_sep = init_embed.model.include_cls_sep

if not self.bert.model.include_cls_sep:
error_msg = "Bert for multiple choice expects BertEmbedding `include_cls_sep` True, but got False."
logger.error(error_msg)
raise RuntimeError(error_msg)

def forward(self, words):
"""
:param torch.Tensor words: [batch_size, num_choices, seq_len]
:return: [batch_size, num_labels]
"""
batch_size, num_choices, seq_len = words.size()

input_ids = words.view(batch_size * num_choices, seq_len)
hidden = self.bert(input_ids)
pooled_output = hidden[:, 0]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, self.num_choices)

if target is not None:
loss_fct = nn.CrossEntropyLoss()
loss = loss_fct(reshaped_logits, target)
return {Const.OUTPUT: reshaped_logits, Const.LOSS: loss}
else:
return {Const.OUTPUT: reshaped_logits}
return {Const.OUTPUT: reshaped_logits}

def predict(self, words, seq_len1=None, seq_len2=None,):
logits = self.forward(words, seq_len1=seq_len1, seq_len2=seq_len2)[Const.OUTPUT]
def predict(self, words):
logits = self.forward(words)[Const.OUTPUT]
return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForTokenClassification(BaseModel):
"""BERT model for token-level classification.
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
`config`: a BertConfig class instance with the configuration to build a new model.
`num_labels`: the number of classes for the classifier. Default = 2.
`bert_dir`: a dir which contains the bert parameters within file `pytorch_model.bin`
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`extract_features.py`, `run_classifier.py` and `run_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
a `sentence B` token (see BERT paper for more details).
`attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices
selected in [0, 1]. It's a mask to be used if the input sequence length is smaller than the max
input sequence length in the current batch. It's the mask that we typically use for attention when
a batch has varying length sentences.
`labels`: labels for the classification output: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [0, ..., num_labels].
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
bert_dir = 'your-bert-file-dir'
model = BertForTokenClassification(num_labels, config, bert_dir)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, num_labels, config=None, bert_dir=None):
def __init__(self, init_embed: BertEmbedding, num_labels):
super(BertForTokenClassification, self).__init__()

self.num_labels = num_labels
if bert_dir is not None:
self.bert = BertModel.from_pretrained(bert_dir)
else:
if config is None:
config = BertConfig(30522)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, num_labels)

@classmethod
def from_pretrained(cls, num_labels, pretrained_model_dir):
config = BertConfig(pretrained_model_dir)
model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir)
return model

def forward(self, words, seq_len1=None, seq_len2=None, target=None):
sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False)
self.bert = init_embed
self.dropout = nn.Dropout(0.1)
self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)
self.include_cls_sep = init_embed.model.include_cls_sep

if self.include_cls_sep:
warn_msg = "Bert for token classification expects BertEmbedding `include_cls_sep` False, but got True."
warnings.warn(warn_msg)
logger.warn(warn_msg)

def forward(self, words):
"""
:param torch.Tensor words: [batch_size, seq_len]
:return: [batch_size, seq_len, num_labels]
"""
sequence_output = self.bert(words)
if self.include_cls_sep:
sequence_output = sequence_output[:, 1: -1] # [batch_size, seq_len, embed_dim]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)

if target is not None:
loss_fct = nn.CrossEntropyLoss()
# Only keep active parts of the loss
if seq_len2 is not None:
active_loss = seq_len2.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)[active_loss]
active_labels = target.view(-1)[active_loss]
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), target.view(-1))
return {Const.OUTPUT: logits, Const.LOSS: loss}
else:
return {Const.OUTPUT: logits}

def predict(self, words, seq_len1=None, seq_len2=None):
logits = self.forward(words, seq_len1, seq_len2)[Const.OUTPUT]
return {Const.OUTPUT: logits}

def predict(self, words):
logits = self.forward(words)[Const.OUTPUT]
return {Const.OUTPUT: torch.argmax(logits, dim=-1)}
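With `include_cls_sep=True` the BertEmbedding output carries the `[CLS]` and `[SEP]` positions, which is why the sentence-level heads above read `hidden[:, 0]` while BertForTokenClassification slices those positions off with `sequence_output[:, 1:-1]`. A shape-only sketch, with random tensors standing in for the embedding output:

```python
import torch

batch_size, seq_len, embed_dim = 2, 3, 16

# include_cls_sep=True: two extra positions, [CLS] at the front and [SEP] at the end
hidden = torch.randn(batch_size, seq_len + 2, embed_dim)

cls_hidden = hidden[:, 0]          # used by the classification / matching heads
token_hidden = hidden[:, 1:-1]     # used by the token-level head

print(cls_hidden.shape)    # torch.Size([2, 16])
print(token_hidden.shape)  # torch.Size([2, 3, 16])
```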


@@ -298,53 +190,24 @@ class BertForQuestionAnswering(BaseModel):
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config=None, bert_dir=None):
def __init__(self, init_embed: BertEmbedding, num_labels=2):
super(BertForQuestionAnswering, self).__init__()
if bert_dir is not None:
self.bert = BertModel.from_pretrained(bert_dir)
else:
if config is None:
config = BertConfig(30522)
self.bert = BertModel(config)
# TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
# self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.qa_outputs = nn.Linear(config.hidden_size, 2)

@classmethod
def from_pretrained(cls, pretrained_model_dir):
config = BertConfig(pretrained_model_dir)
model = cls(config=config, bert_dir=pretrained_model_dir)
return model

def forward(self, words, seq_len1=None, seq_len2=None, target1=None, target2=None):
sequence_output, _ = self.bert(words, seq_len1, seq_len2, output_all_encoded_layers=False)
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)

if target1 is not None and target2 is not None:
# If we are on multi-GPU, split add a dimension
if len(target1.size()) > 1:
target1 = target1.squeeze(-1)
if len(target2.size()) > 1:
target2 = target2.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
target1.clamp_(0, ignored_index)
target2.clamp_(0, ignored_index)

loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, target1)
end_loss = loss_fct(end_logits, target2)
total_loss = (start_loss + end_loss) / 2
return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits, Const.LOSS: total_loss}
else:
return {Const.OUTPUTS(0): start_logits, Const.OUTPUTS(1): end_logits}

def predict(self, words, seq_len1=None, seq_len2=None):
logits = self.forward(words, seq_len1, seq_len2)
start_logits = logits[Const.OUTPUTS(0)]
end_logits = logits[Const.OUTPUTS(1)]
return {Const.OUTPUTS(0): torch.argmax(start_logits, dim=-1),
Const.OUTPUTS(1): torch.argmax(end_logits, dim=-1)}

self.bert = init_embed
self.num_labels = num_labels
self.qa_outputs = nn.Linear(self.bert.embedding_dim, self.num_labels)

if not self.bert.model.include_cls_sep:
error_msg = "Bert for question answering expects BertEmbedding `include_cls_sep` True, but got False."
logger.error(error_msg)
raise RuntimeError(error_msg)

def forward(self, words):
sequence_output = self.bert(words)
logits = self.qa_outputs(sequence_output) # [batch_size, seq_len, num_labels]

return {Const.OUTPUTS(i): logits[:, :, i] for i in range(self.num_labels)}

def predict(self, words):
logits = self.forward(words)
return {Const.OUTPUTS(i): torch.argmax(logits[Const.OUTPUTS(i)], dim=-1) for i in range(self.num_labels)}
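After this refactor all five models are built from a BertEmbedding instead of a BertConfig / bert_dir pair. A minimal usage sketch against the small test BERT added in this commit (the vocabulary and path mirror the new unit tests; any other vocabulary or BERT directory would work the same way):

```python
import torch

from fastNLP.core import Vocabulary, Const
from fastNLP.embeddings.bert_embedding import BertEmbedding
from fastNLP.models.bert import BertForSequenceClassification

# wrap a (tiny) pretrained BERT in a BertEmbedding over a word-level vocabulary
vocab = Vocabulary().add_word_lst("this is a test .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
                      include_cls_sep=True)

# the model now receives the embedding object, not a config or a directory
model = BertForSequenceClassification(embed, num_labels=2)

words = torch.LongTensor([[1, 2, 3], [5, 6, 0]])   # indices into `vocab`, 0 is padding
logits = model(words)[Const.OUTPUT]                # [batch_size, num_labels]
preds = model.predict(words)[Const.OUTPUT]         # [batch_size]
print(logits.shape, preds.shape)
```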

fastNLP/modules/encoder/bert.py (+2, -2)

@@ -435,14 +435,14 @@ class BertModel(nn.Module):
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir_or_name, *inputs, **kwargs):
def from_pretrained(cls, model_dir_or_name, *inputs, **kwargs):
state_dict = kwargs.get('state_dict', None)
kwargs.pop('state_dict', None)
kwargs.pop('cache_dir', None)
kwargs.pop('from_tf', None)

# get model dir from name or dir
pretrained_model_dir = _get_bert_dir(pretrained_model_dir_or_name)
pretrained_model_dir = _get_bert_dir(model_dir_or_name)

# Load config
config_file = _get_file_name_base_on_postfix(pretrained_model_dir, '.json')
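The keyword was renamed from `pretrained_model_dir_or_name` to `model_dir_or_name`, matching the name used by the embedding classes; `_get_bert_dir` still resolves either a registered model name or a local directory. A minimal call sketch using the small test model added in this commit:

```python
from fastNLP.modules.encoder.bert import BertModel

# a local directory (or a registered model name) is resolved by _get_bert_dir
bert = BertModel.from_pretrained(model_dir_or_name='test/data_for_tests/embedding/small_bert')
```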


test/__init__.py (+0, -3)

@@ -1,3 +0,0 @@
import fastNLP

__all__ = ["fastNLP"]

test/data_for_tests/embedding/small_bert/config.json (+13, -0)

@@ -0,0 +1,13 @@
{
"attention_probs_dropout_prob": 0.1,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 16,
"initializer_range": 0.02,
"intermediate_size": 64,
"max_position_embeddings": 32,
"num_attention_heads": 4,
"num_hidden_layers": 2,
"type_vocab_size": 2,
"vocab_size": 20
}
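Every dimension in this config is kept tiny (2 layers, hidden size 16, 4 attention heads, a 20-piece vocabulary), so the accompanying checkpoint stays small enough to ship with the test data. A quick way to inspect it, reading the file as plain JSON:

```python
import json

with open('test/data_for_tests/embedding/small_bert/config.json') as f:
    cfg = json.load(f)

print(cfg['num_hidden_layers'], cfg['hidden_size'], cfg['vocab_size'])  # 2 16 20
```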

test/data_for_tests/embedding/small_bert/small_pytorch_model.bin (BIN)


test/data_for_tests/embedding/small_bert/vocab.txt (+20, -0)

@@ -0,0 +1,20 @@
[PAD]
[UNK]
[CLS]
[SEP]
this
is
a
small
bert
model
vocab
file
and
only
twenty
line
for
the
whole
text
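In the standard BERT vocab format the id of a word piece is simply its line number, so `[PAD]`=0, `[UNK]`=1, `[CLS]`=2, `[SEP]`=3 and the sixteen real tokens follow. A tiny reader (a hypothetical helper, not part of fastNLP) that makes the layout explicit:

```python
vocab_path = 'test/data_for_tests/embedding/small_bert/vocab.txt'
with open(vocab_path, encoding='utf-8') as f:
    token_to_id = {line.strip(): idx for idx, line in enumerate(f) if line.strip()}

print(token_to_id['[CLS]'], token_to_id['[SEP]'], token_to_id['this'])  # 2 3 4
```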

test/data_for_tests/glove.6B.50d_test.txt → test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt


test/data_for_tests/word2vec_test.txt → test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt


test/embeddings/__init__.py (+0, -0)


test/embeddings/test_bert_embedding.py (+10, -1)

@@ -18,4 +18,13 @@ class TestDownload(unittest.TestCase):
embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2)
for i in range(10):
words = torch.LongTensor([[2, 3, 4, 0]])
print(embed(words).size())
print(embed(words).size())


class TestBertEmbedding(unittest.TestCase):
def test_bert_embedding_1(self):
vocab = Vocabulary().add_word_lst("this is a test .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert')
words = torch.LongTensor([[2, 3, 4, 0]])
result = embed(words)
self.assertEqual(result.size(), (1, 4, 16))
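The asserted shape (1, 4, 16) follows from the fixture: one sentence of four word indices, the hidden size 16 from the small config, and no extra [CLS]/[SEP] columns since `include_cls_sep` is not requested. The indices themselves are positions in the fastNLP Vocabulary, not BERT word-piece ids; a small sketch, assuming the default padding/unknown slots at 0 and 1:

```python
from fastNLP.core import Vocabulary

vocab = Vocabulary().add_word_lst("this is a test .".split())
# index 0 is the padding token and 1 the unknown token by default, so the
# LongTensor([[2, 3, 4, 0]]) used above stands for "this is a <pad>"
print([vocab.to_word(i) for i in [2, 3, 4, 0]])
```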

test/embeddings/test_static_embedding.py (+4, -2)

@@ -10,7 +10,8 @@ class TestLoad(unittest.TestCase):
def test_norm1(self):
# test that only the vectors found in the file are normalized
vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/glove.6B.50d_test.txt',
embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
'glove.6B.50d_test.txt',
only_norm_found_vector=True)
self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
self.assertNotEqual(torch.norm(embed(torch.LongTensor([[4]]))).item(), 1)
@@ -18,7 +19,8 @@ class TestLoad(unittest.TestCase):
def test_norm2(self):
# test that all vectors are normalized
vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/glove.6B.50d_test.txt',
embed = StaticEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_static_embedding/'
'glove.6B.50d_test.txt',
normalize=True)
self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1)
self.assertEqual(round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4), 1)
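The two tests pin down the difference between the flags: `only_norm_found_vector=True` L2-normalizes only the vectors actually found in the GloVe file, leaving randomly initialized ones such as 'notinfile' untouched, while `normalize=True` normalizes every row. A minimal sketch of the property being asserted, with made-up vectors instead of a real StaticEmbedding:

```python
import torch
import torch.nn.functional as F

found = torch.randn(50)        # stands in for a word present in glove.6B.50d_test.txt
not_found = torch.randn(50)    # stands in for 'notinfile'

# only_norm_found_vector=True: only the found vector ends up with unit norm
print(round(F.normalize(found, dim=0).norm().item(), 4))   # 1.0
print(round(not_found.norm().item(), 4))                   # almost surely not 1.0

# normalize=True would normalize both rows of the embedding matrix instead
```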


test/io/test_embed_loader.py (+4, -4)

@@ -8,8 +8,8 @@ from fastNLP.io import EmbedLoader
class TestEmbedLoader(unittest.TestCase):
def test_load_with_vocab(self):
vocab = Vocabulary()
glove = "test/data_for_tests/glove.6B.50d_test.txt"
word2vec = "test/data_for_tests/word2vec_test.txt"
glove = "test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt"
word2vec = "test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt"
vocab.add_word('the')
vocab.add_word('none')
g_m = EmbedLoader.load_with_vocab(glove, vocab)
@@ -20,8 +20,8 @@ class TestEmbedLoader(unittest.TestCase):
def test_load_without_vocab(self):
words = ['the', 'of', 'in', 'a', 'to', 'and']
glove = "test/data_for_tests/glove.6B.50d_test.txt"
word2vec = "test/data_for_tests/word2vec_test.txt"
glove = "test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt"
word2vec = "test/data_for_tests/embedding/small_static_embedding/word2vec_test.txt"
g_m, vocab = EmbedLoader.load_without_vocab(glove)
self.assertEqual(g_m.shape, (8, 50))
for word in words:
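Only the fixture paths change here; for reference, a minimal load against the new location, mirroring the assertions above:

```python
from fastNLP.io import EmbedLoader

glove = "test/data_for_tests/embedding/small_static_embedding/glove.6B.50d_test.txt"

# load_without_vocab builds the vocabulary from the embedding file itself
matrix, vocab = EmbedLoader.load_without_vocab(glove)
print(matrix.shape)   # (8, 50) for the small test file, as asserted above
```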


test/models/__init__.py (+0, -0)


test/models/test_bert.py (+53, -33)

@@ -2,74 +2,94 @@ import unittest

import torch

from fastNLP.core import Vocabulary, Const
from fastNLP.models.bert import BertForSequenceClassification, BertForQuestionAnswering, \
BertForTokenClassification, BertForMultipleChoice
BertForTokenClassification, BertForMultipleChoice, BertForSentenceMatching
from fastNLP.embeddings.bert_embedding import BertEmbedding


class TestBert(unittest.TestCase):
def test_bert_1(self):
from fastNLP.core.const import Const
from fastNLP.modules.encoder.bert import BertConfig
vocab = Vocabulary().add_word_lst("this is a test .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
include_cls_sep=True)

model = BertForSequenceClassification(2, BertConfig(32000))
model = BertForSequenceClassification(embed, 2)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
input_ids = torch.LongTensor([[1, 2, 3], [5, 6, 0]])

pred = model(input_ids, input_mask)
pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUT in pred)
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2))

input_mask = torch.LongTensor([3, 2])
pred = model(input_ids, input_mask)
pred = model.predict(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUT in pred)
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2))
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2,))

def test_bert_2(self):
from fastNLP.core.const import Const
from fastNLP.modules.encoder.bert import BertConfig

model = BertForMultipleChoice(2, BertConfig(32000))
vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
include_cls_sep=True)

model = BertForMultipleChoice(embed, 2)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_ids = torch.LongTensor([[[2, 6, 7], [1, 6, 5]]])
print(input_ids.size())

pred = model(input_ids, token_type_ids, input_mask)
pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUT in pred)
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (1, 2))

def test_bert_3(self):
from fastNLP.core.const import Const
from fastNLP.modules.encoder.bert import BertConfig

model = BertForTokenClassification(7, BertConfig(32000))
vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
include_cls_sep=False)
model = BertForTokenClassification(embed, 7)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

pred = model(input_ids, token_type_ids, input_mask)
pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUT in pred)
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 3, 7))

def test_bert_4(self):
from fastNLP.core.const import Const
from fastNLP.modules.encoder.bert import BertConfig

model = BertForQuestionAnswering(BertConfig(32000))
vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
include_cls_sep=True)
model = BertForQuestionAnswering(embed)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

pred = model(input_ids, token_type_ids, input_mask)
pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUTS(0) in pred)
self.assertTrue(Const.OUTPUTS(1) in pred)
self.assertEqual(tuple(pred[Const.OUTPUTS(0)].shape), (2, 3))
self.assertEqual(tuple(pred[Const.OUTPUTS(1)].shape), (2, 3))
self.assertEqual(tuple(pred[Const.OUTPUTS(0)].shape), (2, 5))
self.assertEqual(tuple(pred[Const.OUTPUTS(1)].shape), (2, 5))

model = BertForQuestionAnswering(embed, 7)
pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertEqual(len(pred), 7)

def test_bert_5(self):

vocab = Vocabulary().add_word_lst("this is a test [SEP] .".split())
embed = BertEmbedding(vocab, model_dir_or_name='test/data_for_tests/embedding/small_bert',
include_cls_sep=True)
model = BertForSentenceMatching(embed)

input_ids = torch.LongTensor([[1, 2, 3], [6, 5, 0]])

pred = model(input_ids)
self.assertTrue(isinstance(pred, dict))
self.assertTrue(Const.OUTPUT in pred)
self.assertEqual(tuple(pred[Const.OUTPUT].shape), (2, 2))
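test_bert_2 feeds a [batch_size=1, num_choices=2, seq_len=3] tensor and gets back [1, 2] scores, which is exactly the flatten-then-reshape path in BertForMultipleChoice above. A shape walk-through with random tensors standing in for the BERT forward pass:

```python
import torch

batch_size, num_choices, seq_len, embed_dim = 1, 2, 3, 16

words = torch.randint(0, 7, (batch_size, num_choices, seq_len))
flat = words.view(batch_size * num_choices, seq_len)      # [2, 3], what the embedding sees
cls_vectors = torch.randn(flat.size(0), embed_dim)        # stand-in for hidden[:, 0]
logits = torch.nn.Linear(embed_dim, 1)(cls_vectors)       # one score per (sample, choice)
reshaped = logits.view(-1, num_choices)                   # back to [batch_size, num_choices]
print(reshaped.shape)                                     # torch.Size([1, 2])
```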


test/modules/__init__.py (+0, -0)


test/modules/decoder/__init__.py (+0, -0)

