|
|
"""
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.

"""

__all__ = []

import warnings

import torch
from torch import nn

from .base_model import BaseModel
from ..core.const import Const
from ..core._logger import logger
from ..embeddings.bert_embedding import BertEmbedding


class BertForSequenceClassification(BaseModel):
    """BERT model for sequence classification.

    This module is composed of a BertEmbedding (the BERT encoder) with a linear
    layer on top of the hidden state of the [CLS] token.

    Params:
        `init_embed`: a BertEmbedding instance. It must be built with
            `include_cls_sep=True` so that the [CLS] position is part of its output.
        `num_labels`: the number of classes for the classifier. Default = 2.

    Inputs:
        `words`: a torch.LongTensor of shape [batch_size, sequence_length] with the
            word indices in the vocabulary of `init_embed` (the embedding converts
            them to WordPiece ids internally).

    Outputs:
        A dict with key `Const.OUTPUT` holding the classification logits of shape
        [batch_size, num_labels].
|
|
|
    def __init__(self, init_embed: BertEmbedding, num_labels: int=2):
        super(BertForSequenceClassification, self).__init__()

        self.num_labels = num_labels
        self.bert = init_embed
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)

        if not self.bert.model.include_cls_sep:
            warn_msg = "Bert for sequence classification expects BertEmbedding `include_cls_sep` True, but got False."
            logger.warn(warn_msg)
            warnings.warn(warn_msg)

    def forward(self, words):
        hidden = self.dropout(self.bert(words))
        cls_hidden = hidden[:, 0]
        logits = self.classifier(cls_hidden)

        return {Const.OUTPUT: logits}

    def predict(self, words):
        logits = self.forward(words)[Const.OUTPUT]
        return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForSentenceMatching(BaseModel):
    """BERT model for sentence matching.

    This module is composed of a BertEmbedding (the BERT encoder) with a linear
    layer on top of the hidden state of the [CLS] token; the two sentences to be
    matched are fed in as one concatenated `words` sequence.

    Outputs:
        A dict with key `Const.OUTPUT` holding the matching logits of shape
        [batch_size, num_labels].
|
|
|
    def __init__(self, init_embed: BertEmbedding, num_labels: int=2):
        super(BertForSentenceMatching, self).__init__()

        self.num_labels = num_labels
        self.bert = init_embed
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)

        if not self.bert.model.include_cls_sep:
            error_msg = "Bert for sentence matching expects BertEmbedding `include_cls_sep` True, but got False."
            logger.error(error_msg)
            raise RuntimeError(error_msg)

    def forward(self, words):
        hidden = self.dropout(self.bert(words))
        cls_hidden = hidden[:, 0]
        logits = self.classifier(cls_hidden)

        return {Const.OUTPUT: logits}

    def predict(self, words):
        logits = self.forward(words)[Const.OUTPUT]
        return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForMultipleChoice(BaseModel):
    """BERT model for multiple choice tasks.

    This module is composed of a BertEmbedding (the BERT encoder) with a linear
    layer on top of the hidden state of the [CLS] token: each choice is encoded
    separately and scored with a single output unit, and the scores of all
    choices of an example form the logits.

    Params:
        `init_embed`: a BertEmbedding instance built with `include_cls_sep=True`.
        `num_choices`: the number of choices per example. Default = 2.

    Inputs:
        `words`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
            with the word indices in the vocabulary of `init_embed`.

    Outputs:
        A dict with key `Const.OUTPUT` holding the classification logits of shape
        [batch_size, num_choices].
|
|
|
    def __init__(self, init_embed: BertEmbedding, num_choices=2):
        super(BertForMultipleChoice, self).__init__()

        self.num_choices = num_choices
        self.bert = init_embed
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.embedding_dim, 1)
        self.include_cls_sep = init_embed.model.include_cls_sep

        if not self.bert.model.include_cls_sep:
            error_msg = "Bert for multiple choice expects BertEmbedding `include_cls_sep` True, but got False."
            logger.error(error_msg)
            raise RuntimeError(error_msg)

    def forward(self, words):
        """
        :param torch.Tensor words: [batch_size, num_choices, seq_len]
        :return: [batch_size, num_choices]
        """
        batch_size, num_choices, seq_len = words.size()

        input_ids = words.view(batch_size * num_choices, seq_len)
        hidden = self.bert(input_ids)
        pooled_output = hidden[:, 0]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        reshaped_logits = logits.view(-1, self.num_choices)

        return {Const.OUTPUT: reshaped_logits}

    def predict(self, words):
        logits = self.forward(words)[Const.OUTPUT]
        return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForTokenClassification(BaseModel):
    """BERT model for token-level classification.

    This module is composed of a BertEmbedding (the BERT encoder) with a linear
    layer on top of the hidden state of every token.

    Params:
        `init_embed`: a BertEmbedding instance. It should be built with
            `include_cls_sep=False`; if it includes [CLS]/[SEP], those two positions
            are cut off so that the output stays aligned with the input tokens.
        `num_labels`: the number of classes for the per-token classifier.

    Inputs:
        `words`: a torch.LongTensor of shape [batch_size, sequence_length] with the
            word indices in the vocabulary of `init_embed`.

    Outputs:
        A dict with key `Const.OUTPUT` holding the classification logits of shape
        [batch_size, sequence_length, num_labels].
|
|
|
    def __init__(self, init_embed: BertEmbedding, num_labels):
        super(BertForTokenClassification, self).__init__()

        self.num_labels = num_labels
        self.bert = init_embed
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bert.embedding_dim, num_labels)
        self.include_cls_sep = init_embed.model.include_cls_sep

        if self.include_cls_sep:
            warn_msg = "Bert for token classification expects BertEmbedding `include_cls_sep` False, but got True."
            warnings.warn(warn_msg)
            logger.warn(warn_msg)

    def forward(self, words):
        """
        :param torch.Tensor words: [batch_size, seq_len]
        :return: [batch_size, seq_len, num_labels]
        """
        sequence_output = self.bert(words)
        if self.include_cls_sep:
            sequence_output = sequence_output[:, 1: -1]  # [batch_size, seq_len, embed_dim]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        return {Const.OUTPUT: logits}

    def predict(self, words):
        logits = self.forward(words)[Const.OUTPUT]
        return {Const.OUTPUT: torch.argmax(logits, dim=-1)}


class BertForQuestionAnswering(BaseModel):
    """BERT model for question answering (span extraction).

    This module is composed of a BertEmbedding (the BERT encoder) with a linear
    layer of size `num_labels` on top of the sequence output. With the default
    `num_labels=2`, `Const.OUTPUTS(0)` holds the start logits and `Const.OUTPUTS(1)`
    the end logits of the answer span.
|
|
|
    def __init__(self, init_embed: BertEmbedding, num_labels=2):
        super(BertForQuestionAnswering, self).__init__()

        self.bert = init_embed
        self.num_labels = num_labels
        self.qa_outputs = nn.Linear(self.bert.embedding_dim, self.num_labels)

        if not self.bert.model.include_cls_sep:
            error_msg = "Bert for question answering expects BertEmbedding `include_cls_sep` True, but got False."
            logger.error(error_msg)
            raise RuntimeError(error_msg)

    def forward(self, words):
        sequence_output = self.bert(words)
        logits = self.qa_outputs(sequence_output)  # [batch_size, seq_len, num_labels]

        return {Const.OUTPUTS(i): logits[:, :, i] for i in range(self.num_labels)}

    def predict(self, words):
        logits = self.forward(words)
        return {Const.OUTPUTS(i): torch.argmax(logits[Const.OUTPUTS(i)], dim=-1) for i in range(self.num_labels)}