diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py
index 264283ac..7ef4736b 100644
--- a/fastNLP/embeddings/stack_embedding.py
+++ b/fastNLP/embeddings/stack_embedding.py
@@ -13,6 +13,7 @@ import torch
 from torch import nn as nn
 
 from .embedding import TokenEmbedding
+from .utils import _check_vocab_has_same_index
 
 
 class StackEmbedding(TokenEmbedding):
@@ -44,8 +45,9 @@ class StackEmbedding(TokenEmbedding):
             vocabs.append(embed.get_word_vocab())
         _vocab = vocabs[0]
         for vocab in vocabs[1:]:
-            assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary."
-
+            if _vocab!=vocab:
+                _check_vocab_has_same_index(_vocab, vocab)
+
         super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout)
         assert isinstance(embeds, list)
         for embed in embeds:
@@ -60,6 +62,7 @@ class StackEmbedding(TokenEmbedding):
         :return:
         """
         assert isinstance(embed, TokenEmbedding)
+        _check_vocab_has_same_index(self.get_word_vocab(), embed.get_word_vocab())
         self._embed_size += embed.embed_size
         self.embeds.append(embed)
         return self
diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py
index 2f57af47..81c0fe42 100644
--- a/fastNLP/embeddings/static_embedding.py
+++ b/fastNLP/embeddings/static_embedding.py
@@ -81,7 +81,7 @@ class StaticEmbedding(TokenEmbedding):
                  init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs):
         r"""
 
-        :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。
+        :param Vocabulary vocab: 词表. StaticEmbedding只会加载包含在词表中的词的词向量,在预训练向量中没找到的使用随机初始化
         :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding文件夹(文件夹下应该只有一个
             以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。
             如果输入为None则使用embedding_dim的维度随机初始化一个embedding。
diff --git a/fastNLP/embeddings/utils.py b/fastNLP/embeddings/utils.py
index cec015e0..9a18bfe3 100644
--- a/fastNLP/embeddings/utils.py
+++ b/fastNLP/embeddings/utils.py
@@ -89,3 +89,16 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
 
     return torch.FloatTensor(sinusoid_table)
 
+
+def _check_vocab_has_same_index(vocab, other_vocab):
+    """
+    检查两个vocabulary是否含有相同的word idx
+
+    :param Vocabulary vocab:
+    :param Vocabulary other_vocab:
+    :return:
+    """
+    if other_vocab != vocab:
+        for word, word_ix in vocab:
+            other_word_idx = other_vocab.to_index(word)
+            assert other_word_idx == word_ix, f"Word {word} has different index in vocabs, {word_ix} Vs. {other_word_idx}."
\ No newline at end of file
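The relaxed check above only requires that the two vocabularies map every word to the same index. A minimal usage sketch of the new helper (illustration only, not part of the patch; the toy word lists are made up):

import torch
from fastNLP import Vocabulary
from fastNLP.embeddings.utils import _check_vocab_has_same_index

# Two vocabularies built from the same word list get identical word->index maps,
# so the check passes even though they are distinct Vocabulary objects.
vocab_a = Vocabulary().add_word_lst(["hello", "world"])
vocab_b = Vocabulary().add_word_lst(["hello", "world"])
_check_vocab_has_same_index(vocab_a, vocab_b)

# A different insertion order gives different indices, so the assert inside the helper fires.
vocab_c = Vocabulary().add_word_lst(["world", "hello"])
try:
    _check_vocab_has_same_index(vocab_a, vocab_c)
except AssertionError as err:
    print(err)
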
diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py
index 80fffb30..f83f768f 100644
--- a/fastNLP/models/base_model.py
+++ b/fastNLP/models/base_model.py
@@ -34,56 +34,3 @@ class NaiveClassifier(BaseModel):
 
     def predict(self, x):
         return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
-
-
-class NaiveClassifier2(BaseModel):
-    r"""
-    一个简单的分类器例子,可用于各种测试
-    """
-
-    def __init__(self, in_feature_dim, out_feature_dim):
-        super(NaiveClassifier2, self).__init__()
-        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
-
-    def forward(self, x):
-        return {"predict": self.mlp(x)}
-
-    def predict(self, x):
-        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
-
-
-class NaiveClassifier3(BaseModel):
-    r"""
-    一个简单的分类器例子,可用于各种测试
-    """
-
-    def __init__(self, in_feature_dim, out_feature_dim):
-        super(NaiveClassifier3, self).__init__()
-        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
-
-    @torch.cuda.amp.autocast()
-    def forward(self, x):
-        return {"predict": self.mlp(x)}
-
-    @torch.cuda.amp.autocast()
-    def predict(self, x):
-        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
-
-
-class NaiveClassifier4(BaseModel):
-    r"""
-    一个简单的分类器例子,可用于各种测试
-    """
-
-    def __init__(self, in_feature_dim, out_feature_dim):
-        super(NaiveClassifier4, self).__init__()
-        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
-
-    def forward(self, x):
-        with torch.cuda.amp.autocast():
-            return {"predict": self.mlp(x)}
-
-
-    def predict(self, x):
-        with torch.cuda.amp.autocast():
-            return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
\ No newline at end of file
diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py
index 8d5d576e..7245b577 100644
--- a/fastNLP/modules/encoder/bert.py
+++ b/fastNLP/modules/encoder/bert.py
@@ -477,7 +477,8 @@ class BertModel(nn.Module):
         if isinstance(module, nn.Linear) and module.bias is not None:
             module.bias.data.zero_()
 
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True,
+                position_ids=None):
         """
 
         :param torch.LongTensor input_ids: bsz x max_len的输入id
@@ -485,6 +486,7 @@ class BertModel(nn.Module):
         :param attention_mask: 需要attend的为1,不需要为0
         :param bool output_all_encoded_layers: 是否输出所有层,默认输出token embedding(包含bpe, position以及type embedding)
             及每一层的hidden states。如果为False,只输出最后一层的结果
+        :param torch.LongTensor position_ids: bsz x max_len, position的id
         :return: encode_layers: 如果output_all_encoded_layers为True,返回list(共num_layers+1个元素),每个元素为
             bsz x max_len x hidden_size否则返回bsz x max_len x hidden_size的tensor;
             pooled_output: bsz x hidden_size为cls的表示,可以用于句子的分类
@@ -506,10 +508,11 @@ class BertModel(nn.Module):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        # this will cause an issue when using DataParallel: https://github.com/pytorch/pytorch/issues/40457#issuecomment-648396469
+        # extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
-        embedding_output = self.embeddings(input_ids, token_type_ids)
+        embedding_output = self.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
         encoded_layers = self.encoder(embedding_output,
                                       extended_attention_mask,
                                       output_all_encoded_layers=output_all_encoded_layers)
diff --git a/fastNLP/modules/encoder/gpt2.py b/fastNLP/modules/encoder/gpt2.py
index e534fa5c..076d98eb 100644
--- a/fastNLP/modules/encoder/gpt2.py
+++ b/fastNLP/modules/encoder/gpt2.py
@@ -834,7 +834,8 @@ class GPT2Model(GPT2PreTrainedModel):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        # this will cause an issue when using DataParallel: https://github.com/pytorch/pytorch/issues/40457#issuecomment-648396469
+        # attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         attention_mask = (1.0 - attention_mask) * -10000.0
         # attention_mask = attention_mask.masked_fill(attention_mask.eq(0), -10000.0)
 
diff --git a/fastNLP/modules/encoder/roberta.py b/fastNLP/modules/encoder/roberta.py
index 10bdb64b..aab89efc 100644
--- a/fastNLP/modules/encoder/roberta.py
+++ b/fastNLP/modules/encoder/roberta.py
@@ -39,7 +39,7 @@ class RobertaEmbeddings(BertEmbeddings):
             config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
         )
 
-    def forward(self, input_ids, token_type_ids, words_embeddings=None):
+    def forward(self, input_ids, token_type_ids, words_embeddings=None, **kwargs):
         position_ids = self.create_position_ids_from_input_ids(input_ids)
 
         return super().forward(
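A sketch of how the new position_ids argument can be supplied to BertModel.forward (illustration only, not part of the patch; it assumes BertModel.from_pretrained accepts the folder path of the small test checkpoint used by the tests further down):

import torch
from fastNLP.modules.encoder.bert import BertModel

# Load the tiny BERT checkpoint shipped with the test suite (assumed path).
model = BertModel.from_pretrained('tests/data_for_tests/embedding/small_bert')

input_ids = torch.LongTensor([[2, 3, 4, 0]])                  # bsz x max_len
position_ids = torch.arange(input_ids.size(1)).unsqueeze(0)   # bsz x max_len, explicit position ids

# forward now accepts position_ids and passes it on to BertEmbeddings.
encoded_layers, pooled_output = model(input_ids, position_ids=position_ids)
print(encoded_layers[-1].size(), pooled_output.size())
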
diff --git a/tests/core/test_trainer.py b/tests/core/test_trainer.py
index d0d462da..f9a7ae42 100644
--- a/tests/core/test_trainer.py
+++ b/tests/core/test_trainer.py
@@ -14,8 +14,12 @@ from fastNLP import CrossEntropyLoss
 from fastNLP import AccuracyMetric
 from fastNLP import SGD
 from fastNLP import Trainer
-from fastNLP.models.base_model import NaiveClassifier, NaiveClassifier2, NaiveClassifier3, NaiveClassifier4
+from fastNLP.models.base_model import NaiveClassifier
 from fastNLP import TorchLoaderIter
+from fastNLP.models import BaseModel
+from fastNLP.modules import MLP
+from pkg_resources import parse_version
+
 
 
 def prepare_fake_dataset():
@@ -577,6 +581,22 @@ class TrainerTestGround(unittest.TestCase):
         """
 
 
+class NaiveClassifier2(BaseModel):
+    r"""
+    一个简单的分类器例子,可用于各种测试
+    """
+
+    def __init__(self, in_feature_dim, out_feature_dim):
+        super(NaiveClassifier2, self).__init__()
+        self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+    def forward(self, x):
+        return {"predict": self.mlp(x)}
+
+    def predict(self, x):
+        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
+
 class Fp16TrainerTest(unittest.TestCase):
     def test_raise_error(self):
         data_set = prepare_fake_dataset()
@@ -605,7 +625,7 @@ class Fp16TrainerTest(unittest.TestCase):
                               metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None, use_tqdm=True,
                              check_code_level=2, fp16=True, device=torch.device('cpu'))
 
-    @unittest.skipIf(torch.cuda.is_available()==False, "Skip when no cuda device detch")
+    @unittest.skipIf(torch.cuda.is_available()==False or parse_version(torch.__version__) < parse_version('1.6'), "Skip when no cuda device detected or torch<1.6")
     def test_run_fp16(self):
        data_set = prepare_fake_dataset()
        data_set.set_input("x", flag=True)
@@ -627,7 +647,7 @@ class Fp16TrainerTest(unittest.TestCase):
                               use_tqdm=True, check_code_level=2, fp16=True, device=0, test_use_fp16=False)
         trainer.train(load_best_model=False)
 
-    @unittest.skipIf(torch.cuda.device_count()<2, "Skip when lower than 1 gpus.")
+    @unittest.skipIf(torch.cuda.device_count()<2 or parse_version(torch.__version__) < parse_version('1.6'), "Skip when fewer than 2 gpus or torch<1.6.")
     def test_run_data_parallel(self):
         data_set = prepare_fake_dataset()
         data_set.set_input("x", flag=True)
@@ -635,6 +655,21 @@ class Fp16TrainerTest(unittest.TestCase):
 
         train_set, dev_set = data_set.split(0.3)
 
+        class NaiveClassifier2(BaseModel):
+            r"""
+            一个简单的分类器例子,可用于各种测试
+            """
+
+            def __init__(self, in_feature_dim, out_feature_dim):
+                super(NaiveClassifier2, self).__init__()
+                self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+            def forward(self, x):
+                return {"predict": self.mlp(x)}
+
+            def predict(self, x):
+                return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
         model = NaiveClassifier2(2, 1)
         with self.assertRaises(RuntimeError):
             trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
@@ -643,12 +678,46 @@ class Fp16TrainerTest(unittest.TestCase):
                               use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1])
 
         with self.assertRaises(RuntimeError):
+            class NaiveClassifier3(BaseModel):
+                r"""
+                一个简单的分类器例子,可用于各种测试
+                """
+
+                def __init__(self, in_feature_dim, out_feature_dim):
+                    super(NaiveClassifier3, self).__init__()
+                    self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+                @torch.cuda.amp.autocast()
+                def forward(self, x):
+                    return {"predict": self.mlp(x)}
+
+                @torch.cuda.amp.autocast()
+                def predict(self, x):
+                    return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
             model = NaiveClassifier3(2, 1)
             trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                               batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
                               metrics=AccuracyMetric(pred="predict", target="y"), validate_every=-1, save_path=None,
                               use_tqdm=True, check_code_level=2, fp16=True, device=[0, 1], test_use_fp16=True)
 
+            class NaiveClassifier4(BaseModel):
+                r"""
+                一个简单的分类器例子,可用于各种测试
+                """
+
+                def __init__(self, in_feature_dim, out_feature_dim):
+                    super(NaiveClassifier4, self).__init__()
+                    self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
+
+                def forward(self, x):
+                    with torch.cuda.amp.autocast():
+                        return {"predict": self.mlp(x)}
+
+                def predict(self, x):
+                    with torch.cuda.amp.autocast():
+                        return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
+
             model = NaiveClassifier4(2, 1)
             trainer = Trainer(train_set, model, optimizer=SGD(lr=0.1), loss=BCEWithLogits(pred="predict", target="y"),
                               batch_size=32, n_epochs=10, print_every=50, dev_data=dev_set,
diff --git a/tests/embeddings/test_bert_embedding.py b/tests/embeddings/test_bert_embedding.py
index f0104a58..ef231be0 100644
--- a/tests/embeddings/test_bert_embedding.py
+++ b/tests/embeddings/test_bert_embedding.py
@@ -31,29 +31,33 @@ class TestDownload(unittest.TestCase):
 
 class TestBertEmbedding(unittest.TestCase):
     def test_bert_embedding_1(self):
-        vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split())
-        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1)
-        requires_grad = embed.requires_grad
-        embed.requires_grad = not requires_grad
-        embed.train()
-        words = torch.LongTensor([[2, 3, 4, 0]])
-        result = embed(words)
-        self.assertEqual(result.size(), (1, 4, 16))
-
-        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1)
-        embed.eval()
-        words = torch.LongTensor([[2, 3, 4, 0]])
-        result = embed(words)
-        self.assertEqual(result.size(), (1, 4, 16))
-
-        # 自动截断而不报错
-        embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1,
-                              auto_truncate=True)
-
-        words = torch.LongTensor([[2, 3, 4, 1]*10,
-                                  [2, 3]+[0]*38])
-        result = embed(words)
-        self.assertEqual(result.size(), (2, 40, 16))
+        for pool_method in ['first', 'last', 'max', 'avg']:
+            with self.subTest(pool_method=pool_method):
+                vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInBERT".split())
+                embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1,
+                                      pool_method=pool_method)
+                requires_grad = embed.requires_grad
+                embed.requires_grad = not requires_grad
+                embed.train()
+                words = torch.LongTensor([[2, 3, 4, 0]])
+                result = embed(words)
+                self.assertEqual(result.size(), (1, 4, 16))
+
+                embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1,
+                                      pool_method=pool_method)
+                embed.eval()
+                words = torch.LongTensor([[2, 3, 4, 0]])
+                result = embed(words)
+                self.assertEqual(result.size(), (1, 4, 16))
+
+                # 自动截断而不报错
+                embed = BertEmbedding(vocab, model_dir_or_name='tests/data_for_tests/embedding/small_bert', word_dropout=0.1,
+                                      auto_truncate=True, pool_method=pool_method)
+
+                words = torch.LongTensor([[2, 3, 4, 1]*10,
+                                          [2, 3]+[0]*38])
+                result = embed(words)
+                self.assertEqual(result.size(), (2, 40, 16))
 
     def test_save_load(self):
         bert_save_test = 'bert_save_test'
diff --git a/tests/embeddings/test_stack_embeddings.py b/tests/embeddings/test_stack_embeddings.py
index 2eb0b414..8177fa90 100644
--- a/tests/embeddings/test_stack_embeddings.py
+++ b/tests/embeddings/test_stack_embeddings.py
@@ -18,3 +18,16 @@ class TestCharEmbed(unittest.TestCase):
         y = embed(x)
         self.assertEqual(tuple(y.size()), (2, 3, 130))
 
+    def test_case_2(self):
+        # 测试只需要拥有一样的index就可以concat
+        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['hello', 'Jack'])])
+        vocab1 = Vocabulary().from_dataset(ds, field_name='words')
+        vocab2 = Vocabulary().from_dataset(ds, field_name='words')
+        self.assertEqual(len(vocab1), 5)
+        cnn_embed = CNNCharEmbedding(vocab1, embed_size=60)
+        lstm_embed = LSTMCharEmbedding(vocab2, embed_size=70)
+        embed = StackEmbedding([cnn_embed, lstm_embed])
+        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
+        y = embed(x)
+        self.assertEqual(tuple(y.size()), (2, 3, 130))
+
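A usage sketch of the relaxed StackEmbedding behaviour exercised by test_case_2 above (illustration only, not part of the patch; it reuses the same toy dataset):

import torch
from fastNLP import DataSet, Instance, Vocabulary
from fastNLP.embeddings import CNNCharEmbedding, LSTMCharEmbedding, StackEmbedding

# Two vocabularies built from the same dataset share the same word->index mapping,
# even though they are distinct Vocabulary objects.
ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['hello', 'Jack'])])
vocab1 = Vocabulary().from_dataset(ds, field_name='words')
vocab2 = Vocabulary().from_dataset(ds, field_name='words')

# StackEmbedding (and its append method) now checks index consistency rather than
# requiring the embeddings to share one Vocabulary instance.
stack = StackEmbedding([CNNCharEmbedding(vocab1, embed_size=60)])
stack = stack.append(LSTMCharEmbedding(vocab2, embed_size=70))  # passes the new check
print(stack.embed_size)  # 60 + 70 = 130

words = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
print(stack(words).size())  # torch.Size([2, 3, 130])
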