diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py new file mode 100644 index 00000000..92320fde --- /dev/null +++ b/fastNLP/embeddings/__init__.py @@ -0,0 +1,21 @@ +""" +embeddings 模块里实现了 +""" + +__all__ = [ + "Embedding", + "StaticEmbedding", + "ElmoEmbedding", + "BertEmbedding", + "StackEmbedding", + "LSTMCharEmbedding", + "CNNCharEmbedding", +] + + +from .embedding import Embedding +from .static_embedding import StaticEmbedding +from .elmo_embedding import ElmoEmbedding +from .bert_embedding import BertEmbedding +from .char_embedding import CNNCharEmbedding, LSTMCharEmbedding +from .stack_embedding import StackEmbedding diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py new file mode 100644 index 00000000..f9077e10 --- /dev/null +++ b/fastNLP/embeddings/bert_embedding.py @@ -0,0 +1,321 @@ + +import os +import collections + +from torch import nn +import torch +import numpy as np +from itertools import chain + +from ..core.vocabulary import Vocabulary +from ..io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer +from .contextual_embedding import ContextualEmbedding + + +class BertEmbedding(ContextualEmbedding): + """ + 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` + + 使用BERT对words进行encode的Embedding。建议将输入的words长度限制在450以内,而不要使用512。这是由于预训练的bert模型长 + 度限制为512个token,而因为输入的word是未进行word piece分割的,在分割之后长度可能会超过最大长度限制。 + + Example:: + + >>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') + + + :param fastNLP.Vocabulary vocab: 词表 + :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``. 
+ :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 + :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces + 中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 + 会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。 + :param bool requires_grad: 是否需要gradient。 + """ + def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', + pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False, + include_cls_sep: bool=False): + super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + # 根据model_dir_or_name检查是否存在并下载 + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: + PRETRAIN_URL = _get_base_url('bert') + model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] + model_url = PRETRAIN_URL + model_name + model_dir = cached_path(model_url) + # 检查是否存在 + elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): + model_dir = model_dir_or_name + else: + raise ValueError(f"Cannot recognize {model_dir_or_name}.") + + self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, + pool_method=pool_method, include_cls_sep=include_cls_sep) + + self.requires_grad = requires_grad + self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size + + def _delete_model_weights(self): + del self.model + + def forward(self, words): + """ + 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 + 删除这两个token的表示。 + + :param torch.LongTensor words: [batch_size, max_len] + :return: torch.FloatTensor. 
batch_size x max_len x (768*len(self.layers))
+        """
+        words = self.drop_word(words)
+        outputs = self._get_sent_reprs(words)
+        if outputs is not None:
+            return self.dropout(outputs)
+        outputs = self.model(words)
+        outputs = torch.cat([*outputs], dim=-1)
+
+        return self.dropout(outputs)
+
+    @property
+    def requires_grad(self):
+        """
+        Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许
+        :return:
+        """
+        requires_grads = set([param.requires_grad for name, param in self.named_parameters()
+                              if 'word_pieces_lengths' not in name])
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            if 'word_pieces_lengths' in name:  # 这个不能加入到requires_grad中
+                continue
+            param.requires_grad = value
+
+
+class BertWordPieceEncoder(nn.Module):
+    """
+    读取bert模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。
+
+    :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased``
+    :param str layers: 最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层
+    :param bool requires_grad: 是否需要gradient。
+    """
+    def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1',
+                 requires_grad: bool=False):
+        super().__init__()
+        PRETRAIN_URL = _get_base_url('bert')
+
+        if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
+            model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
+            model_url = PRETRAIN_URL + model_name
+            model_dir = cached_path(model_url)
+            # 检查是否存在
+        elif os.path.isdir(model_dir_or_name):
+            model_dir = model_dir_or_name
+        else:
+            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
+
+        self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers)
+        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
+        self.requires_grad = requires_grad
+
+    @property
+    def requires_grad(self):
+        """
+        Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许
+        :return:
+        """
+        requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            param.requires_grad = value
+
+    @property
+    def embed_size(self):
+        return self._embed_size
+
+    def index_datasets(self, *datasets, field_name):
+        """
+        使用bert的tokenizer新生成word_pieces列加入到datasets中,并将它们设置为input。如果首尾不是
+        [CLS]与[SEP]会在首尾额外加入[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。
+
+        :param datasets: DataSet对象
+        :param field_name: 基于哪一列的内容生成word_pieces列。这一列中每个数据应该是List[str]的形式。
+        :return:
+        """
+        self.model.index_dataset(*datasets, field_name=field_name)
+
+    def forward(self, word_pieces, token_type_ids=None):
+        """
+        计算word_pieces的bert embedding表示。传入的word_pieces中应该自行包含[CLS]与[SEP]的tag。
+
+        :param word_pieces: batch_size x max_len
+        :param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话
+        :return: torch.FloatTensor.
batch_size x max_len x (768*len(self.layers)) + """ + outputs = self.model(word_pieces, token_type_ids) + outputs = torch.cat([*outputs], dim=-1) + + return outputs + + +class _WordBertModel(nn.Module): + def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): + super().__init__() + + self.tokenzier = BertTokenizer.from_pretrained(model_dir) + self.encoder = BertModel.from_pretrained(model_dir) + # 检查encoder_layer_number是否合理 + encoder_layer_number = len(self.encoder.encoder.layer) + self.layers = list(map(int, layers.split(','))) + for layer in self.layers: + if layer<0: + assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." + else: + assert layer Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout. + 不同的kernel大小的fitler结果是concat起来的。 + + Example:: + + >>> cnn_char_embed = CNNCharEmbedding(vocab) + + + :param vocab: 词表 + :param embed_size: 该word embedding的大小,默认值为50. + :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率drop + :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. + :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. + :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. + :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. + :param min_char_freq: character的最少出现次数。默认值为2. + """ + def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, + dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), + pool_method: str='max', activation='relu', min_char_freq: int=2): + super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + for kernel in kernel_sizes: + assert kernel % 2 == 1, "Only odd kernel is allowed." 
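+            # an odd kernel size keeps the output length equal to max_word_len, since the conv layers below use padding=kernel_size//2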
+ + assert pool_method in ('max', 'avg') + self.dropout = nn.Dropout(dropout) + self.pool_method = pool_method + # activation function + if isinstance(activation, str): + if activation.lower() == 'relu': + self.activation = F.relu + elif activation.lower() == 'sigmoid': + self.activation = F.sigmoid + elif activation.lower() == 'tanh': + self.activation = F.tanh + elif activation is None: + self.activation = lambda x: x + elif callable(activation): + self.activation = activation + else: + raise Exception( + "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") + + print("Start constructing character vocabulary.") + # 建立char的词表 + self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) + self.char_pad_index = self.char_vocab.padding_idx + print(f"In total, there are {len(self.char_vocab)} distinct characters.") + # 对vocab进行index + max_word_len = max(map(lambda x: len(x[0]), vocab)) + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), + fill_value=self.char_pad_index, dtype=torch.long), + requires_grad=False) + self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + for word, index in vocab: + # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的也是同一个embed + self.words_to_chars_embedding[index, :len(word)] = \ + torch.LongTensor([self.char_vocab.to_index(c) for c in word]) + self.word_lengths[index] = len(word) + self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + + self.convs = nn.ModuleList([nn.Conv1d( + char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) + for i in range(len(kernel_sizes))]) + self._embed_size = embed_size + self.fc = nn.Linear(sum(filter_nums), embed_size) + self.init_param() + + def forward(self, words): + """ + 输入words的index后,生成对应的words的表示。 + + :param words: [batch_size, max_len] + :return: [batch_size, max_len, embed_size] + """ + words = self.drop_word(words) + batch_size, max_len = words.size() + chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len + word_lengths = self.word_lengths[words] # batch_size x max_len + max_word_len = word_lengths.max() + chars = chars[:, :, :max_word_len] + # 为1的地方为mask + chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 + chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size + chars = self.dropout(chars) + reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) + reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M + conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) + for conv in self.convs] + conv_chars = torch.cat(conv_chars, dim=-1).contiguous() # B x max_len x max_word_len x sum(filters) + conv_chars = self.activation(conv_chars) + if self.pool_method == 'max': + conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) + chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) + else: + conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) + chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + chars = self.fc(chars) + return self.dropout(chars) + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + params = [] + for name, param in self.named_parameters(): + if 
'words_to_chars_embedding' not in name and 'word_lengths' not in name: + params.append(param.requires_grad) + requires_grads = set(params) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for name, param in self.named_parameters(): + if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 + continue + param.requires_grad = value + + def init_param(self): + for name, param in self.named_parameters(): + if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset + continue + if param.data.dim()>1: + nn.init.xavier_uniform_(param, 1) + else: + nn.init.uniform_(param, -1, 1) + + +class LSTMCharEmbedding(TokenEmbedding): + """ + 别名::class:`fastNLP.embeddings.LSTMCharEmbedding` :class:`fastNLP.embeddings.char_embedding.LSTMCharEmbedding` + + 使用LSTM的方式对character进行encode. embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool + + Example:: + + >>> lstm_char_embed = LSTMCharEmbedding(vocab) + + :param vocab: 词表 + :param embed_size: embedding的大小。默认值为50. + :param char_emb_size: character的embedding的大小。默认值为50. + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param dropout: 以多大概率drop + :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. + :param pool_method: 支持'max', 'avg' + :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. + :param min_char_freq: character的最小出现次数。默认值为2. + :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 + """ + def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, + dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, + bidirectional=True): + super(LSTMCharEmbedding, self).__init__(vocab) + + assert hidden_size % 2 == 0, "Only even kernel is allowed." + + assert pool_method in ('max', 'avg') + self.pool_method = pool_method + self.dropout = nn.Dropout(dropout) + # activation function + if isinstance(activation, str): + if activation.lower() == 'relu': + self.activation = F.relu + elif activation.lower() == 'sigmoid': + self.activation = F.sigmoid + elif activation.lower() == 'tanh': + self.activation = F.tanh + elif activation is None: + self.activation = lambda x: x + elif callable(activation): + self.activation = activation + else: + raise Exception( + "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") + + print("Start constructing character vocabulary.") + # 建立char的词表 + self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) + self.char_pad_index = self.char_vocab.padding_idx + print(f"In total, there are {len(self.char_vocab)} distinct characters.") + # 对vocab进行index + self.max_word_len = max(map(lambda x: len(x[0]), vocab)) + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), + fill_value=self.char_pad_index, dtype=torch.long), + requires_grad=False) + self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) + for word, index in vocab: + # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了. 
修改为不区分pad与否 + self.words_to_chars_embedding[index, :len(word)] = \ + torch.LongTensor([self.char_vocab.to_index(c) for c in word]) + self.word_lengths[index] = len(word) + self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) + + self.fc = nn.Linear(hidden_size, embed_size) + hidden_size = hidden_size // 2 if bidirectional else hidden_size + + self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) + self._embed_size = embed_size + self.bidirectional = bidirectional + + def forward(self, words): + """ + 输入words的index后,生成对应的words的表示。 + + :param words: [batch_size, max_len] + :return: [batch_size, max_len, embed_size] + """ + words = self.drop_word(words) + batch_size, max_len = words.size() + chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len + word_lengths = self.word_lengths[words] # batch_size x max_len + max_word_len = word_lengths.max() + chars = chars[:, :, :max_word_len] + # 为mask的地方为1 + chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 + chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size + chars = self.dropout(chars) + reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) + char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) + lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) + # B x M x M x H + + lstm_chars = self.activation(lstm_chars) + if self.pool_method == 'max': + lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) + chars, _ = torch.max(lstm_chars, dim=-2) # batch_size x max_len x H + else: + lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) + chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + + chars = self.fc(chars) + + return self.dropout(chars) + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + params = [] + for name, param in self.named_parameters(): + if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: + params.append(param) + requires_grads = set(params) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for name, param in self.named_parameters(): + if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 + continue + param.requires_grad = value diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py new file mode 100644 index 00000000..1831af4e --- /dev/null +++ b/fastNLP/embeddings/contextual_embedding.py @@ -0,0 +1,100 @@ + +from abc import abstractmethod +import torch + +from ..core.vocabulary import Vocabulary +from ..core.dataset import DataSet +from ..core.batch import DataSetIter +from ..core.sampler import SequentialSampler +from ..core.utils import _move_model_to_device, _get_model_device +from .embedding import TokenEmbedding + + +class ContextualEmbedding(TokenEmbedding): + def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): + super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): + """ + 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 + + :param datasets: 
DataSet对象 + :param batch_size: int, 生成cache的sentence表示时使用的batch的大小 + :param device: 参考 :class::fastNLP.Trainer 的device + :param delete_weights: 似乎在生成了cache之后删除权重,在不需要finetune动态模型的情况下,删除权重会大量减少内存占用。 + :return: + """ + for index, dataset in enumerate(datasets): + try: + assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." + assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." + except Exception as e: + print(f"Exception happens at {index} dataset.") + raise e + + sent_embeds = {} + _move_model_to_device(self, device=device) + device = _get_model_device(self) + pad_index = self._word_vocab.padding_idx + print("Start to calculate sentence representations.") + with torch.no_grad(): + for index, dataset in enumerate(datasets): + try: + batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) + for batch_x, batch_y in batch: + words = batch_x['words'].to(device) + words_list = words.tolist() + seq_len = words.ne(pad_index).sum(dim=-1) + max_len = words.size(1) + # 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。 + seq_len_from_behind = (max_len - seq_len).tolist() + word_embeds = self(words).detach().cpu().numpy() + for b in range(words.size(0)): + length = seq_len_from_behind[b] + if length==0: + sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] + else: + sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] + except Exception as e: + print(f"Exception happens at {index} dataset.") + raise e + print("Finish calculating sentence representations.") + self.sent_embeds = sent_embeds + if delete_weights: + self._delete_model_weights() + + def _get_sent_reprs(self, words): + """ + 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None + + :param words: torch.LongTensor + :return: + """ + if hasattr(self, 'sent_embeds'): + words_list = words.tolist() + seq_len = words.ne(self._word_pad_index).sum(dim=-1) + _embeds = [] + for b in range(len(words)): + words_i = tuple(words_list[b][:seq_len[b]]) + embed = self.sent_embeds[words_i] + _embeds.append(embed) + max_sent_len = max(map(len, _embeds)) + embeds = words.new_zeros(len(_embeds), max_sent_len, self.embed_size, dtype=torch.float, + device=words.device) + for i, embed in enumerate(_embeds): + embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) + return embeds + return None + + @abstractmethod + def _delete_model_weights(self): + """删除计算表示的模型以节省资源""" + raise NotImplementedError + + def remove_sentence_cache(self): + """ + 删除缓存的句子表示. 
删除之后如果模型权重没有被删除,将开始使用动态计算权重。 + + :return: + """ + del self.sent_embeds diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py new file mode 100644 index 00000000..f669d121 --- /dev/null +++ b/fastNLP/embeddings/elmo_embedding.py @@ -0,0 +1,326 @@ + +import os + +import torch +import torch.nn as nn +import torch.nn.functional as F +import json +import codecs + +from ..core.vocabulary import Vocabulary +from ..io.file_utils import cached_path, _get_base_url, PRETRAINED_ELMO_MODEL_DIR +from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder +from .contextual_embedding import ContextualEmbedding + + +class ElmoEmbedding(ContextualEmbedding): + """ + 别名::class:`fastNLP.modules.ElmoEmbedding` :class:`fastNLP.modules.encoder.embedding.ElmoEmbedding` + + 使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。 + 我们提供的ELMo预训练模型来自 https://github.com/HIT-SCIR/ELMoForManyLangs + + Example:: + + >>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True) + + :param vocab: 词表 + :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称, + 目前支持的ELMo包括{`en` : 英文版本的ELMo, `cn` : 中文版本的ELMo,}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载 + :param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 + 按照这个顺序concat起来。默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, + 初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) + :param requires_grad: bool, 该层是否需要gradient, 默认为False. + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, + 并删除character encoder,之后将直接使用cache的embedding。默认为False。 + """ + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, + word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): + super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + # 根据model_dir_or_name检查是否存在并下载 + if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: + PRETRAIN_URL = _get_base_url('elmo') + model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] + model_url = PRETRAIN_URL + model_name + model_dir = cached_path(model_url) + # 检查是否存在 + elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): + model_dir = model_dir_or_name + else: + raise ValueError(f"Cannot recognize {model_dir_or_name}.") + self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) + + if layers == 'mix': + self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), + requires_grad=requires_grad) + self.gamma = nn.Parameter(torch.ones(1), requires_grad=requires_grad) + self._get_outputs = self._get_mixed_outputs + self._embed_size = self.model.config['lstm']['projection_dim'] * 2 + else: + layers = list(map(int, layers.split(','))) + assert len(layers) > 0, "Must choose one output" + for layer in layers: + assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." 
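+                # layer 0 is the (duplicated) token embedding from the char CNN; layers 1 and 2 are the two biLSTM layers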
+ self.layers = layers + self._get_outputs = self._get_layer_outputs + self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 + + self.requires_grad = requires_grad + + def _get_mixed_outputs(self, outputs): + # outputs: num_layers x batch_size x max_len x hidden_size + # return: batch_size x max_len x hidden_size + weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) + outputs = torch.einsum('l,lbij->bij', weights, outputs) + return self.gamma.to(outputs) * outputs + + def set_mix_weights_requires_grad(self, flag=True): + """ + 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 + 该方法没有用。 + :param bool flag: 混合不同层表示的结果是否可以训练。 + :return: + """ + if hasattr(self, 'layer_weights'): + self.layer_weights.requires_grad = flag + self.gamma.requires_grad = flag + + def _get_layer_outputs(self, outputs): + if len(self.layers) == 1: + outputs = outputs[self.layers[0]] + else: + outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) + + return outputs + + def forward(self, words: torch.LongTensor): + """ + 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 + 被重复了一次,使得实际上layer=0的结果是[token_embedding;token_embedding], 而layer=1的结果是[forward_hiddens; + backward_hiddens]. + + :param words: batch_size x max_len + :return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers)) + """ + words = self.drop_word(words) + outputs = self._get_sent_reprs(words) + if outputs is not None: + return self.dropout(outputs) + outputs = self.model(words) + outputs = self._get_outputs(outputs) + return self.dropout(outputs) + + def _delete_model_weights(self): + for name in ['layers', 'model', 'layer_weights', 'gamma']: + if hasattr(self, name): + delattr(self, name) + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + + :return: + """ + requires_grads = set([param.requires_grad for name, param in self.named_parameters() + if 'words_to_chars_embedding' not in name and 'words_to_words' not in name]) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for name, param in self.named_parameters(): + if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中 + continue + param.requires_grad = value + + +class _ElmoModel(nn.Module): + """ + 该Module是ElmoEmbedding中进行所有的heavy lifting的地方。做的工作,包括 + (1) 根据配置,加载模型; + (2) 根据vocab,对模型中的embedding进行调整. 
并将其正确初始化 + (3) 保存一个words与chars的对应转换,获取时自动进行相应的转换 + (4) 设计一个保存token的embedding,允许缓存word的表示。 + + """ + + def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): + super(_ElmoModel, self).__init__() + self.model_dir = model_dir + dir = os.walk(self.model_dir) + config_file = None + weight_file = None + config_count = 0 + weight_count = 0 + for path, dir_list, file_list in dir: + for file_name in file_list: + if file_name.__contains__(".json"): + config_file = file_name + config_count += 1 + elif file_name.__contains__(".pkl"): + weight_file = file_name + weight_count += 1 + if config_count > 1 or weight_count > 1: + raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") + elif config_count == 0 or weight_count == 0: + raise Exception(f"No config file or weight file found in {model_dir}") + + config = json.load(open(os.path.join(model_dir, config_file), 'r')) + self.weight_file = os.path.join(model_dir, weight_file) + self.config = config + + OOV_TAG = '' + PAD_TAG = '' + BOS_TAG = '' + EOS_TAG = '' + BOW_TAG = '' + EOW_TAG = '' + + # For the model trained with character-based word encoder. + char_lexicon = {} + with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: + for line in fpi: + tokens = line.strip().split('\t') + if len(tokens) == 1: + tokens.insert(0, '\u3000') + token, i = tokens + char_lexicon[token] = int(i) + + # 做一些sanity check + for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: + assert special_word in char_lexicon, f"{special_word} not found in char.dic." + + # 从vocab中构建char_vocab + char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) + # 需要保证在里面 + char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) + + for word, index in vocab: + char_vocab.add_word_lst(list(word)) + + self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx + # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) + char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), + padding_idx=len(char_vocab)) + + # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict + elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') + + char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] + + found_char_count = 0 + for char, index in char_vocab: # 调整character embedding + if char in char_lexicon: + index_in_pre = char_lexicon.get(char) + found_char_count += 1 + else: + index_in_pre = char_lexicon[OOV_TAG] + char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] + + print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") + # 生成words到chars的映射 + max_chars = config['char_cnn']['max_characters_per_token'] + + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), + fill_value=len(char_vocab), + dtype=torch.long), + requires_grad=False) + for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: + if len(word) + 2 > max_chars: + word = word[:max_chars - 2] + if index == self._pad_index: + continue + elif word == BOS_TAG or word == EOS_TAG: + char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [ + char_vocab.to_index(EOW_TAG)] + char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) + else: + char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [ + char_vocab.to_index(EOW_TAG)] 
+ char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) + self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) + + self.char_vocab = char_vocab + + self.token_embedder = ConvTokenEmbedder( + config, self.weight_file, None, char_emb_layer) + elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight + self.token_embedder.load_state_dict(elmo_model["char_cnn"]) + + self.output_dim = config['lstm']['projection_dim'] + + # lstm encoder + self.encoder = ElmobiLm(config) + self.encoder.load_state_dict(elmo_model["lstm"]) + + if cache_word_reprs: + if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 + print("Start to generate cache word representations.") + batch_size = 320 + # bos eos + word_size = self.words_to_chars_embedding.size(0) + num_batches = word_size // batch_size + \ + int(word_size % batch_size != 0) + + self.cached_word_embedding = nn.Embedding(word_size, + config['lstm']['projection_dim']) + with torch.no_grad(): + for i in range(num_batches): + words = torch.arange(i * batch_size, + min((i + 1) * batch_size, word_size)).long() + chars = self.words_to_chars_embedding[words].unsqueeze(1) # batch_size x 1 x max_chars + word_reprs = self.token_embedder(words.unsqueeze(1), + chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] + self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) + + print("Finish generating cached word representations. Going to delete the character encoder.") + del self.token_embedder, self.words_to_chars_embedding + else: + print("There is no need to cache word representations, since no character information is used.") + + def forward(self, words): + """ + + :param words: batch_size x max_len + :return: num_layers x batch_size x max_len x hidden_size + """ + # 扩展, + batch_size, max_len = words.size() + expanded_words = words.new_zeros(batch_size, max_len + 2) # 因为pad一定为0, + seq_len = words.ne(self._pad_index).sum(dim=-1) + expanded_words[:, 1:-1] = words + expanded_words[:, 0].fill_(self.bos_index) + expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index + seq_len = seq_len + 2 + zero_tensor = expanded_words.new_zeros(expanded_words.shape) + mask = (expanded_words == zero_tensor).unsqueeze(-1) + if hasattr(self, 'cached_word_embedding'): + token_embedding = self.cached_word_embedding(expanded_words) + else: + if hasattr(self, 'words_to_chars_embedding'): + chars = self.words_to_chars_embedding[expanded_words] + else: + chars = None + token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim + + encoder_output = self.encoder(token_embedding, seq_len) + if encoder_output.size(2) < max_len + 2: + num_layers, _, output_len, hidden_size = encoder_output.size() + dummy_tensor = encoder_output.new_zeros(num_layers, batch_size, + max_len + 2 - output_len, hidden_size) + encoder_output = torch.cat((encoder_output, dummy_tensor), 2) + sz = encoder_output.size() # 2, batch_size, max_len, hidden_size + token_embedding = token_embedding.masked_fill(mask, 0) + token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) + encoder_output = torch.cat((token_embedding, encoder_output), dim=0) + + # 删除, . 
这里没有精确地删除,但应该也不会影响最后的结果了。 + encoder_output = encoder_output[:, :, 1:-1] + return encoder_output diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py new file mode 100644 index 00000000..1ac1df3b --- /dev/null +++ b/fastNLP/embeddings/embedding.py @@ -0,0 +1,180 @@ + +import torch.nn as nn +from abc import abstractmethod +import torch + +from .utils import get_embeddings + + +class Embedding(nn.Module): + """ + 别名::class:`fastNLP.embeddings.Embedding` :class:`fastNLP.embeddings.embedding.Embedding` + + Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" + + def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): + """ + + :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), + 第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding; + :param float word_dropout: 按照一定概率随机将word设置为unk_index,这样可以使得unk这个token得到足够的训练, 且会对网络有 + 一定的regularize的作用。 + :param float dropout: 对Embedding的输出的dropout。 + :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 + """ + super(Embedding, self).__init__() + + self.embed = get_embeddings(init_embed) + + self.dropout = nn.Dropout(dropout) + if not isinstance(self.embed, TokenEmbedding): + self._embed_size = self.embed.weight.size(1) + if word_dropout>0 and not isinstance(unk_index, int): + raise ValueError("When drop word is set, you need to pass in the unk_index.") + else: + self._embed_size = self.embed.embed_size + unk_index = self.embed.get_word_vocab().unknown_idx + self.unk_index = unk_index + self.word_dropout = word_dropout + + def forward(self, x): + """ + :param torch.LongTensor x: [batch, seq_len] + :return: torch.Tensor : [batch, seq_len, embed_dim] + """ + if self.word_dropout>0 and self.training: + mask = torch.ones_like(x).float() * self.word_dropout + mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + x = x.masked_fill(mask, self.unk_index) + x = self.embed(x) + return self.dropout(x) + + @property + def num_embedding(self)->int: + if isinstance(self.embed, nn.Embedding): + return self.embed.weight.size(0) + else: + return self.embed.num_embedding + + def __len__(self): + return len(self.embed) + + @property + def embed_size(self) -> int: + return self._embed_size + + @property + def embedding_dim(self) -> int: + return self._embed_size + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + if not isinstance(self.embed, TokenEmbedding): + return self.embed.weight.requires_grad + else: + return self.embed.requires_grad + + @requires_grad.setter + def requires_grad(self, value): + if not isinstance(self.embed, TokenEmbedding): + self.embed.weight.requires_grad = value + else: + self.embed.requires_grad = value + + @property + def size(self): + if isinstance(self.embed, TokenEmbedding): + return self.embed.size + else: + return self.embed.weight.size() + + +class TokenEmbedding(nn.Module): + def __init__(self, vocab, word_dropout=0.0, dropout=0.0): + super(TokenEmbedding, self).__init__() + assert vocab.padding is not None, "Vocabulary must have a padding entry." + self._word_vocab = vocab + self._word_pad_index = vocab.padding_idx + if word_dropout>0: + assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." 
+ self.word_dropout = word_dropout + self._word_unk_index = vocab.unknown_idx + self.dropout_layer = nn.Dropout(dropout) + + def drop_word(self, words): + """ + 按照设定随机将words设置为unknown_index。 + + :param torch.LongTensor words: batch_size x max_len + :return: + """ + if self.word_dropout > 0 and self.training: + mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 + words = words.masked_fill(mask, self._word_unk_index) + return words + + def dropout(self, words): + """ + 对embedding后的word表示进行drop。 + + :param torch.FloatTensor words: batch_size x max_len x embed_size + :return: + """ + return self.dropout_layer(words) + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + requires_grads = set([param.requires_grad for param in self.parameters()]) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for param in self.parameters(): + param.requires_grad = value + + def __len__(self): + return len(self._word_vocab) + + @property + def embed_size(self) -> int: + return self._embed_size + + @property + def embedding_dim(self) -> int: + return self._embed_size + + @property + def num_embedding(self) -> int: + """ + 这个值可能会大于实际的embedding矩阵的大小。 + :return: + """ + return len(self._word_vocab) + + def get_word_vocab(self): + """ + 返回embedding的词典。 + + :return: Vocabulary + """ + return self._word_vocab + + @property + def size(self): + return torch.Size(self.num_embedding, self._embed_size) + + @abstractmethod + def forward(self, *input): + raise NotImplementedError diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py new file mode 100644 index 00000000..e5c7c7a4 --- /dev/null +++ b/fastNLP/embeddings/stack_embedding.py @@ -0,0 +1,92 @@ +from typing import List + +import torch +from torch import nn as nn + +from .embedding import TokenEmbedding + + +class StackEmbedding(TokenEmbedding): + """ + 别名::class:`fastNLP.embeddings.StackEmbedding` :class:`fastNLP.embeddings.stack_embedding.StackEmbedding` + + 支持将多个embedding集合成一个embedding。 + + Example:: + + >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) + >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) + + + :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。不同embedidng会在相同的位置 + 被设置为unknown。如果这里设置了dropout,则组成的embedding就不要再设置dropout了。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + + """ + def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): + vocabs = [] + for embed in embeds: + if hasattr(embed, 'get_word_vocab'): + vocabs.append(embed.get_word_vocab()) + _vocab = vocabs[0] + for vocab in vocabs[1:]: + assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." + + super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) + assert isinstance(embeds, list) + for embed in embeds: + assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." 
+        self.embeds = nn.ModuleList(embeds)
+        self._embed_size = sum([embed.embed_size for embed in self.embeds])
+
+    def append(self, embed: TokenEmbedding):
+        """
+        添加一个embedding到结尾。
+        :param embed:
+        :return:
+        """
+        assert isinstance(embed, TokenEmbedding)
+        self.embeds.append(embed)
+
+    def pop(self):
+        """
+        弹出最后一个embed。
+        :return:
+        """
+        return self.embeds.pop()
+
+    @property
+    def embed_size(self):
+        return self._embed_size
+
+    @property
+    def requires_grad(self):
+        """
+        Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许
+        :return:
+        """
+        requires_grads = set([embed.requires_grad for embed in self.embeds])
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for embed in self.embeds:
+            embed.requires_grad = value
+
+    def forward(self, words):
+        """
+        得到多个embedding的结果,并把结果按照顺序concat起来。
+
+        :param words: batch_size x max_len
+        :return: 返回的shape和当前这个stack embedding中embedding的组成有关
+        """
+        outputs = []
+        words = self.drop_word(words)
+        for embed in self.embeds:
+            outputs.append(embed(words))
+        outputs = self.dropout(torch.cat(outputs, dim=-1))
+        return outputs
\ No newline at end of file
diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py
new file mode 100644
index 00000000..c8778e35
--- /dev/null
+++ b/fastNLP/embeddings/static_embedding.py
@@ -0,0 +1,217 @@
+
+import os
+
+import torch
+import torch.nn as nn
+import numpy as np
+import warnings
+
+from ..core.vocabulary import Vocabulary
+from ..io.file_utils import PRETRAIN_STATIC_FILES, _get_base_url, cached_path
+from .embedding import TokenEmbedding
+
+
+class StaticEmbedding(TokenEmbedding):
+    """
+    别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding`
+
+    StaticEmbedding组件. 给定embedding的名称,根据vocab从embedding中抽取相应的数据。之后该Embedding就可以按照正常的embedding使用了。
+
+    Example::
+
+        >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')
+
+
+    :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。
+    :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding
+        的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
+        `en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。
+    :param bool requires_grad: 是否需要gradient.
默认为True + :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 + :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 + 为大写的词语开辟一个vector表示,则将lower设置为False。 + :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 + :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 + :param bool normailize: 是否对vector进行normalize,使得每个vector的norm为1。 + """ + def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None, + lower=False, dropout=0, word_dropout=0, normalize=False): + super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) + + # 得到cache_path + if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: + PRETRAIN_URL = _get_base_url('static') + model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] + model_url = PRETRAIN_URL + model_name + model_path = cached_path(model_url) + # 检查是否存在 + elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): + model_path = model_dir_or_name + else: + raise ValueError(f"Cannot recognize {model_dir_or_name}.") + + # 读取embedding + if lower: + lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) + for word, index in vocab: + if not vocab._is_word_no_create_entry(word): + lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 + for word in vocab._no_create_word.keys(): # 不需要创建entry的 + if word in vocab: + lowered_word = word.lower() + if lowered_word not in lowered_vocab.word_count: + lowered_vocab.add_word(lowered_word) + lowered_vocab._no_create_word[lowered_word] += 1 + print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " + f"words.") + embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method, + normalize=normalize) + # 需要适配一下 + if not hasattr(self, 'words_to_words'): + self.words_to_words = torch.arange(len(lowered_vocab, )).long() + if lowered_vocab.unknown: + unknown_idx = lowered_vocab.unknown_idx + else: + unknown_idx = embedding.size(0) - 1 # 否则是最后一个为unknow + words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), + requires_grad=False) + for word, index in vocab: + if word not in lowered_vocab: + word = word.lower() + if lowered_vocab._is_word_no_create_entry(word): # 如果不需要创建entry,已经默认unknown了 + continue + words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] + self.words_to_words = words_to_words + else: + embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method, + normalize=normalize) + self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], + padding_idx=vocab.padding_idx, + max_norm=None, norm_type=2, scale_grad_by_freq=False, + sparse=False, _weight=embedding) + self._embed_size = self.embedding.weight.size(1) + self.requires_grad = requires_grad + + @property + def requires_grad(self): + """ + Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + :return: + """ + requires_grads = set([param.requires_grad for name, param in self.named_parameters() + if 'words_to_words' not in name]) + if len(requires_grads) == 1: + return requires_grads.pop() + else: + return None + + @requires_grad.setter + def requires_grad(self, value): + for name, param in self.named_parameters(): + if 'words_to_words' in name: + continue + param.requires_grad = value + + def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', 
unknown='', + normalize=True, error='ignore', init_method=None): + """ + 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 + word2vec(第一行只有两个元素)还是glove格式的数据。 + + :param str embed_filepath: 预训练的embedding的路径。 + :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 + 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 + :param dtype: 读出的embedding的类型 + :param str padding: 词表中padding的token + :param str unknown: 词表中unknown的token + :param bool normalize: 是否将每个vector归一化到norm为1 + :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 + 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 + :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用torch.nn.init.zeros_ + :return torch.tensor: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 + """ + assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." + if not os.path.exists(embed_filepath): + raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) + with open(embed_filepath, 'r', encoding='utf-8') as f: + line = f.readline().strip() + parts = line.split() + start_idx = 0 + if len(parts) == 2: + dim = int(parts[1]) + start_idx += 1 + else: + dim = len(parts) - 1 + f.seek(0) + matrix = {} + found_count = 0 + for idx, line in enumerate(f, start_idx): + try: + parts = line.strip().split() + word = ''.join(parts[:-dim]) + nums = parts[-dim:] + # 对齐unk与pad + if word == padding and vocab.padding is not None: + word = vocab.padding + elif word == unknown and vocab.unknown is not None: + word = vocab.unknown + if word in vocab: + index = vocab.to_index(word) + matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) + found_count += 1 + except Exception as e: + if error == 'ignore': + warnings.warn("Error occurred at the {} line.".format(idx)) + else: + print("Error occurred at the {} line.".format(idx)) + raise e + print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + for word, index in vocab: + if index not in matrix and not vocab._is_word_no_create_entry(word): + if vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 + matrix[index] = matrix[vocab.unknown_idx] + else: + matrix[index] = None + + vectors = torch.zeros(len(matrix), dim) + if init_method: + init_method(vectors) + else: + nn.init.uniform_(vectors, -np.sqrt(3/dim), np.sqrt(3/dim)) + + if vocab._no_create_word_length>0: + if vocab.unknown is None: # 创建一个专门的unknown + unknown_idx = len(matrix) + vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() + else: + unknown_idx = vocab.unknown_idx + words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), + requires_grad=False) + for order, (index, vec) in enumerate(matrix.items()): + if vec is not None: + vectors[order] = vec + words_to_words[index] = order + self.words_to_words = words_to_words + else: + for index, vec in matrix.items(): + if vec is not None: + vectors[index] = vec + + if normalize: + vectors /= (torch.norm(vectors, dim=1, keepdim=True) + 1e-12) + + return vectors + + def forward(self, words): + """ + 传入words的index + + :param words: torch.LongTensor, [batch_size, max_len] + :return: torch.FloatTensor, [batch_size, max_len, embed_size] + """ + if hasattr(self, 'words_to_words'): + words = self.words_to_words[words] + words = self.drop_word(words) + words = self.embedding(words) + words = self.dropout(words) + return words diff --git a/fastNLP/embeddings/utils.py 
b/fastNLP/embeddings/utils.py new file mode 100644 index 00000000..ff5d8733 --- /dev/null +++ b/fastNLP/embeddings/utils.py @@ -0,0 +1,47 @@ +import numpy as np +import torch +from torch import nn as nn + +from ..core.vocabulary import Vocabulary + + +def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): + """ + 给定一个word的vocabulary生成character的vocabulary. + + :param vocab: 从vocab + :param min_freq: + :return: + """ + char_vocab = Vocabulary(min_freq=min_freq) + for word, index in vocab: + if not vocab._is_word_no_create_entry(word): + char_vocab.add_word_lst(list(word)) + return char_vocab + + +def get_embeddings(init_embed): + """ + 根据输入的init_embed生成nn.Embedding对象。 + + :param init_embed: 可以是 tuple:(num_embedings, embedding_dim), 即embedding的大小和每个词的维度;也可以传入 + nn.Embedding 对象, 此时就以传入的对象作为embedding; 传入np.ndarray也行,将使用传入的ndarray作为作为Embedding初始 + 化; 传入orch.Tensor, 将使用传入的值作为Embedding初始化。 + :return nn.Embedding embeddings: + """ + if isinstance(init_embed, tuple): + res = nn.Embedding( + num_embeddings=init_embed[0], embedding_dim=init_embed[1]) + nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), + b=np.sqrt(3/res.weight.data.size(1))) + elif isinstance(init_embed, nn.Module): + res = init_embed + elif isinstance(init_embed, torch.Tensor): + res = nn.Embedding.from_pretrained(init_embed, freeze=False) + elif isinstance(init_embed, np.ndarray): + init_embed = torch.tensor(init_embed, dtype=torch.float32) + res = nn.Embedding.from_pretrained(init_embed, freeze=False) + else: + raise TypeError( + 'invalid init_embed type: {}'.format((type(init_embed)))) + return res \ No newline at end of file diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py index cecaee96..481b5056 100644 --- a/fastNLP/io/data_loader/matching.py +++ b/fastNLP/io/data_loader/matching.py @@ -6,7 +6,7 @@ from ...core.const import Const from ...core.vocabulary import Vocabulary from ..base_loader import DataBundle, DataSetLoader from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ...modules.encoder._bert import BertTokenizer +from ...modules.encoder.bert import BertTokenizer class MatchingLoader(DataSetLoader): diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index fb186ce4..adecab60 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -8,7 +8,7 @@ from torch import nn from .base_model import BaseModel from ..core.const import Const from ..modules.encoder import BertModel -from ..modules.encoder._bert import BertConfig +from ..modules.encoder.bert import BertConfig class BertForSequenceClassification(BaseModel): diff --git a/fastNLP/models/biaffine_parser.py b/fastNLP/models/biaffine_parser.py index 8533e7af..dc13ba42 100644 --- a/fastNLP/models/biaffine_parser.py +++ b/fastNLP/models/biaffine_parser.py @@ -20,7 +20,7 @@ from ..modules.dropout import TimestepDropout from ..modules.encoder.transformer import TransformerEncoder from ..modules.encoder.variational_rnn import VarLSTM from ..modules.utils import initial_parameter -from ..modules.utils import get_embeddings +from ..embeddings.utils import get_embeddings from .base_model import BaseModel from ..core.utils import seq_len_to_mask diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index 081dd510..e7d5f0ca 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -6,8 +6,9 @@ import torch import torch.nn as nn from ..core.const import Const as C +from 
..core.utils import seq_len_to_mask from ..modules import encoder -from fastNLP import seq_len_to_mask +from ..embeddings import embedding class CNNText(torch.nn.Module): @@ -33,7 +34,7 @@ class CNNText(torch.nn.Module): super(CNNText, self).__init__() # no support for pre-trained embedding currently - self.embed = encoder.Embedding(init_embed) + self.embed = embedding.Embedding(init_embed) self.conv_pool = encoder.ConvMaxpool( in_channels=self.embed.embedding_dim, out_channels=kernel_nums, diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 8e6a5db1..ff8cbd30 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -10,6 +10,7 @@ import torch import torch.nn as nn from .base_model import BaseModel +from ..embeddings import embedding from ..modules import decoder, encoder from ..modules.decoder.crf import allowed_transitions from ..core.utils import seq_len_to_mask @@ -32,10 +33,10 @@ class SeqLabeling(BaseModel): def __init__(self, init_embed, hidden_size, num_classes): super(SeqLabeling, self).__init__() - self.Embedding = encoder.embedding.Embedding(init_embed) - self.Rnn = encoder.lstm.LSTM(self.Embedding.embedding_dim, hidden_size) + self.Embedding = embedding.Embedding(init_embed) + self.Rnn = encoder.LSTM(self.Embedding.embedding_dim, hidden_size) self.Linear = nn.Linear(hidden_size, num_classes) - self.Crf = decoder.crf.ConditionalRandomField(num_classes) + self.Crf = decoder.ConditionalRandomField(num_classes) self.mask = None def forward(self, words, seq_len, target): @@ -129,7 +130,7 @@ class AdvSeqLabel(nn.Module): super().__init__() - self.Embedding = encoder.embedding.Embedding(init_embed) + self.Embedding = embedding.Embedding(init_embed) self.norm1 = torch.nn.LayerNorm(self.Embedding.embedding_dim) self.Rnn = encoder.LSTM(input_size=self.Embedding.embedding_dim, hidden_size=hidden_size, num_layers=2, dropout=dropout, diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index d12524cc..0a76d48a 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -8,11 +8,10 @@ import torch.nn.functional as F from torch.nn import CrossEntropyLoss -from fastNLP.models import BaseModel -from fastNLP.modules.encoder.embedding import TokenEmbedding -from fastNLP.modules.encoder.lstm import LSTM -from fastNLP.core.const import Const -from fastNLP.core.utils import seq_len_to_mask +from .base_model import BaseModel +from ..embeddings.embedding import TokenEmbedding +from ..core.const import Const +from ..core.utils import seq_len_to_mask class ESIM(BaseModel): diff --git a/fastNLP/models/star_transformer.py b/fastNLP/models/star_transformer.py index bb91a5b6..97593f72 100644 --- a/fastNLP/models/star_transformer.py +++ b/fastNLP/models/star_transformer.py @@ -13,7 +13,7 @@ from torch import nn from ..modules.encoder.star_transformer import StarTransformer from ..core.utils import seq_len_to_mask -from ..modules.utils import get_embeddings +from ..embeddings.utils import get_embeddings from ..core.const import Const diff --git a/fastNLP/modules/__init__.py b/fastNLP/modules/__init__.py index 2cd2216c..43ec3f5f 100644 --- a/fastNLP/modules/__init__.py +++ b/fastNLP/modules/__init__.py @@ -24,7 +24,6 @@ __all__ = [ "ConvolutionCharEncoder", "LSTMCharEncoder", "ConvMaxpool", - "Embedding", "LSTM", "StarTransformer", "TransformerEncoder", @@ -48,4 +47,3 @@ from . 
import encoder from .decoder import * from .dropout import TimestepDropout from .encoder import * -from .utils import get_embeddings diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 7b5bc070..051a0c01 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,19 +1,11 @@ __all__ = [ - # "BertModel", + "BertModel", "ConvolutionCharEncoder", "LSTMCharEncoder", "ConvMaxpool", - "Embedding", - "StaticEmbedding", - "ElmoEmbedding", - "BertEmbedding", - "StackEmbedding", - "LSTMCharEmbedding", - "CNNCharEmbedding", - "LSTM", "StarTransformer", @@ -31,12 +23,10 @@ __all__ = [ "MultiHeadAttention", ] -from ._bert import BertModel -from .bert import BertWordPieceEncoder + +from .bert import BertModel from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool -from .embedding import Embedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, \ - StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding from .lstm import LSTM from .star_transformer import StarTransformer from .transformer import TransformerEncoder diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py deleted file mode 100644 index 61a5d7d1..00000000 --- a/fastNLP/modules/encoder/_bert.py +++ /dev/null @@ -1,1069 +0,0 @@ - - - -""" -这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你 - 有用,也请引用一下他们。 -""" - - - -from ...core.vocabulary import Vocabulary -import collections - -import unicodedata -import numpy as np -from itertools import chain -import copy -import json -import math -import os - -import torch -from torch import nn -import glob -import sys - -CONFIG_FILE = 'bert_config.json' - - -class BertConfig(object): - """Configuration class to store the configuration of a `BertModel`. - """ - def __init__(self, - vocab_size_or_config_json_file, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02, - layer_norm_eps=1e-12): - """Constructs BertConfig. - - Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. - hidden_size: Size of the encoder layers and the pooler layer. - num_hidden_layers: Number of hidden layers in the Transformer encoder. - num_attention_heads: Number of attention heads for each attention layer in - the Transformer encoder. - intermediate_size: The size of the "intermediate" (i.e., feed-forward) - layer in the Transformer encoder. - hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected - layers in the embeddings, encoder, and pooler. - attention_probs_dropout_prob: The dropout ratio for the attention - probabilities. - max_position_embeddings: The maximum sequence length that this model might - ever be used with. Typically set this to something large just in case - (e.g., 512 or 1024 or 2048). - type_vocab_size: The vocabulary size of the `token_type_ids` passed into - `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. - layer_norm_eps: The epsilon used by LayerNorm. 
- """ - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - else: - raise ValueError("First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)") - - @classmethod - def from_dict(cls, json_object): - """Constructs a `BertConfig` from a Python dictionary of parameters.""" - config = BertConfig(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - config.__dict__[key] = value - return config - - @classmethod - def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" - with open(json_file, "r", encoding='utf-8') as reader: - text = reader.read() - return cls.from_dict(json.loads(text)) - - def __repr__(self): - return str(self.to_json_string()) - - def to_dict(self): - """Serializes this instance to a Python dictionary.""" - output = copy.deepcopy(self.__dict__) - return output - - def to_json_string(self): - """Serializes this instance to a JSON string.""" - return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" - - def to_json_file(self, json_file_path): - """ Save this instance to a json file.""" - with open(json_file_path, "w", encoding='utf-8') as writer: - writer.write(self.to_json_string()) - - -def gelu(x): - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - -def swish(x): - return x * torch.sigmoid(x) - - -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} - - -class BertLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): - """Construct a layernorm module in the TF style (epsilon inside the square root). - """ - super(BertLayerNorm, self).__init__() - self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) - self.variance_epsilon = eps - - def forward(self, x): - u = x.mean(-1, keepdim=True) - s = (x - u).pow(2).mean(-1, keepdim=True) - x = (x - u) / torch.sqrt(s + self.variance_epsilon) - return self.weight * x + self.bias - - -class BertEmbeddings(nn.Module): - """Construct the embeddings from word, position and token_type embeddings. 
- """ - def __init__(self, config): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) - self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) - - # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load - # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids, token_type_ids=None): - seq_length = input_ids.size(1) - position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand_as(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - words_embeddings = self.word_embeddings(input_ids) - position_embeddings = self.position_embeddings(position_ids) - token_type_embeddings = self.token_type_embeddings(token_type_ids) - - embeddings = words_embeddings + position_embeddings + token_type_embeddings - embeddings = self.LayerNorm(embeddings) - embeddings = self.dropout(embeddings) - return embeddings - - -class BertSelfAttention(nn.Module): - def __init__(self, config): - super(BertSelfAttention, self).__init__() - if config.hidden_size % config.num_attention_heads != 0: - raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads)) - self.num_attention_heads = config.num_attention_heads - self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(config.hidden_size, self.all_head_size) - self.key = nn.Linear(config.hidden_size, self.all_head_size) - self.value = nn.Linear(config.hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(config.attention_probs_dropout_prob) - - def transpose_for_scores(self, x): - new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) - x = x.view(*new_x_shape) - return x.permute(0, 2, 1, 3) - - def forward(self, hidden_states, attention_mask): - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - - query_layer = self.transpose_for_scores(mixed_query_layer) - key_layer = self.transpose_for_scores(mixed_key_layer) - value_layer = self.transpose_for_scores(mixed_value_layer) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) - # Apply the attention mask is (precomputed for all layers in BertModel forward() function) - attention_scores = attention_scores + attention_mask - - # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) - - # This is actually dropping out entire tokens to attend to, which might - # seem a bit unusual, but is taken from the original Transformer paper. 
- attention_probs = self.dropout(attention_probs) - - context_layer = torch.matmul(attention_probs, value_layer) - context_layer = context_layer.permute(0, 2, 1, 3).contiguous() - new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) - context_layer = context_layer.view(*new_context_layer_shape) - return context_layer - - -class BertSelfOutput(nn.Module): - def __init__(self, config): - super(BertSelfOutput, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertAttention(nn.Module): - def __init__(self, config): - super(BertAttention, self).__init__() - self.self = BertSelfAttention(config) - self.output = BertSelfOutput(config) - - def forward(self, input_tensor, attention_mask): - self_output = self.self(input_tensor, attention_mask) - attention_output = self.output(self_output, input_tensor) - return attention_output - - -class BertIntermediate(nn.Module): - def __init__(self, config): - super(BertIntermediate, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): - self.intermediate_act_fn = ACT2FN[config.hidden_act] - else: - self.intermediate_act_fn = config.hidden_act - - def forward(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) - return hidden_states - - -class BertOutput(nn.Module): - def __init__(self, config): - super(BertOutput, self).__init__() - self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states) - hidden_states = self.LayerNorm(hidden_states + input_tensor) - return hidden_states - - -class BertLayer(nn.Module): - def __init__(self, config): - super(BertLayer, self).__init__() - self.attention = BertAttention(config) - self.intermediate = BertIntermediate(config) - self.output = BertOutput(config) - - def forward(self, hidden_states, attention_mask): - attention_output = self.attention(hidden_states, attention_mask) - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - return layer_output - - -class BertEncoder(nn.Module): - def __init__(self, config): - super(BertEncoder, self).__init__() - layer = BertLayer(config) - self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) - - def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): - all_encoder_layers = [] - for layer_module in self.layer: - hidden_states = layer_module(hidden_states, attention_mask) - if output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - if not output_all_encoded_layers: - all_encoder_layers.append(hidden_states) - return all_encoder_layers - - -class BertPooler(nn.Module): - def __init__(self, config): - 
super(BertPooler, self).__init__() - self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.activation = nn.Tanh() - - def forward(self, hidden_states): - # We "pool" the model by simply taking the hidden state corresponding - # to the first token. - first_token_tensor = hidden_states[:, 0] - pooled_output = self.dense(first_token_tensor) - pooled_output = self.activation(pooled_output) - return pooled_output - - -class BertModel(nn.Module): - """BERT(Bidirectional Embedding Representations from Transformers). - - 如果你想使用预训练好的权重矩阵,请在以下网址下载. - sources:: - - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", - 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", - 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", - 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", - 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" - - - 用预训练权重矩阵来建立BERT模型:: - - model = BertModel.from_pretrained("path/to/weights/directory") - - 用随机初始化权重矩阵来建立BERT模型:: - - model = BertModel() - - :param int vocab_size: 词表大小,默认值为30522,为BERT English uncase版本的词表大小 - :param int hidden_size: 隐层大小,默认值为768,为BERT base的版本 - :param int num_hidden_layers: 隐藏层数,默认值为12,为BERT base的版本 - :param int num_attention_heads: 多头注意力头数,默认值为12,为BERT base的版本 - :param int intermediate_size: FFN隐藏层大小,默认值是3072,为BERT base的版本 - :param str hidden_act: FFN隐藏层激活函数,默认值为``gelu`` - :param float hidden_dropout_prob: FFN隐藏层dropout,默认值为0.1 - :param float attention_probs_dropout_prob: Attention层的dropout,默认值为0.1 - :param int max_position_embeddings: 最大的序列长度,默认值为512, - :param int type_vocab_size: 最大segment数量,默认值为2 - :param int initializer_range: 初始化权重范围,默认值为0.02 - """ - - def __init__(self, config, *inputs, **kwargs): - super(BertModel, self).__init__() - if not isinstance(config, BertConfig): - raise ValueError( - "Parameter config in `{}(config)` should be an instance of class `BertConfig`. 
" - "To create a model from a Google pretrained model use " - "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( - self.__class__.__name__, self.__class__.__name__ - )) - super(BertModel, self).__init__() - self.config = config - self.hidden_size = self.config.hidden_size - self.embeddings = BertEmbeddings(config) - self.encoder = BertEncoder(config) - self.pooler = BertPooler(config) - self.apply(self.init_bert_weights) - - def init_bert_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, BertLayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) - - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. - extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - embedding_output = self.embeddings(input_ids, token_type_ids) - encoded_layers = self.encoder(embedding_output, - extended_attention_mask, - output_all_encoded_layers=output_all_encoded_layers) - sequence_output = encoded_layers[-1] - pooled_output = self.pooler(sequence_output) - if not output_all_encoded_layers: - encoded_layers = encoded_layers[-1] - return encoded_layers, pooled_output - - @classmethod - def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs): - state_dict = kwargs.get('state_dict', None) - kwargs.pop('state_dict', None) - cache_dir = kwargs.get('cache_dir', None) - kwargs.pop('cache_dir', None) - from_tf = kwargs.get('from_tf', False) - kwargs.pop('from_tf', None) - # Load config - config_file = os.path.join(pretrained_model_dir, CONFIG_FILE) - config = BertConfig.from_json_file(config_file) - # logger.info("Model config {}".format(config)) - # Instantiate model. 
- model = cls(config, *inputs, **kwargs) - if state_dict is None: - files = glob.glob(os.path.join(pretrained_model_dir, '*.bin')) - if len(files)==0: - raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}") - elif len(files)>1: - raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}") - weights_path = files[0] - state_dict = torch.load(weights_path, map_location='cpu') - - old_keys = [] - new_keys = [] - for key in state_dict.keys(): - new_key = None - if 'gamma' in key: - new_key = key.replace('gamma', 'weight') - if 'beta' in key: - new_key = key.replace('beta', 'bias') - if new_key: - old_keys.append(key) - new_keys.append(new_key) - for old_key, new_key in zip(old_keys, new_keys): - state_dict[new_key] = state_dict.pop(old_key) - - missing_keys = [] - unexpected_keys = [] - error_msgs = [] - # copy state_dict so _load_from_state_dict can modify it - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() - if metadata is not None: - state_dict._metadata = metadata - - def load(module, prefix=''): - local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) - module._load_from_state_dict( - state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(model, prefix='' if hasattr(model, 'bert') else 'bert.') - if len(missing_keys) > 0: - print("Weights of {} not initialized from pretrained model: {}".format( - model.__class__.__name__, missing_keys)) - if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format( - model.__class__.__name__, unexpected_keys)) - return model - - -def whitespace_tokenize(text): - """Runs basic whitespace cleaning and splitting on a piece of text.""" - text = text.strip() - if not text: - return [] - tokens = text.split() - return tokens - - -class WordpieceTokenizer(object): - """Runs WordPiece tokenization.""" - - def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): - self.vocab = vocab - self.unk_token = unk_token - self.max_input_chars_per_word = max_input_chars_per_word - - def tokenize(self, text): - """Tokenizes a piece of text into its word pieces. - - This uses a greedy longest-match-first algorithm to perform tokenization - using the given vocabulary. - - For example: - input = "unaffable" - output = ["un", "##aff", "##able"] - - Args: - text: A single token or whitespace separated tokens. This should have - already been passed through `BasicTokenizer`. - - Returns: - A list of wordpiece tokens. 
- """ - - output_tokens = [] - for token in whitespace_tokenize(text): - chars = list(token) - if len(chars) > self.max_input_chars_per_word: - output_tokens.append(self.unk_token) - continue - - is_bad = False - start = 0 - sub_tokens = [] - while start < len(chars): - end = len(chars) - cur_substr = None - while start < end: - substr = "".join(chars[start:end]) - if start > 0: - substr = "##" + substr - if substr in self.vocab: - cur_substr = substr - break - end -= 1 - if cur_substr is None: - is_bad = True - break - sub_tokens.append(cur_substr) - start = end - - if is_bad: - output_tokens.append(self.unk_token) - else: - output_tokens.extend(sub_tokens) - return output_tokens - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - index = 0 - with open(vocab_file, "r", encoding="utf-8") as reader: - while True: - token = reader.readline() - if not token: - break - token = token.strip() - vocab[token] = index - index += 1 - return vocab - -class BasicTokenizer(object): - """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" - - def __init__(self, - do_lower_case=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BasicTokenizer. - - Args: - do_lower_case: Whether to lower case the input. - """ - self.do_lower_case = do_lower_case - self.never_split = never_split - - def tokenize(self, text): - """Tokenizes a piece of text.""" - text = self._clean_text(text) - # This was added on November 1st, 2018 for the multilingual and Chinese - # models. This is also applied to the English models now, but it doesn't - # matter since the English models were not trained on any Chinese data - # and generally don't have any Chinese data in them (there are Chinese - # characters in the vocabulary because Wikipedia does have some Chinese - # words in the English Wikipedia.). 
- text = self._tokenize_chinese_chars(text) - orig_tokens = whitespace_tokenize(text) - split_tokens = [] - for token in orig_tokens: - if self.do_lower_case and token not in self.never_split: - token = token.lower() - token = self._run_strip_accents(token) - split_tokens.extend(self._run_split_on_punc(token)) - - output_tokens = whitespace_tokenize(" ".join(split_tokens)) - return output_tokens - - def _run_strip_accents(self, text): - """Strips accents from a piece of text.""" - text = unicodedata.normalize("NFD", text) - output = [] - for char in text: - cat = unicodedata.category(char) - if cat == "Mn": - continue - output.append(char) - return "".join(output) - - def _run_split_on_punc(self, text): - """Splits punctuation on a piece of text.""" - if text in self.never_split: - return [text] - chars = list(text) - i = 0 - start_new_word = True - output = [] - while i < len(chars): - char = chars[i] - if _is_punctuation(char): - output.append([char]) - start_new_word = True - else: - if start_new_word: - output.append([]) - start_new_word = False - output[-1].append(char) - i += 1 - - return ["".join(x) for x in output] - - def _tokenize_chinese_chars(self, text): - """Adds whitespace around any CJK character.""" - output = [] - for char in text: - cp = ord(char) - if self._is_chinese_char(cp): - output.append(" ") - output.append(char) - output.append(" ") - else: - output.append(char) - return "".join(output) - - def _is_chinese_char(self, cp): - """Checks whether CP is the codepoint of a CJK character.""" - # This defines a "chinese character" as anything in the CJK Unicode block: - # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) - # - # Note that the CJK Unicode block is NOT all Japanese and Korean characters, - # despite its name. The modern Korean Hangul alphabet is a different block, - # as is Japanese Hiragana and Katakana. Those alphabets are used to write - # space-separated words, so they are not treated specially and handled - # like the all of the other languages. - if ((cp >= 0x4E00 and cp <= 0x9FFF) or # - (cp >= 0x3400 and cp <= 0x4DBF) or # - (cp >= 0x20000 and cp <= 0x2A6DF) or # - (cp >= 0x2A700 and cp <= 0x2B73F) or # - (cp >= 0x2B740 and cp <= 0x2B81F) or # - (cp >= 0x2B820 and cp <= 0x2CEAF) or - (cp >= 0xF900 and cp <= 0xFAFF) or # - (cp >= 0x2F800 and cp <= 0x2FA1F)): # - return True - - return False - - def _clean_text(self, text): - """Performs invalid character removal and whitespace cleanup on text.""" - output = [] - for char in text: - cp = ord(char) - if cp == 0 or cp == 0xfffd or _is_control(char): - continue - if _is_whitespace(char): - output.append(" ") - else: - output.append(char) - return "".join(output) - - -def _is_whitespace(char): - """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them - # as whitespace since they are generally considered as such. - if char == " " or char == "\t" or char == "\n" or char == "\r": - return True - cat = unicodedata.category(char) - if cat == "Zs": - return True - return False - - -def _is_control(char): - """Checks whether `chars` is a control character.""" - # These are technically control characters but we count them as whitespace - # characters. 
- if char == "\t" or char == "\n" or char == "\r": - return False - cat = unicodedata.category(char) - if cat.startswith("C"): - return True - return False - - -def _is_punctuation(char): - """Checks whether `chars` is a punctuation character.""" - cp = ord(char) - # We treat all non-letter/number ASCII as punctuation. - # Characters such as "^", "$", and "`" are not in the Unicode - # Punctuation class but we treat them as punctuation anyways, for - # consistency. - if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or - (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): - return True - cat = unicodedata.category(char) - if cat.startswith("P"): - return True - return False - - -class BertTokenizer(object): - """Runs end-to-end tokenization: punctuation splitting + wordpiece""" - - def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, - never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): - """Constructs a BertTokenizer. - - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input - Only has an effect when do_wordpiece_only=False - do_basic_tokenize: Whether to do basic tokenization before wordpiece. - max_len: An artificial maximum length to truncate tokenized sequences to; - Effective maximum length is always the minimum of this - value (if specified) and the underlying BERT model's - sequence length. - never_split: List of tokens which will never be split during tokenization. - Only has an effect when do_wordpiece_only=False - """ - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " - "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) - self.ids_to_tokens = collections.OrderedDict( - [(ids, tok) for tok, ids in self.vocab.items()]) - self.do_basic_tokenize = do_basic_tokenize - if do_basic_tokenize: - self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, - never_split=never_split) - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - self.max_len = max_len if max_len is not None else int(1e12) - - def _reinit_on_new_vocab(self, vocab): - """ - 在load bert之后,可能会对vocab进行重新排列。重新排列之后调用这个函数重新初始化与vocab相关的性质 - - :param vocab: - :return: - """ - self.vocab = vocab - self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) - - def tokenize(self, text): - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens - - def convert_tokens_to_ids(self, tokens): - """Converts a sequence of tokens into ids using the vocab.""" - ids = [] - for token in tokens: - ids.append(self.vocab[token]) - if len(ids) > self.max_len: - print( - "Token indices sequence length is longer than the specified maximum " - " sequence length for this BERT model ({} > {}). 
Running this" - " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) - ) - return ids - - def convert_ids_to_tokens(self, ids): - """Converts a sequence of ids in wordpiece tokens using the vocab.""" - tokens = [] - for i in ids: - tokens.append(self.ids_to_tokens[i]) - return tokens - - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_NAME) - else: - vocab_file = vocab_path - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." - " Please check that the vocabulary is not corrupted!".format(vocab_file)) - index = token_index - writer.write(token + u'\n') - index += 1 - return vocab_file - - @classmethod - def from_pretrained(cls, model_dir, *inputs, **kwargs): - """ - 给定path,直接读取vocab. - - """ - pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME) - print("loading vocabulary file {}".format(pretrained_model_name_or_path)) - max_len = 512 - kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) - # Instantiate tokenizer. - tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) - return tokenizer - -VOCAB_NAME = 'vocab.txt' - -class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): - super().__init__() - - self.tokenzier = BertTokenizer.from_pretrained(model_dir) - self.encoder = BertModel.from_pretrained(model_dir) - # 检查encoder_layer_number是否合理 - encoder_layer_number = len(self.encoder.encoder.layer) - self.layers = list(map(int, layers.split(','))) - for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ - f"a bert model with {encoder_layer_number} layers." - else: - assert layer 1 or weight_count > 1: - raise Exception(f"Multiple config files(*.json) or weight files(*.hdf5) detected in {model_dir}.") - elif config_count == 0 or weight_count == 0: - raise Exception(f"No config file or weight file found in {model_dir}") - - config = json.load(open(os.path.join(model_dir, config_file), 'r')) - self.weight_file = os.path.join(model_dir, weight_file) - self.config = config - - OOV_TAG = '' - PAD_TAG = '' - BOS_TAG = '' - EOS_TAG = '' - BOW_TAG = '' - EOW_TAG = '' - - # For the model trained with character-based word encoder. - char_lexicon = {} - with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: - for line in fpi: - tokens = line.strip().split('\t') - if len(tokens) == 1: - tokens.insert(0, '\u3000') - token, i = tokens - char_lexicon[token] = int(i) - - # 做一些sanity check - for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: - assert special_word in char_lexicon, f"{special_word} not found in char.dic." 
- - # 从vocab中构建char_vocab - char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) - # 需要保证在里面 - char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) - - for word, index in vocab: - char_vocab.add_word_lst(list(word)) - - self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx - # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) - char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), - padding_idx=len(char_vocab)) - - # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict - elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') - - char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] - - found_char_count = 0 - for char, index in char_vocab: # 调整character embedding - if char in char_lexicon: - index_in_pre = char_lexicon.get(char) - found_char_count += 1 - else: - index_in_pre = char_lexicon[OOV_TAG] - char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - - print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") - # 生成words到chars的映射 - max_chars = config['char_cnn']['max_characters_per_token'] - - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), - fill_value=len(char_vocab), - dtype=torch.long), - requires_grad=False) - for word, index in list(iter(vocab)) + [(BOS_TAG, len(vocab)), (EOS_TAG, len(vocab) + 1)]: - if len(word) + 2 > max_chars: - word = word[:max_chars - 2] - if index == self._pad_index: - continue - elif word == BOS_TAG or word == EOS_TAG: - char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(word)] + [ - char_vocab.to_index(EOW_TAG)] - char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) - else: - char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [ - char_vocab.to_index(EOW_TAG)] - char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) - self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) - - self.char_vocab = char_vocab - - self.token_embedder = ConvTokenEmbedder( - config, self.weight_file, None, char_emb_layer) - elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight - self.token_embedder.load_state_dict(elmo_model["char_cnn"]) - - self.output_dim = config['lstm']['projection_dim'] - - # lstm encoder - self.encoder = ElmobiLm(config) - self.encoder.load_state_dict(elmo_model["lstm"]) - - if cache_word_reprs: - if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 - print("Start to generate cache word representations.") - batch_size = 320 - # bos eos - word_size = self.words_to_chars_embedding.size(0) - num_batches = word_size // batch_size + \ - int(word_size % batch_size != 0) - - self.cached_word_embedding = nn.Embedding(word_size, - config['lstm']['projection_dim']) - with torch.no_grad(): - for i in range(num_batches): - words = torch.arange(i * batch_size, - min((i + 1) * batch_size, word_size)).long() - chars = self.words_to_chars_embedding[words].unsqueeze(1) # batch_size x 1 x max_chars - word_reprs = self.token_embedder(words.unsqueeze(1), - chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] - self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - - print("Finish generating cached word representations. 
Going to delete the character encoder.") - del self.token_embedder, self.words_to_chars_embedding - else: - print("There is no need to cache word representations, since no character information is used.") - - def forward(self, words): - """ - - :param words: batch_size x max_len - :return: num_layers x batch_size x max_len x hidden_size - """ - # 扩展, - batch_size, max_len = words.size() - expanded_words = words.new_zeros(batch_size, max_len + 2) # 因为pad一定为0, - seq_len = words.ne(self._pad_index).sum(dim=-1) - expanded_words[:, 1:-1] = words - expanded_words[:, 0].fill_(self.bos_index) - expanded_words[torch.arange(batch_size).to(words), seq_len + 1] = self.eos_index - seq_len = seq_len + 2 - zero_tensor = expanded_words.new_zeros(expanded_words.shape) - mask = (expanded_words == zero_tensor).unsqueeze(-1) - if hasattr(self, 'cached_word_embedding'): - token_embedding = self.cached_word_embedding(expanded_words) - else: - if hasattr(self, 'words_to_chars_embedding'): - chars = self.words_to_chars_embedding[expanded_words] - else: - chars = None - token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim - - encoder_output = self.encoder(token_embedding, seq_len) - if encoder_output.size(2) < max_len + 2: - num_layers, _, output_len, hidden_size = encoder_output.size() - dummy_tensor = encoder_output.new_zeros(num_layers, batch_size, - max_len + 2 - output_len, hidden_size) - encoder_output = torch.cat((encoder_output, dummy_tensor), 2) - sz = encoder_output.size() # 2, batch_size, max_len, hidden_size - token_embedding = token_embedding.masked_fill(mask, 0) - token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) - encoder_output = torch.cat((token_embedding, encoder_output), dim=0) - - # 删除, . 这里没有精确地删除,但应该也不会影响最后的结果了。 - encoder_output = encoder_output[:, :, 1:-1] - return encoder_output diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index 1819cc69..6d32ae74 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -1,79 +1,919 @@ + + +""" +这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你 + 有用,也请引用一下他们。 +""" + + +import collections + +import unicodedata +import copy +import json +import math import os -from torch import nn + import torch -from ...io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR -from ._bert import _WordPieceBertModel, BertModel +from torch import nn +import glob +import sys +CONFIG_FILE = 'bert_config.json' -class BertWordPieceEncoder(nn.Module): + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. """ - 读取bert模型,读取之后调用index_dataset方法在dataset中生成word_pieces这一列。 + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. 
+ intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) + - :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` - :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 - :param bool requires_grad: 是否需要gradient。 +def gelu(x): + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-12): + """Construct a layernorm module in the TF style (epsilon inside the square 
root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', - requires_grad: bool=False): - super().__init__() - PRETRAIN_URL = _get_base_url('bert') - - if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(model_dir_or_name): - model_dir = model_dir_or_name + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) 
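# ---------------------------------------------------------------------------
# Editorial sketch (not part of this patch): transpose_for_scores reshapes
# [batch, seq_len, hidden] into [batch, num_heads, seq_len, head_size] so that
# each head attends independently. The toy sizes below (batch=2, seq_len=5,
# hidden=768, 12 heads of size 64) are assumptions for illustration.
import torch

x = torch.randn(2, 5, 768)
x = x.view(2, 5, 12, 64).permute(0, 2, 1, 3)   # same reshaping as transpose_for_scores
assert x.shape == (2, 12, 5, 64)
# ---------------------------------------------------------------------------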
+ value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + + def forward(self, input_tensor, attention_mask): + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.intermediate_act_fn = ACT2FN[config.hidden_act] else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + intermediate_output = self.intermediate(attention_output) + layer_output 
= self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertModel(nn.Module): + """BERT(Bidirectional Embedding Representations from Transformers). + + 如果你想使用预训练好的权重矩阵,请在以下网址下载. + sources:: + + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" + + + 用预训练权重矩阵来建立BERT模型:: + + model = BertModel.from_pretrained("path/to/weights/directory") + + 用随机初始化权重矩阵来建立BERT模型:: - self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers) - self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size - self.requires_grad = requires_grad + model = BertModel() - @property - def requires_grad(self): + :param int vocab_size: 词表大小,默认值为30522,为BERT English 
uncase版本的词表大小 + :param int hidden_size: 隐层大小,默认值为768,为BERT base的版本 + :param int num_hidden_layers: 隐藏层数,默认值为12,为BERT base的版本 + :param int num_attention_heads: 多头注意力头数,默认值为12,为BERT base的版本 + :param int intermediate_size: FFN隐藏层大小,默认值是3072,为BERT base的版本 + :param str hidden_act: FFN隐藏层激活函数,默认值为``gelu`` + :param float hidden_dropout_prob: FFN隐藏层dropout,默认值为0.1 + :param float attention_probs_dropout_prob: Attention层的dropout,默认值为0.1 + :param int max_position_embeddings: 最大的序列长度,默认值为512, + :param int type_vocab_size: 最大segment数量,默认值为2 + :param int initializer_range: 初始化权重范围,默认值为0.02 + """ + + def __init__(self, config, *inputs, **kwargs): + super(BertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + super(BertModel, self).__init__() + self.config = config + self.hidden_size = self.config.hidden_size + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def init_bert_weights(self, module): + """ Initialize the weights. """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
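# ---------------------------------------------------------------------------
# Editorial sketch (not part of this patch): what the two lines below compute.
# A 0/1 padding mask of shape [batch, seq_len] is broadcast to
# [batch, 1, 1, seq_len] and turned into an additive bias: 0.0 where tokens
# should be attended, -10000.0 where they are padding. The toy mask is an
# assumption for illustration.
import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])               # 1 = real token, 0 = padding
extended = attention_mask.unsqueeze(1).unsqueeze(2).float()
extended = (1.0 - extended) * -10000.0
assert extended.shape == (1, 1, 1, 4)
assert extended[0, 0, 0].tolist() == [0.0, 0.0, 0.0, -10000.0]
# ---------------------------------------------------------------------------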
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + @classmethod + def from_pretrained(cls, pretrained_model_dir, *inputs, **kwargs): + state_dict = kwargs.get('state_dict', None) + kwargs.pop('state_dict', None) + cache_dir = kwargs.get('cache_dir', None) + kwargs.pop('cache_dir', None) + from_tf = kwargs.get('from_tf', False) + kwargs.pop('from_tf', None) + # Load config + config_file = os.path.join(pretrained_model_dir, CONFIG_FILE) + config = BertConfig.from_json_file(config_file) + # logger.info("Model config {}".format(config)) + # Instantiate model. + model = cls(config, *inputs, **kwargs) + if state_dict is None: + files = glob.glob(os.path.join(pretrained_model_dir, '*.bin')) + if len(files)==0: + raise FileNotFoundError(f"There is no *.bin file in {pretrained_model_dir}") + elif len(files)>1: + raise FileExistsError(f"There are multiple *.bin files in {pretrained_model_dir}") + weights_path = files[0] + state_dict = torch.load(weights_path, map_location='cpu') + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + print("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + print("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + return model + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. 
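# Illustrative sketch (not part of the diff): greedy longest-match-first WordPiece on a toy
# vocabulary. The dict below is made up for the example; real vocabularies come from vocab.txt.
toy_vocab = {"[UNK]": 0, "un": 1, "##aff": 2, "##able": 3}
wp = WordpieceTokenizer(vocab=toy_vocab)
print(wp.tokenize("unaffable"))   # ['un', '##aff', '##able']
print(wp.tokenize("xyzzy"))       # ['[UNK]'] -- no piece matches, so unk_token is emitted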
+ + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with open(vocab_file, "r", encoding="utf-8") as reader: + while True: + token = reader.readline() + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, + do_lower_case=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
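# Illustrative sketch (not part of the diff): the BasicTokenizer defined above applied to a
# small mixed input -- lower-casing, accent stripping, punctuation splitting and CJK spacing.
basic = BasicTokenizer(do_lower_case=True)
print(basic.tokenize("Héllo, 世界!"))   # ['hello', ',', '世', '界', '!']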
+ text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case and token not in self.never_split: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + if text in self.never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. 
+ if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +class BertTokenizer(object): + """Runs end-to-end tokenization: punctuation splitting + wordpiece""" + + def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, + never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): + """Constructs a BertTokenizer. + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input + Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; + Effective maximum length is always the minimum of this + value (if specified) and the underlying BERT model's + sequence length. + never_split: List of tokens which will never be split during tokenization. + Only has an effect when do_wordpiece_only=False + """ + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + self.max_len = max_len if max_len is not None else int(1e12) + + def _reinit_on_new_vocab(self, vocab): + """ + 在load bert之后,可能会对vocab进行重新排列。重新排列之后调用这个函数重新初始化与vocab相关的性质 + + :param vocab: :return: """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters()]) - if len(requires_grads)==1: - return requires_grads.pop() + self.vocab = vocab + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + if len(ids) > self.max_len: + print( + "Token indices sequence length is longer than the specified maximum " + " sequence length for this BERT model ({} > {}). 
Running this" + " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) + ) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_NAME) else: - return None + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + print("Saving vocabulary to {}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return vocab_file + + @classmethod + def from_pretrained(cls, model_dir, *inputs, **kwargs): + """ + 给定path,直接读取vocab. + + """ + pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME) + print("loading vocabulary file {}".format(pretrained_model_name_or_path)) + max_len = 512 + kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) + # Instantiate tokenizer. + tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - param.requires_grad = value +VOCAB_NAME = 'vocab.txt' - @property - def embed_size(self): - return self._embed_size - def index_datasets(self, *datasets, field_name): +class _WordPieceBertModel(nn.Module): + """ + 这个模块用于直接计算word_piece的结果. + + """ + def __init__(self, model_dir:str, layers:str='-1'): + super().__init__() + + self.tokenzier = BertTokenizer.from_pretrained(model_dir) + self.encoder = BertModel.from_pretrained(model_dir) + # 检查encoder_layer_number是否合理 + encoder_layer_number = len(self.encoder.encoder.layer) + self.layers = list(map(int, layers.split(','))) + for layer in self.layers: + if layer<0: + assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." 
+ else: + assert layer0 and not isinstance(unk_index, int): - raise ValueError("When drop word is set, you need to pass in the unk_index.") - else: - self._embed_size = self.embed.embed_size - unk_index = self.embed.get_word_vocab().unknown_idx - self.unk_index = unk_index - self.word_dropout = word_dropout - - def forward(self, x): - """ - :param torch.LongTensor x: [batch, seq_len] - :return: torch.Tensor : [batch, seq_len, embed_dim] - """ - if self.word_dropout>0 and self.training: - mask = torch.ones_like(x).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 - x = x.masked_fill(mask, self.unk_index) - x = self.embed(x) - return self.dropout(x) - - @property - def num_embedding(self)->int: - if isinstance(self.embed, nn.Embedding): - return self.embed.weight.size(0) - else: - return self.embed.num_embedding - - def __len__(self): - return len(self.embed) - - @property - def embed_size(self) -> int: - return self._embed_size - - @property - def embedding_dim(self) -> int: - return self._embed_size - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - if not isinstance(self.embed, TokenEmbedding): - return self.embed.weight.requires_grad - else: - return self.embed.requires_grad - - @requires_grad.setter - def requires_grad(self, value): - if not isinstance(self.embed, TokenEmbedding): - self.embed.weight.requires_grad = value - else: - self.embed.requires_grad = value - - @property - def size(self): - if isinstance(self.embed, TokenEmbedding): - return self.embed.size - else: - return self.embed.weight.size() - - -class TokenEmbedding(nn.Module): - def __init__(self, vocab, word_dropout=0.0, dropout=0.0): - super(TokenEmbedding, self).__init__() - assert vocab.padding is not None, "Vocabulary must have a padding entry." - self._word_vocab = vocab - self._word_pad_index = vocab.padding_idx - if word_dropout>0: - assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." 
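# Illustrative sketch (not part of the diff): end-to-end use of the BertTokenizer defined
# earlier in this file; "path/to/bert_dir" is a placeholder directory containing vocab.txt.
tokenizer = BertTokenizer.from_pretrained("path/to/bert_dir")
tokens = tokenizer.tokenize("fastNLP is easy to use")    # BasicTokenizer + WordpieceTokenizer
ids = tokenizer.convert_tokens_to_ids(tokens)            # ids taken from vocab.txt
assert tokenizer.convert_ids_to_tokens(ids) == tokens    # the two mappings are inverses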
- self.word_dropout = word_dropout - self._word_unk_index = vocab.unknown_idx - self.dropout_layer = nn.Dropout(dropout) - - def drop_word(self, words): - """ - 按照设定随机将words设置为unknown_index。 - - :param torch.LongTensor words: batch_size x max_len - :return: - """ - if self.word_dropout > 0 and self.training: - mask = torch.ones_like(words).float() * self.word_dropout - mask = torch.bernoulli(mask).byte() # dropout_word越大,越多位置为1 - words = words.masked_fill(mask, self._word_unk_index) - return words - - def dropout(self, words): - """ - 对embedding后的word表示进行drop。 - - :param torch.FloatTensor words: batch_size x max_len x embed_size - :return: - """ - return self.dropout_layer(words) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([param.requires_grad for param in self.parameters()]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for param in self.parameters(): - param.requires_grad = value - - def __len__(self): - return len(self._word_vocab) - - @property - def embed_size(self) -> int: - return self._embed_size - - @property - def embedding_dim(self) -> int: - return self._embed_size - - @property - def num_embedding(self) -> int: - """ - 这个值可能会大于实际的embedding矩阵的大小。 - :return: - """ - return len(self._word_vocab) - - def get_word_vocab(self): - """ - 返回embedding的词典。 - - :return: Vocabulary - """ - return self._word_vocab - - @property - def size(self): - return torch.Size(self.num_embedding, self._embed_size) - - @abstractmethod - def forward(self, *input): - raise NotImplementedError - -class StaticEmbedding(TokenEmbedding): - """ - 别名::class:`fastNLP.modules.StaticEmbedding` :class:`fastNLP.modules.encoder.embedding.StaticEmbedding` - - StaticEmbedding组件. 给定embedding的名称,根据vocab从embedding中抽取相应的数据。该Embedding可以就按照正常的embedding使用了 - - Example:: - - >>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50') - - - :param vocab: Vocabulary. 若该项为None则会读取所有的embedding。 - :param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding - 的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, - `en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 - :param bool requires_grad: 是否需要gradient. 
默认为True - :param callable init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。调用该方法时传入一个tensor对象。 - :param bool lower: 是否将vocab中的词语小写后再和预训练的词表进行匹配。如果你的词表中包含大写的词语,或者就是需要单独 - 为大写的词语开辟一个vector表示,则将lower设置为False。 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param bool normailize: 是否对vector进行normalize,使得每个vector的norm为1。 - """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=True, init_method=None, - lower=False, dropout=0, word_dropout=0, normalize=False): - super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - # 得到cache_path - if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: - PRETRAIN_URL = _get_base_url('static') - model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_path = cached_path(model_url) - # 检查是否存在 - elif os.path.isfile(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_path = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") - - # 读取embedding - if lower: - lowered_vocab = Vocabulary(padding=vocab.padding, unknown=vocab.unknown) - for word, index in vocab: - if not vocab._is_word_no_create_entry(word): - lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 - for word in vocab._no_create_word.keys(): # 不需要创建entry的 - if word in vocab: - lowered_word = word.lower() - if lowered_word not in lowered_vocab.word_count: - lowered_vocab.add_word(lowered_word) - lowered_vocab._no_create_word[lowered_word] += 1 - print(f"All word in vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} unique lowered " - f"words.") - embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method, - normalize=normalize) - # 需要适配一下 - if not hasattr(self, 'words_to_words'): - self.words_to_words = torch.arange(len(lowered_vocab, )).long() - if lowered_vocab.unknown: - unknown_idx = lowered_vocab.unknown_idx - else: - unknown_idx = embedding.size(0) - 1 # 否则是最后一个为unknow - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) - for word, index in vocab: - if word not in lowered_vocab: - word = word.lower() - if lowered_vocab._is_word_no_create_entry(word): # 如果不需要创建entry,已经默认unknown了 - continue - words_to_words[index] = self.words_to_words[lowered_vocab.to_index(word)] - self.words_to_words = words_to_words - else: - embedding = self._load_with_vocab(model_path, vocab=vocab, init_method=init_method, - normalize=normalize) - self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], - padding_idx=vocab.padding_idx, - max_norm=None, norm_type=2, scale_grad_by_freq=False, - sparse=False, _weight=embedding) - self._embed_size = self.embedding.weight.size(1) - self.requires_grad = requires_grad - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_words' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_words' in name: - continue - param.requires_grad = value - - def _load_with_vocab(self, embed_filepath, vocab, dtype=np.float32, padding='', 
unknown='', - normalize=True, error='ignore', init_method=None): - """ - 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 - word2vec(第一行只有两个元素)还是glove格式的数据。 - - :param str embed_filepath: 预训练的embedding的路径。 - :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 - 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 - :param dtype: 读出的embedding的类型 - :param str padding: 词表中padding的token - :param str unknown: 词表中unknown的token - :param bool normalize: 是否将每个vector归一化到norm为1 - :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 - 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 - :param init_method: 如何初始化没有找到的值。可以使用torch.nn.init.*中各种方法。默认使用torch.nn.init.zeros_ - :return torch.tensor: shape为 [len(vocab), dimension], dimension由pretrain的embedding决定。 - """ - assert isinstance(vocab, Vocabulary), "Only fastNLP.Vocabulary is supported." - if not os.path.exists(embed_filepath): - raise FileNotFoundError("`{}` does not exist.".format(embed_filepath)) - with open(embed_filepath, 'r', encoding='utf-8') as f: - line = f.readline().strip() - parts = line.split() - start_idx = 0 - if len(parts) == 2: - dim = int(parts[1]) - start_idx += 1 - else: - dim = len(parts) - 1 - f.seek(0) - matrix = {} - found_count = 0 - for idx, line in enumerate(f, start_idx): - try: - parts = line.strip().split() - word = ''.join(parts[:-dim]) - nums = parts[-dim:] - # 对齐unk与pad - if word == padding and vocab.padding is not None: - word = vocab.padding - elif word == unknown and vocab.unknown is not None: - word = vocab.unknown - if word in vocab: - index = vocab.to_index(word) - matrix[index] = torch.from_numpy(np.fromstring(' '.join(nums), sep=' ', dtype=dtype, count=dim)) - found_count += 1 - except Exception as e: - if error == 'ignore': - warnings.warn("Error occurred at the {} line.".format(idx)) - else: - print("Error occurred at the {} line.".format(idx)) - raise e - print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) - for word, index in vocab: - if index not in matrix and not vocab._is_word_no_create_entry(word): - if vocab.unknown_idx in matrix: # 如果有unkonwn,用unknown初始化 - matrix[index] = matrix[vocab.unknown_idx] - else: - matrix[index] = None - - vectors = torch.zeros(len(matrix), dim) - if init_method: - init_method(vectors) - else: - nn.init.uniform_(vectors, -np.sqrt(3/dim), np.sqrt(3/dim)) - - if vocab._no_create_word_length>0: - if vocab.unknown is None: # 创建一个专门的unknown - unknown_idx = len(matrix) - vectors = torch.cat((vectors, torch.zeros(1, dim)), dim=0).contiguous() - else: - unknown_idx = vocab.unknown_idx - words_to_words = nn.Parameter(torch.full((len(vocab),), fill_value=unknown_idx).long(), - requires_grad=False) - for order, (index, vec) in enumerate(matrix.items()): - if vec is not None: - vectors[order] = vec - words_to_words[index] = order - self.words_to_words = words_to_words - else: - for index, vec in matrix.items(): - if vec is not None: - vectors[index] = vec - - if normalize: - vectors /= (torch.norm(vectors, dim=1, keepdim=True) + 1e-12) - - return vectors - - def forward(self, words): - """ - 传入words的index - - :param words: torch.LongTensor, [batch_size, max_len] - :return: torch.FloatTensor, [batch_size, max_len, embed_size] - """ - if hasattr(self, 'words_to_words'): - words = self.words_to_words[words] - words = self.drop_word(words) - words = self.embedding(words) - words = self.dropout(words) - return words - - -class ContextualEmbedding(TokenEmbedding): - 
def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): - super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): - """ - 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 - - :param datasets: DataSet对象 - :param batch_size: int, 生成cache的sentence表示时使用的batch的大小 - :param device: 参考 :class::fastNLP.Trainer 的device - :param delete_weights: 似乎在生成了cache之后删除权重,在不需要finetune动态模型的情况下,删除权重会大量减少内存占用。 - :return: - """ - for index, dataset in enumerate(datasets): - try: - assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." - assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." - except Exception as e: - print(f"Exception happens at {index} dataset.") - raise e - - sent_embeds = {} - _move_model_to_device(self, device=device) - device = _get_model_device(self) - pad_index = self._word_vocab.padding_idx - print("Start to calculate sentence representations.") - with torch.no_grad(): - for index, dataset in enumerate(datasets): - try: - batch = DataSetIter(dataset, batch_size=batch_size, sampler=SequentialSampler()) - for batch_x, batch_y in batch: - words = batch_x['words'].to(device) - words_list = words.tolist() - seq_len = words.ne(pad_index).sum(dim=-1) - max_len = words.size(1) - # 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。 - seq_len_from_behind = (max_len - seq_len).tolist() - word_embeds = self(words).detach().cpu().numpy() - for b in range(words.size(0)): - length = seq_len_from_behind[b] - if length==0: - sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] - else: - sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] - except Exception as e: - print(f"Exception happens at {index} dataset.") - raise e - print("Finish calculating sentence representations.") - self.sent_embeds = sent_embeds - if delete_weights: - self._delete_model_weights() - - def _get_sent_reprs(self, words): - """ - 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None - - :param words: torch.LongTensor - :return: - """ - if hasattr(self, 'sent_embeds'): - words_list = words.tolist() - seq_len = words.ne(self._word_pad_index).sum(dim=-1) - _embeds = [] - for b in range(len(words)): - words_i = tuple(words_list[b][:seq_len[b]]) - embed = self.sent_embeds[words_i] - _embeds.append(embed) - max_sent_len = max(map(len, _embeds)) - embeds = words.new_zeros(len(_embeds), max_sent_len, self.embed_size, dtype=torch.float, - device=words.device) - for i, embed in enumerate(_embeds): - embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) - return embeds - return None - - @abstractmethod - def _delete_model_weights(self): - """删除计算表示的模型以节省资源""" - raise NotImplementedError - - def remove_sentence_cache(self): - """ - 删除缓存的句子表示. 
删除之后如果模型权重没有被删除,将开始使用动态计算权重。 - - :return: - """ - del self.sent_embeds - - -class ElmoEmbedding(ContextualEmbedding): - """ - 别名::class:`fastNLP.modules.ElmoEmbedding` :class:`fastNLP.modules.encoder.embedding.ElmoEmbedding` - - 使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。 - 我们提供的ELMo预训练模型来自 https://github.com/HIT-SCIR/ELMoForManyLangs - - Example:: - - >>> embedding = ElmoEmbedding(vocab, model_dir_or_name='en', layers='2', requires_grad=True) - - :param vocab: 词表 - :param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称, - 目前支持的ELMo包括{`en` : 英文版本的ELMo, `cn` : 中文版本的ELMo,}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载 - :param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 - 按照这个顺序concat起来。默认为'2'。'mix'会使用可学习的权重结合不同层的表示(权重是否可训练与requires_grad保持一致, - 初始化权重对三层结果进行mean-pooling, 可以通过ElmoEmbedding.set_mix_weights_requires_grad()方法只将mix weights设置为可学习。) - :param requires_grad: bool, 该层是否需要gradient, 默认为False. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, - 并删除character encoder,之后将直接使用cache的embedding。默认为False。 - """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', layers: str='2', requires_grad: bool=False, - word_dropout=0.0, dropout=0.0, cache_word_reprs: bool=False): - super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - # 根据model_dir_or_name检查是否存在并下载 - if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: - PRETRAIN_URL = _get_base_url('elmo') - model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") - self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) - - if layers=='mix': - self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers']+1), - requires_grad=requires_grad) - self.gamma = nn.Parameter(torch.ones(1), requires_grad=requires_grad) - self._get_outputs = self._get_mixed_outputs - self._embed_size = self.model.config['lstm']['projection_dim'] * 2 - else: - layers = list(map(int, layers.split(','))) - assert len(layers) > 0, "Must choose one output" - for layer in layers: - assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." 
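# Illustrative sketch (not part of the diff): the 'mix' mode described above learns scalar
# weights over the ELMo layers; `vocab` is assumed to be an existing fastNLP Vocabulary.
from fastNLP.embeddings import ElmoEmbedding
embed = ElmoEmbedding(vocab, model_dir_or_name='en', layers='mix', requires_grad=False)
embed.set_mix_weights_requires_grad(True)   # train only the layer-mixing weights and gamma
# embed.add_sentence_cache(train_data, batch_size=32)   # optional sentence-level cache; train_data is a placeholder DataSet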
- self.layers = layers - self._get_outputs = self._get_layer_outputs - self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 - - self.requires_grad = requires_grad - - def _get_mixed_outputs(self, outputs): - # outputs: num_layers x batch_size x max_len x hidden_size - # return: batch_size x max_len x hidden_size - weights = F.softmax(self.layer_weights+1/len(outputs), dim=0).to(outputs) - outputs = torch.einsum('l,lbij->bij', weights, outputs) - return self.gamma.to(outputs)*outputs - - def set_mix_weights_requires_grad(self, flag=True): - """ - 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 - 该方法没有用。 - :param bool flag: 混合不同层表示的结果是否可以训练。 - :return: - """ - if hasattr(self, 'layer_weights'): - self.layer_weights.requires_grad = flag - self.gamma.requires_grad = flag - - def _get_layer_outputs(self, outputs): - if len(self.layers) == 1: - outputs = outputs[self.layers[0]] - else: - outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) - - return outputs - - def forward(self, words: torch.LongTensor): - """ - 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 - 被重复了一次,使得实际上layer=0的结果是[token_embedding;token_embedding], 而layer=1的结果是[forward_hiddens; - backward_hiddens]. - - :param words: batch_size x max_len - :return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers)) - """ - words = self.drop_word(words) - outputs = self._get_sent_reprs(words) - if outputs is not None: - return self.dropout(outputs) - outputs = self.model(words) - outputs = self._get_outputs(outputs) - return self.dropout(outputs) - - def _delete_model_weights(self): - for name in ['layers', 'model', 'layer_weights', 'gamma']: - if hasattr(self, name): - delattr(self, name) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'words_to_chars_embedding' not in name and 'words_to_words' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'words_to_words' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - - -class BertEmbedding(ContextualEmbedding): - """ - 别名::class:`fastNLP.modules.BertEmbedding` :class:`fastNLP.modules.encoder.embedding.BertEmbedding` - - 使用BERT对words进行encode的Embedding。建议将输入的words长度限制在450以内,而不要使用512。这是由于预训练的bert模型长 - 度限制为512个token,而因为输入的word是未进行word piece分割的,在分割之后长度可能会超过最大长度限制。 - - Example:: - - >>> embedding = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', requires_grad=False, layers='4,-2,-1') - - - :param fastNLP.Vocabulary vocab: 词表 - :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为 ``en-base-uncased``. 
- :param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 - :param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces - 中计算得到它对应的表示。支持``last``, ``first``, ``avg``, ``max``。 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - :param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 - 会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。 - :param bool requires_grad: 是否需要gradient。 - """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, requires_grad: bool=False, - include_cls_sep: bool=False): - super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - # 根据model_dir_or_name检查是否存在并下载 - if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: - PRETRAIN_URL = _get_base_url('bert') - model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] - model_url = PRETRAIN_URL + model_name - model_dir = cached_path(model_url) - # 检查是否存在 - elif os.path.isdir(os.path.expanduser(os.path.abspath(model_dir_or_name))): - model_dir = model_dir_or_name - else: - raise ValueError(f"Cannot recognize {model_dir_or_name}.") - - self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, - pool_method=pool_method, include_cls_sep=include_cls_sep) - - self.requires_grad = requires_grad - self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size - - def _delete_model_weights(self): - del self.model - - def forward(self, words): - """ - 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 - 删除这两个token的表示。 - - :param torch.LongTensor words: [batch_size, max_len] - :return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) - """ - words = self.drop_word(words) - outputs = self._get_sent_reprs(words) - if outputs is not None: - return self.dropout(words) - outputs = self.model(words) - outputs = torch.cat([*outputs], dim=-1) - - return self.dropout(words) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'word_pieces_lengths' not in name]) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'word_pieces_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - - -def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): - """ - 给定一个word的vocabulary生成character的vocabulary. - - :param vocab: 从vocab - :param min_freq: - :return: - """ - char_vocab = Vocabulary(min_freq=min_freq) - for word, index in vocab: - if not vocab._is_word_no_create_entry(word): - char_vocab.add_word_lst(list(word)) - return char_vocab - - -class CNNCharEmbedding(TokenEmbedding): - """ - 别名::class:`fastNLP.modules.CNNCharEmbedding` :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding` - - 使用CNN生成character embedding。CNN的结果为, embed(x) -> Dropout(x) -> CNN(x) -> activation(x) -> pool -> fc -> Dropout. - 不同的kernel大小的fitler结果是concat起来的。 - - Example:: - - >>> cnn_char_embed = CNNCharEmbedding(vocab) - - - :param vocab: 词表 - :param embed_size: 该word embedding的大小,默认值为50. 
- :param char_emb_size: character的embed的大小。character是从vocab中生成的。默认值为50. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param float dropout: 以多大的概率drop - :param filter_nums: filter的数量. 长度需要和kernels一致。默认值为[40, 30, 20]. - :param kernel_sizes: kernel的大小. 默认值为[5, 3, 1]. - :param pool_method: character的表示在合成一个表示时所使用的pool方法,支持'avg', 'max'. - :param activation: CNN之后使用的激活方法,支持'relu', 'sigmoid', 'tanh' 或者自定义函数. - :param min_char_freq: character的最少出现次数。默认值为2. - """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2): - super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - for kernel in kernel_sizes: - assert kernel % 2 == 1, "Only odd kernel is allowed." - - assert pool_method in ('max', 'avg') - self.dropout = nn.Dropout(dropout) - self.pool_method = pool_method - # activation function - if isinstance(activation, str): - if activation.lower() == 'relu': - self.activation = F.relu - elif activation.lower() == 'sigmoid': - self.activation = F.sigmoid - elif activation.lower() == 'tanh': - self.activation = F.tanh - elif activation is None: - self.activation = lambda x: x - elif callable(activation): - self.activation = activation - else: - raise Exception( - "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - - print("Start constructing character vocabulary.") - # 建立char的词表 - self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) - self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") - # 对vocab进行index - max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) - for word, index in vocab: - # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了。修改为不区分pad, 这样所有的也是同一个embed - self.words_to_chars_embedding[index, :len(word)] = \ - torch.LongTensor([self.char_vocab.to_index(c) for c in word]) - self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - - self.convs = nn.ModuleList([nn.Conv1d( - char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) - for i in range(len(kernel_sizes))]) - self._embed_size = embed_size - self.fc = nn.Linear(sum(filter_nums), embed_size) - self.init_param() - - def forward(self, words): - """ - 输入words的index后,生成对应的words的表示。 - - :param words: [batch_size, max_len] - :return: [batch_size, max_len, embed_size] - """ - words = self.drop_word(words) - batch_size, max_len = words.size() - chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len - max_word_len = word_lengths.max() - chars = chars[:, :, :max_word_len] - # 为1的地方为mask - chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 - chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size - chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) - reshaped_chars 
= reshaped_chars.transpose(1, 2) # B' x E x M - conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) - for conv in self.convs] - conv_chars = torch.cat(conv_chars, dim=-1).contiguous() # B x max_len x max_word_len x sum(filters) - conv_chars = self.activation(conv_chars) - if self.pool_method == 'max': - conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) - else: - conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - chars = self.fc(chars) - return self.dropout(chars) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - params = [] - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: - params.append(param.requires_grad) - requires_grads = set(params) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - - def init_param(self): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset - continue - if param.data.dim()>1: - nn.init.xavier_uniform_(param, 1) - else: - nn.init.uniform_(param, -1, 1) - -class LSTMCharEmbedding(TokenEmbedding): - """ - 别名::class:`fastNLP.modules.LSTMCharEmbedding` :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding` - - 使用LSTM的方式对character进行encode. embed(x) -> Dropout(x) -> LSTM(x) -> activation(x) -> pool - - Example:: - - >>> lstm_char_embed = LSTMCharEmbedding(vocab) - - :param vocab: 词表 - :param embed_size: embedding的大小。默认值为50. - :param char_emb_size: character的embedding的大小。默认值为50. - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。 - :param dropout: 以多大概率drop - :param hidden_size: LSTM的中间hidden的大小,如果为bidirectional的,hidden会除二,默认为50. - :param pool_method: 支持'max', 'avg' - :param activation: 激活函数,支持'relu', 'sigmoid', 'tanh', 或者自定义函数. - :param min_char_freq: character的最小出现次数。默认值为2. - :param bidirectional: 是否使用双向的LSTM进行encode。默认值为True。 - """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0.5, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True): - super(LSTMCharEmbedding, self).__init__(vocab) - - assert hidden_size % 2 == 0, "Only even kernel is allowed." 
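# Illustrative sketch (not part of the diff): the masked max-pool used above, on a toy tensor.
# Padded character positions are filled with -inf so they can never win the max.
import torch
conv_out = torch.tensor([[1.0, 5.0, 3.0, 0.7]])           # [batch*max_len, char_positions]
pad_mask = torch.tensor([[False, False, True, True]])     # True marks padded positions
pooled, _ = conv_out.masked_fill(pad_mask, float('-inf')).max(dim=-1)
# pooled == tensor([5.]) -- only the real character positions are considered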
- - assert pool_method in ('max', 'avg') - self.pool_method = pool_method - self.dropout = nn.Dropout(dropout) - # activation function - if isinstance(activation, str): - if activation.lower() == 'relu': - self.activation = F.relu - elif activation.lower() == 'sigmoid': - self.activation = F.sigmoid - elif activation.lower() == 'tanh': - self.activation = F.tanh - elif activation is None: - self.activation = lambda x: x - elif callable(activation): - self.activation = activation - else: - raise Exception( - "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - - print("Start constructing character vocabulary.") - # 建立char的词表 - self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) - self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") - # 对vocab进行index - self.max_word_len = max(map(lambda x: len(x[0]), vocab)) - self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), - fill_value=self.char_pad_index, dtype=torch.long), - requires_grad=False) - self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) - for word, index in vocab: - # if index!=vocab.padding_idx: # 如果是pad的话,直接就为pad_value了. 修改为不区分pad与否 - self.words_to_chars_embedding[index, :len(word)] = \ - torch.LongTensor([self.char_vocab.to_index(c) for c in word]) - self.word_lengths[index] = len(word) - self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - - self.fc = nn.Linear(hidden_size, embed_size) - hidden_size = hidden_size // 2 if bidirectional else hidden_size - - self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) - self._embed_size = embed_size - self.bidirectional = bidirectional - - def forward(self, words): - """ - 输入words的index后,生成对应的words的表示。 - - :param words: [batch_size, max_len] - :return: [batch_size, max_len, embed_size] - """ - words = self.drop_word(words) - batch_size, max_len = words.size() - chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len - max_word_len = word_lengths.max() - chars = chars[:, :, :max_word_len] - # 为mask的地方为1 - chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 - chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size - chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) - char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) - lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) - # B x M x M x H - - lstm_chars = self.activation(lstm_chars) - if self.pool_method == 'max': - lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(lstm_chars, dim=-2) # batch_size x max_len x H - else: - lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - - chars = self.fc(chars) - - return self.dropout(chars) - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - params = [] - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: - params.append(param) - 
requires_grads = set(params) - if len(requires_grads) == 1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for name, param in self.named_parameters(): - if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 - continue - param.requires_grad = value - - -class StackEmbedding(TokenEmbedding): - """ - 别名::class:`fastNLP.modules.StackEmbedding` :class:`fastNLP.modules.encoder.embedding.StackEmbedding` - - 支持将多个embedding集合成一个embedding。 - - Example:: - - >>> embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) - >>> embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) - - - :param embeds: 一个由若干个TokenEmbedding组成的list,要求每一个TokenEmbedding的词表都保持一致 - :param float word_dropout: 以多大的概率将一个词替换为unk。这样既可以训练unk也是一定的regularize。不同embedidng会在相同的位置 - 被设置为unknown。如果这里设置了dropout,则组成的embedding就不要再设置dropout了。 - :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 - - """ - def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): - vocabs = [] - for embed in embeds: - if hasattr(embed, 'get_word_vocab'): - vocabs.append(embed.get_word_vocab()) - _vocab = vocabs[0] - for vocab in vocabs[1:]: - assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." - - super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) - assert isinstance(embeds, list) - for embed in embeds: - assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." - self.embeds = nn.ModuleList(embeds) - self._embed_size = sum([embed.embed_size for embed in self.embeds]) - - def append(self, embed: TokenEmbedding): - """ - 添加一个embedding到结尾。 - :param embed: - :return: - """ - assert isinstance(embed, TokenEmbedding) - self.embeds.append(embed) - - def pop(self): - """ - 弹出最后一个embed - :return: - """ - return self.embeds.pop() - - @property - def embed_size(self): - return self._embed_size - - @property - def requires_grad(self): - """ - Embedding的参数是否允许优化。True: 所有参数运行优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 - :return: - """ - requires_grads = set([embed.requires_grad for embed in self.embeds()]) - if len(requires_grads)==1: - return requires_grads.pop() - else: - return None - - @requires_grad.setter - def requires_grad(self, value): - for embed in self.embeds(): - embed.requires_grad = value - - def forward(self, words): - """ - 得到多个embedding的结果,并把结果按照顺序concat起来。 - - :param words: batch_size x max_len - :return: 返回的shape和当前这个stack embedding中embedding的组成有关 - """ - outputs = [] - words = self.drop_word(words) - for embed in self.embeds: - outputs.append(embed(words)) - outputs = self.dropout(torch.cat(outputs, dim=-1)) - return outputs - diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 3c6a3d27..4a9e034d 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,6 +1,5 @@ from functools import reduce -import numpy as np import torch import torch.nn as nn import torch.nn.init as init @@ -70,33 +69,6 @@ def initial_parameter(net, initial_method=None): net.apply(weights_init) -def get_embeddings(init_embed): - """ - 根据输入的init_embed生成nn.Embedding对象。 - - :param init_embed: 可以是 tuple:(num_embedings, embedding_dim), 即embedding的大小和每个词的维度;也可以传入 - nn.Embedding 对象, 此时就以传入的对象作为embedding; 传入np.ndarray也行,将使用传入的ndarray作为作为Embedding初始 - 化; 传入orch.Tensor, 将使用传入的值作为Embedding初始化。 - :return nn.Embedding embeddings: - """ - if 
isinstance(init_embed, tuple): - res = nn.Embedding( - num_embeddings=init_embed[0], embedding_dim=init_embed[1]) - nn.init.uniform_(res.weight.data, a=-np.sqrt(3/res.weight.data.size(1)), - b=np.sqrt(3/res.weight.data.size(1))) - elif isinstance(init_embed, nn.Module): - res = init_embed - elif isinstance(init_embed, torch.Tensor): - res = nn.Embedding.from_pretrained(init_embed, freeze=False) - elif isinstance(init_embed, np.ndarray): - init_embed = torch.tensor(init_embed, dtype=torch.float32) - res = nn.Embedding.from_pretrained(init_embed, freeze=False) - else: - raise TypeError( - 'invalid init_embed type: {}'.format((type(init_embed)))) - return res - - def summary(model: nn.Module): """ 得到模型的总参数量 diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/README.md b/reproduction/LSTM+self_attention_sentiment_analysis/README.md index 2dff7caa..dfb337ec 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/README.md +++ b/reproduction/LSTM+self_attention_sentiment_analysis/README.md @@ -1,5 +1,7 @@ # Prototype +这是一个很旧版本的reproduction,待修改 + ## Word2Idx.py A mapping model between words and indexes diff --git a/reproduction/LSTM+self_attention_sentiment_analysis/main.py b/reproduction/LSTM+self_attention_sentiment_analysis/main.py index 871dc476..05077530 100644 --- a/reproduction/LSTM+self_attention_sentiment_analysis/main.py +++ b/reproduction/LSTM+self_attention_sentiment_analysis/main.py @@ -1,6 +1,9 @@ +# 这是一个很旧版本的代码 + +""" import torch.nn.functional as F -from fastNLP.core.trainer import ClassificationTrainer +from fastNLP.core.trainer import Trainer from fastNLP.core.utils import ClassPreprocess as Preprocess from fastNLP.io.config_io import ConfigLoader from fastNLP.io.config_io import ConfigSection @@ -8,7 +11,7 @@ from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loade from fastNLP.models.base_model import BaseModel from fastNLP.modules.aggregator.self_attention import SelfAttention from fastNLP.modules.decoder.mlp import MLP -from fastNLP.modules.encoder.embedding import Embedding as Embedding +from fastNLP.embeddings.embedding import Embedding as Embedding from fastNLP.modules.encoder.lstm import LSTM train_data_path = 'small_train_data.txt' @@ -61,12 +64,13 @@ class SELF_ATTENTION_YELP_CLASSIFICATION(BaseModel): train_args = ConfigSection() ConfigLoader("good path").load_config('config.cfg',{"train": train_args}) -train_args['vocab'] = len(word2index) +# train_args['vocab'] = len(word2index) -trainer = ClassificationTrainer(**train_args.data) +trainer = Trainer(**train_args.data) # for k in train_args.__dict__.keys(): # print(k, train_args[k]) model = SELF_ATTENTION_YELP_CLASSIFICATION(train_args) -trainer.train(model,train_data , dev_data) +trainer.train() +""" diff --git a/reproduction/Star_transformer/train.py b/reproduction/Star_transformer/train.py index f1e5c2f9..d8e2576b 100644 --- a/reproduction/Star_transformer/train.py +++ b/reproduction/Star_transformer/train.py @@ -1,7 +1,7 @@ -from util import get_argparser, set_gpu, set_rng_seeds, add_model_args +from reproduction.Star_transformer.util import get_argparser, set_gpu, set_rng_seeds, add_model_args seed = set_rng_seeds(15360) print('RNG SEED {}'.format(seed)) -from datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN +from reproduction.Star_transformer.datasets import load_seqtag, load_sst, load_snli, EmbedLoader, MAX_LEN import torch.nn as nn import torch import numpy as np diff --git a/reproduction/Summarization/BertSum/model.py 
diff --git a/reproduction/Summarization/BertSum/model.py b/reproduction/Summarization/BertSum/model.py
index 655ad16e..1ee821fc 100644
--- a/reproduction/Summarization/BertSum/model.py
+++ b/reproduction/Summarization/BertSum/model.py
@@ -2,7 +2,7 @@
 import torch
 from torch import nn
 from torch.nn import init
-from fastNLP.modules.encoder._bert import BertModel
+from fastNLP.modules.encoder.bert import BertModel


 class Classifier(nn.Module):
diff --git a/reproduction/joint_cws_parse/readme.md b/reproduction/joint_cws_parse/README.md
similarity index 100%
rename from reproduction/joint_cws_parse/readme.md
rename to reproduction/joint_cws_parse/README.md
diff --git a/reproduction/joint_cws_parse/models/CharParser.py b/reproduction/joint_cws_parse/models/CharParser.py
index 1ed5ea2d..c07c070e 100644
--- a/reproduction/joint_cws_parse/models/CharParser.py
+++ b/reproduction/joint_cws_parse/models/CharParser.py
@@ -12,7 +12,7 @@ from torch.nn import functional as F
 from fastNLP.modules.dropout import TimestepDropout
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from fastNLP import seq_len_to_mask
-from fastNLP.modules import Embedding
+from fastNLP.embeddings import Embedding


 def drop_input_independent(word_embeddings, dropout_emb):
diff --git a/reproduction/joint_cws_parse/train.py b/reproduction/joint_cws_parse/train.py
index 2f8b0d04..0c34614b 100644
--- a/reproduction/joint_cws_parse/train.py
+++ b/reproduction/joint_cws_parse/train.py
@@ -2,15 +2,15 @@ import sys
 sys.path.append('../..')

 from reproduction.joint_cws_parse.data.data_loader import CTBxJointLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.embeddings.static_embedding import StaticEmbedding
 from torch import nn
 from functools import partial
 from reproduction.joint_cws_parse.models.CharParser import CharParser
 from reproduction.joint_cws_parse.models.metrics import SegAppCharParseF1Metric, CWSMetric
-from fastNLP import cache_results, BucketSampler, Trainer
+from fastNLP import BucketSampler, Trainer
 from torch import optim
-from reproduction.joint_cws_parse.models.callbacks import DevCallback, OptimizerCallback
-from torch.optim.lr_scheduler import LambdaLR, StepLR
+from reproduction.joint_cws_parse.models.callbacks import DevCallback
+from torch.optim.lr_scheduler import StepLR
 from fastNLP import Tester
 from fastNLP import GradientClipCallback, LRScheduler
 import os
diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py
index 67fa4c8d..bba26a8a 100644
--- a/reproduction/matching/data/MatchingDataLoader.py
+++ b/reproduction/matching/data/MatchingDataLoader.py
@@ -1,3 +1,7 @@
+"""
+这个文件的内容已合并到fastNLP.io.data_loader里,这个文件的内容不再更新
+"""
+
 import os
diff --git a/reproduction/matching/matching_bert.py b/reproduction/matching/matching_bert.py
index 75112d5a..3ed75fd1 100644
--- a/reproduction/matching/matching_bert.py
+++ b/reproduction/matching/matching_bert.py
@@ -3,9 +3,8 @@ import numpy as np
 import torch

 from fastNLP.core import Trainer, Tester, AccuracyMetric, Const, Adam
+from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader

-from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \
-    MNLILoader, QNLILoader, QuoraLoader
 from reproduction.matching.model.bert import BertForNLI
diff --git a/reproduction/matching/matching_cntn.py b/reproduction/matching/matching_cntn.py
index d813164d..098f3bc4 100644
--- a/reproduction/matching/matching_cntn.py
+++ b/reproduction/matching/matching_cntn.py
@@ -1,11 +1,10 @@
 import argparse
 import torch
-import os

 from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.embeddings import StaticEmbedding
+from fastNLP.io.data_loader import QNLILoader, RTELoader, SNLILoader, MNLILoader

-from reproduction.matching.data.MatchingDataLoader import QNLILoader, RTELoader, SNLILoader, MNLILoader
 from reproduction.matching.model.cntn import CNTNModel

 # define hyper-parameters
diff --git a/reproduction/matching/matching_esim.py b/reproduction/matching/matching_esim.py
index d878608f..2ff6916a 100644
--- a/reproduction/matching/matching_esim.py
+++ b/reproduction/matching/matching_esim.py
@@ -7,11 +7,10 @@ from torch.optim.lr_scheduler import StepLR

 from fastNLP.core import Trainer, Tester, AccuracyMetric, Const
 from fastNLP.core.callback import GradientClipCallback, LRScheduler
-from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding
-
-from reproduction.matching.data.MatchingDataLoader import SNLILoader, RTELoader, \
-    MNLILoader, QNLILoader, QuoraLoader
-from reproduction.matching.model.esim import ESIMModel
+from fastNLP.embeddings.static_embedding import StaticEmbedding
+from fastNLP.embeddings.elmo_embedding import ElmoEmbedding
+from fastNLP.io.data_loader import SNLILoader, RTELoader, MNLILoader, QNLILoader, QuoraLoader
+from fastNLP.models.snli import ESIM


 # define hyper-parameters
@@ -81,7 +80,7 @@ else:
     raise RuntimeError(f'NOT support {arg.embedding} embedding yet!')

 # define model
-model = ESIMModel(embedding, num_labels=len(data_info.vocabs[Const.TARGET]))
+model = ESIM(embedding, num_labels=len(data_info.vocabs[Const.TARGET]))

 # define optimizer and callback
 optimizer = Adamax(lr=arg.lr, params=model.parameters())
diff --git a/reproduction/matching/matching_mwan.py b/reproduction/matching/matching_mwan.py
index e96ee0c9..31af54c5 100644
--- a/reproduction/matching/matching_mwan.py
+++ b/reproduction/matching/matching_mwan.py
@@ -1,23 +1,17 @@
-import sys
-
-import os
 import random

 import numpy as np
 import torch
-from torch.optim import Adadelta, SGD
+from torch.optim import Adadelta
 from torch.optim.lr_scheduler import StepLR
-from tqdm import tqdm
-
 from fastNLP import CrossEntropyLoss
 from fastNLP import cache_results
-from fastNLP.core import Trainer, Tester, Adam, AccuracyMetric, Const
-from fastNLP.core.predictor import Predictor
-from fastNLP.core.callback import GradientClipCallback, LRScheduler, FitlogCallback
-from fastNLP.modules.encoder.embedding import ElmoEmbedding, StaticEmbedding
+from fastNLP.core import Trainer, Tester, AccuracyMetric, Const
+from fastNLP.core.callback import LRScheduler, FitlogCallback
+from fastNLP.embeddings import StaticEmbedding

-from fastNLP.io.data_loader import MNLILoader, QNLILoader, QuoraLoader, SNLILoader, RTELoader
+from fastNLP.io.data_loader import MNLILoader, QNLILoader, SNLILoader, RTELoader
 from reproduction.matching.model.mwan import MwanModel

 import fitlog
diff --git a/reproduction/matching/model/bert.py b/reproduction/matching/model/bert.py
index 9b3a78b2..a21f8c36 100644
--- a/reproduction/matching/model/bert.py
+++ b/reproduction/matching/model/bert.py
@@ -4,7 +4,7 @@ import torch.nn as nn

 from fastNLP.core.const import Const
 from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.bert import BertModel
+from fastNLP.embeddings.bert import BertModel


 class BertForNLI(BaseModel):
diff --git a/reproduction/matching/model/cntn.py b/reproduction/matching/model/cntn.py
index 0b4803fa..a0a104a3 100644
--- a/reproduction/matching/model/cntn.py
+++ b/reproduction/matching/model/cntn.py
@@ -6,7 +6,7 @@ import numpy as np
 from torch.nn import CrossEntropyLoss

 from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.embedding import TokenEmbedding
+from fastNLP.embeddings.embedding import TokenEmbedding
 from fastNLP.core.const import Const
diff --git a/reproduction/matching/model/esim.py b/reproduction/matching/model/esim.py
index 187e565d..87e5ba65 100644
--- a/reproduction/matching/model/esim.py
+++ b/reproduction/matching/model/esim.py
@@ -5,8 +5,7 @@ import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss

 from fastNLP.models import BaseModel
-from fastNLP.modules.encoder.embedding import TokenEmbedding
-from fastNLP.modules.encoder.lstm import LSTM
+from fastNLP.embeddings.embedding import TokenEmbedding
 from fastNLP.core.const import Const
 from fastNLP.core.utils import seq_len_to_mask
diff --git a/reproduction/seqence_labelling/cws/model/model.py b/reproduction/seqence_labelling/cws/model/model.py
index bdd9002d..de945ac3 100644
--- a/reproduction/seqence_labelling/cws/model/model.py
+++ b/reproduction/seqence_labelling/cws/model/model.py
@@ -1,6 +1,6 @@
 from torch import nn
 import torch
-from fastNLP.modules import Embedding
+from fastNLP.embeddings import Embedding
 import numpy as np
 from reproduction.seqence_labelling.cws.model.module import FeatureFunMax, SemiCRFShiftRelay
 from fastNLP.modules import LSTM
diff --git a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
index e9d18048..caa0247a 100644
--- a/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
+++ b/reproduction/seqence_labelling/ner/train_cnn_lstm_crf_conll2003.py
@@ -1,7 +1,7 @@
 import sys
 sys.path.append('../../..')

-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding, BertEmbedding, ElmoEmbedding, StackEmbedding
+from fastNLP.embeddings.embedding import CNNCharEmbedding, StaticEmbedding
 from fastNLP.core.vocabulary import VocabularyOption

 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
@@ -9,13 +9,11 @@ from fastNLP import Trainer
 from fastNLP import SpanFPreRecMetric
 from fastNLP import BucketSampler
 from fastNLP import Const
-from torch.optim import SGD, Adam
+from torch.optim import SGD
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
 from torch.optim.lr_scheduler import LambdaLR
-from fastNLP.core.optimizer import AdamW
 # from reproduction.seqence_labelling.ner.model.swats import SWATS
-from reproduction.seqence_labelling.chinese_ner.callbacks import SaveModelCallback
 from fastNLP import cache_results

 import fitlog
diff --git a/reproduction/seqence_labelling/ner/train_idcnn.py b/reproduction/seqence_labelling/ner/train_idcnn.py
index a21499ab..53f2798f 100644
--- a/reproduction/seqence_labelling/ner/train_idcnn.py
+++ b/reproduction/seqence_labelling/ner/train_idcnn.py
@@ -1,21 +1,18 @@
 from reproduction.seqence_labelling.ner.data.OntoNoteLoader import OntoNoteNERDataLoader
-from reproduction.seqence_labelling.ner.data.Conll2003Loader import Conll2003DataLoader
-from fastNLP.core.callback import FitlogCallback, LRScheduler
+from fastNLP.core.callback import LRScheduler
 from fastNLP import GradientClipCallback
-from torch.optim.lr_scheduler import LambdaLR, CosineAnnealingLR
-from torch.optim import SGD, Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.optim import Adam
 from fastNLP import Const
-from fastNLP import RandomSampler, BucketSampler
+from fastNLP import BucketSampler
 from fastNLP import SpanFPreRecMetric
 from fastNLP import Trainer, Tester
 from fastNLP.core.metrics import MetricBase
 from reproduction.seqence_labelling.ner.model.dilated_cnn import IDCNN
 from fastNLP.core.utils import Option
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
+from fastNLP.embeddings.embedding import StaticEmbedding
 from fastNLP.core.utils import cache_results
 from fastNLP.core.vocabulary import VocabularyOption
-import fitlog
-import sys
 import torch.cuda
 import os
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
diff --git a/reproduction/seqence_labelling/ner/train_ontonote.py b/reproduction/seqence_labelling/ner/train_ontonote.py
index 6548cb9f..01fcd032 100644
--- a/reproduction/seqence_labelling/ner/train_ontonote.py
+++ b/reproduction/seqence_labelling/ner/train_ontonote.py
@@ -2,18 +2,17 @@ import sys
 sys.path.append('../../..')

-from fastNLP.modules.encoder.embedding import CNNCharEmbedding, StaticEmbedding
+from fastNLP.embeddings.embedding import CNNCharEmbedding
 from reproduction.seqence_labelling.ner.model.lstm_cnn_crf import CNNBiLSTMCRF
 from fastNLP import Trainer
 from fastNLP import SpanFPreRecMetric
 from fastNLP import BucketSampler
 from fastNLP import Const
-from torch.optim import SGD, Adam
+from torch.optim import SGD
 from torch.optim.lr_scheduler import LambdaLR
 from fastNLP import GradientClipCallback
 from fastNLP.core.callback import FitlogCallback, LRScheduler
-from reproduction.seqence_labelling.ner.model.swats import SWATS

 import fitlog
 fitlog.debug()
diff --git a/reproduction/text_classification/model/HAN.py b/reproduction/text_classification/model/HAN.py
index 0902d1e4..7ebbe30f 100644
--- a/reproduction/text_classification/model/HAN.py
+++ b/reproduction/text_classification/model/HAN.py
@@ -1,7 +1,7 @@
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
-from fastNLP.modules.utils import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
 from fastNLP.core import Const as C
diff --git a/reproduction/text_classification/model/dpcnn.py b/reproduction/text_classification/model/dpcnn.py
index dafe62bc..ae2d46bd 100644
--- a/reproduction/text_classification/model/dpcnn.py
+++ b/reproduction/text_classification/model/dpcnn.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-from fastNLP.modules.utils import get_embeddings
+from fastNLP.embeddings.utils import get_embeddings
 from fastNLP.core import Const as C
diff --git a/reproduction/text_classification/train_HAN.py b/reproduction/text_classification/train_HAN.py
index b1135342..a8b06146 100644
--- a/reproduction/text_classification/train_HAN.py
+++ b/reproduction/text_classification/train_HAN.py
@@ -9,11 +9,9 @@ os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
 from fastNLP.core.const import Const as C
 from fastNLP.core import LRScheduler
-import torch.nn as nn
-from fastNLP.io.dataset_loader import SSTLoader
-from reproduction.text_classification.data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
 from reproduction.text_classification.model.HAN import HANCLS
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
+from fastNLP.embeddings import StaticEmbedding
 from fastNLP import CrossEntropyLoss, AccuracyMetric
 from fastNLP.core.trainer import Trainer
 from torch.optim import SGD
@@ -44,7 +42,7 @@ ops = Config()

 ##1.task相关信息:利用dataloader载入dataInfo
-datainfo = yelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])
+datainfo = YelpLoader(fine_grained=True).process(paths=ops.datapath, train_ds=['train'])

 print(len(datainfo.datasets['train']))
 print(len(datainfo.datasets['test']))
diff --git a/reproduction/text_classification/train_awdlstm.py b/reproduction/text_classification/train_awdlstm.py
index 007b2910..b0f2af49 100644
--- a/reproduction/text_classification/train_awdlstm.py
+++ b/reproduction/text_classification/train_awdlstm.py
@@ -5,20 +5,13 @@ import os
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
 os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-
-import torch.nn as nn
-
 from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.embeddings import StaticEmbedding
 from model.awd_lstm import AWDLSTMSentiment
-from fastNLP.core.const import Const as C
 from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
 from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-
-import argparse


 class Config():
diff --git a/reproduction/text_classification/train_char_cnn.py b/reproduction/text_classification/train_char_cnn.py
index e4bb9220..0b8fc535 100644
--- a/reproduction/text_classification/train_char_cnn.py
+++ b/reproduction/text_classification/train_char_cnn.py
@@ -7,23 +7,17 @@ import sys
 sys.path.append('../..')
 from fastNLP.core.const import Const as C
 import torch.nn as nn
-from data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
 #from data.sstLoader import sst2Loader
-from fastNLP.io.data_loader.sst import SST2Loader
-from data.IMDBLoader import IMDBLoader
 from model.char_cnn import CharacterLevelCNN
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.models.cnn_text_classification import CNNText
-from fastNLP.modules.encoder.embedding import CNNCharEmbedding,StaticEmbedding,StackEmbedding,LSTMCharEmbedding
 from fastNLP import CrossEntropyLoss, AccuracyMetric
 from fastNLP.core.trainer import Trainer
 from torch.optim import SGD
 from torch.autograd import Variable
 import torch
-from fastNLP import BucketSampler
-from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.optim.lr_scheduler import LambdaLR
 from fastNLP.core import LRScheduler
-from utils.util_init import set_rng_seeds
+

 ##hyper
 #todo 这里加入fastnlp的记录
@@ -117,7 +111,7 @@ ops=Config
 ##1.task相关信息:利用dataloader载入dataInfo
 #dataloader=SST2Loader()
 #dataloader=IMDBLoader()
-dataloader=yelpLoader(fine_grained=True)
+dataloader=YelpLoader(fine_grained=True)
 datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False)
 char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"]
 ops.number_of_characters=len(char_vocab)
diff --git a/reproduction/text_classification/train_dpcnn.py b/reproduction/text_classification/train_dpcnn.py
index 70570970..6cce453b 100644
--- a/reproduction/text_classification/train_dpcnn.py
+++ b/reproduction/text_classification/train_dpcnn.py
@@ -3,15 +3,14 @@ import torch.cuda
 from fastNLP.core.utils import cache_results
 from torch.optim import SGD
-from torch.optim.lr_scheduler import CosineAnnealingLR, LambdaLR
+from torch.optim.lr_scheduler import CosineAnnealingLR
 from fastNLP.core.trainer import Trainer
 from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP.modules.encoder.embedding import StaticEmbedding, CNNCharEmbedding, StackEmbedding
+from fastNLP.embeddings import StaticEmbedding
 from reproduction.text_classification.model.dpcnn import DPCNN
-from data.yelpLoader import yelpLoader
+from fastNLP.io.data_loader import YelpLoader
 from fastNLP.core.sampler import BucketSampler
-import torch.nn as nn
-from fastNLP.core import LRScheduler, Callback
+from fastNLP.core import LRScheduler
 from fastNLP.core.const import Const as C
 from fastNLP.core.vocabulary import VocabularyOption
 from utils.util_init import set_rng_seeds
@@ -59,7 +58,7 @@ print('RNG SEED: {}'.format(ops.seed))

 @cache_results(ops.model_dir_or_name+'-data-cache')
 def load_data():
-    datainfo = yelpLoader(fine_grained=True, lower=True).process(
+    datainfo = YelpLoader(fine_grained=True, lower=True).process(
         paths=ops.datapath, train_ds=['train'], src_vocab_op=ops.src_vocab_op)
     for ds in datainfo.datasets.values():
         ds.apply_field(len, C.INPUT, C.INPUT_LEN)
diff --git a/reproduction/text_classification/train_lstm.py b/reproduction/text_classification/train_lstm.py
index 4ecc61a1..40f77061 100644
--- a/reproduction/text_classification/train_lstm.py
+++ b/reproduction/text_classification/train_lstm.py
@@ -3,20 +3,13 @@ import os
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
 os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-
-import torch.nn as nn
-
-from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.io.data_loader import IMDBLoader
+from fastNLP.embeddings import StaticEmbedding
 from model.lstm import BiLSTMSentiment
-from fastNLP.core.const import Const as C
 from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
 from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-
-import argparse


 class Config():
diff --git a/reproduction/text_classification/train_lstm_att.py b/reproduction/text_classification/train_lstm_att.py
index a6f0dd03..1052f606 100644
--- a/reproduction/text_classification/train_lstm_att.py
+++ b/reproduction/text_classification/train_lstm_att.py
@@ -3,20 +3,13 @@ import os
 os.environ['FASTNLP_BASE_URL'] = 'http://10.141.222.118:8888/file/download/'
 os.environ['FASTNLP_CACHE_DIR'] = '/remote-home/hyan01/fastnlp_caches'
-
-import torch.nn as nn
-
-from data.IMDBLoader import IMDBLoader
-from fastNLP.modules.encoder.embedding import StaticEmbedding
+from fastNLP.io.data_loader import IMDBLoader
+from fastNLP.embeddings import StaticEmbedding
 from model.lstm_self_attention import BiLSTM_SELF_ATTENTION
-from fastNLP.core.const import Const as C
 from fastNLP import CrossEntropyLoss, AccuracyMetric
-from fastNLP import Trainer, Tester
+from fastNLP import Trainer
 from torch.optim import Adam
-from fastNLP.io.model_io import ModelLoader, ModelSaver
-
-import argparse


 class Config():
diff --git a/test/embeddings/test_char_embedding.py b/test/embeddings/test_char_embedding.py
new file mode 100644
index 00000000..ceafe4f5
--- /dev/null
+++ b/test/embeddings/test_char_embedding.py
@@ -0,0 +1,26 @@
+import unittest
+
+import torch
+
+from fastNLP import Vocabulary, DataSet, Instance
+from fastNLP.embeddings.char_embedding import LSTMCharEmbedding, CNNCharEmbedding
+
+
+class TestCharEmbed(unittest.TestCase):
+    def test_case_1(self):
+        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
+        vocab = Vocabulary().from_dataset(ds, field_name='words')
+        self.assertEqual(len(vocab), 5)
+        embed = LSTMCharEmbedding(vocab, embed_size=60)
+        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
+        y = embed(x)
+        self.assertEqual(tuple(y.size()), (2, 3, 60))
+
+    def test_case_2(self):
+        ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
+        vocab = Vocabulary().from_dataset(ds, field_name='words')
+        self.assertEqual(len(vocab), 5)
+        embed = CNNCharEmbedding(vocab, embed_size=60)
+        x = torch.LongTensor([[2, 1, 0], [4, 3, 4]])
+        y = embed(x)
+        self.assertEqual(tuple(y.size()), (2, 3, 60))
diff --git a/test/modules/encoder/test_bert.py b/test/modules/encoder/test_bert.py
index 2a799478..0fcf01e4 100644
--- a/test/modules/encoder/test_bert.py
+++ b/test/modules/encoder/test_bert.py
@@ -8,7 +8,7 @@ from fastNLP.models.bert import BertModel

 class TestBert(unittest.TestCase):
     def test_bert_1(self):
-        from fastNLP.modules.encoder._bert import BertConfig
+        from fastNLP.modules.encoder.bert import BertConfig

         config = BertConfig(32000)
         model = BertModel(config)
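For reference, the character-embedding API exercised by the new test above can also be run as a standalone script. This is a minimal sketch that reuses only the classes and arguments appearing in the added test; the toy data and printed shape are illustrative:

    import torch
    from fastNLP import Vocabulary, DataSet, Instance
    from fastNLP.embeddings.char_embedding import CNNCharEmbedding

    # build a small word vocabulary, exactly as the new test does
    ds = DataSet([Instance(words=['hello', 'world']), Instance(words=['Jack'])])
    vocab = Vocabulary().from_dataset(ds, field_name='words')

    # CNN over the characters of each word, projected to embed_size dimensions
    embed = CNNCharEmbedding(vocab, embed_size=60)
    words = torch.LongTensor([[2, 1, 0], [4, 3, 4]])  # [batch_size, max_len] word indices
    print(embed(words).size())                        # torch.Size([2, 3, 60])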