diff --git a/docs/source/user/quickstart.rst b/docs/source/user/quickstart.rst
index 43056a26..12e541b7 100644
--- a/docs/source/user/quickstart.rst
+++ b/docs/source/user/quickstart.rst
@@ -49,7 +49,7 @@
 .. code-block:: python
 
     from fastNLP.models import CNNText
-    model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1)
+    model = CNNText((len(vocab),50), num_classes=5, dropout=0.1)
 
 :class:`~fastNLP.models.CNNText` 的网络结构如下::
 
@@ -121,4 +121,4 @@
     In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8
     Reloaded the best model.
 
-这份教程只是简单地介绍了使用 fastNLP 工作的流程，具体的细节分析见 :doc:`/user/tutorial_one`
\ No newline at end of file
+这份教程只是简单地介绍了使用 fastNLP 工作的流程，具体的细节分析见 :doc:`/user/tutorial_one`
diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py
index ab020ce4..b011d15a 100644
--- a/fastNLP/core/dataset.py
+++ b/fastNLP/core/dataset.py
@@ -554,6 +554,7 @@ class DataSet(object):
             self.field_arrays[new_name].name = new_name
         else:
             raise KeyError("DataSet has no field named {}.".format(old_name))
+        return self
     
     def set_target(self, *field_names, flag=True):
         """
@@ -593,7 +594,7 @@ class DataSet(object):
                 try:
                     self.field_arrays[name].is_input = flag
                 except SetInputOrTargetException as e:
-                    print(f"Cannot set field:{name} as input.")
+                    print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.")
                     raise e
             else:
                 raise KeyError("{} is not a valid field name.".format(name))
@@ -761,7 +762,20 @@ class DataSet(object):
             self._add_apply_field(results, new_field_name, kwargs)
         
         return results
-    
+
+    def add_seq_len(self, field_name:str, new_field_name='seq_len'):
+        """
+        Apply len() to every element of ``field_name`` and store the results as sequence lengths in a new field.
+
+        :param field_name: str. Name of the existing field whose elements are measured.
+        :param new_field_name: str. Name of the field that receives the lengths.
+        :return: self. Raises KeyError when ``field_name`` is not a field of this DataSet.
+        """
+        if not self.has_field(field_name=field_name):
+            raise KeyError(f"Field:{field_name} not found.")
+        self.apply_field(len, field_name, new_field_name=new_field_name)
+        return self
+
     def drop(self, func, inplace=True):
         """
         func接受一个Instance，返回bool值。返回值为True时，该Instance会被移除或者加入到返回的DataSet中。
diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py
index c47771df..faa306f3 100644
--- a/fastNLP/core/field.py
+++ b/fastNLP/core/field.py
@@ -6,6 +6,7 @@ import numpy as np
 from typing import Any
 from abc import abstractmethod
 from copy import deepcopy
+from collections import Counter
 
 class SetInputOrTargetException(Exception):
     def __init__(self, msg, index=None, field_name=None):
@@ -61,6 +62,7 @@ class FieldArray:
         if value:
             self._cell_ndim = None
             self.dtype = None
+        self._ignore_type = value
 
     @property
     def is_input(self):
@@ -223,6 +225,155 @@ class FieldArray:
 
         return self
 
+    def split(self, sep:str=None, inplace:bool=True):
+        """
+        Call ``.split(sep)`` on every cell of this field; only meaningful when each cell is a str.
+
+        :param sep: separator handed to ``str.split``; None splits on runs of whitespace.
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[List[str]] or self. A failing cell re-raises after its index is printed, so no cell is dropped.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                new_contents.append(cell.split(sep))
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+        return self._after_process(new_contents, inplace=inplace)
+
+    def int(self, inplace:bool=True):
+        """
+        Convert the cells of this field with int(). Two layouts are supported: (1) ['1', '2', ...], one value per
+            cell, and (2) [['1', '2', ..], ['3', ..], ...], where each cell is a list whose items are converted in turn.
+
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[int], List[List[int]], self. A failing cell re-raises after its index is printed.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                if isinstance(cell, list):
+                    new_contents.append([int(value) for value in cell])
+                else:
+                    new_contents.append(int(cell))
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+        return self._after_process(new_contents, inplace=inplace)
+
+    def float(self, inplace=True):
+        """
+        Convert the cells of this field with float(). Two layouts are supported: (1) ['1', '2', ...], one value per
+            cell, and (2) [['1', '2', ..], ['3', ..], ...], where each cell is a list whose items are converted in turn.
+
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[float], List[List[float]], self. A failing cell re-raises after its index is printed.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                if isinstance(cell, list):
+                    new_contents.append([float(value) for value in cell])
+                else:
+                    new_contents.append(float(cell))
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+        return self._after_process(new_contents, inplace=inplace)
+
+    def bool(self, inplace=True):
+        """
+        Convert the cells of this field with bool(); a cell is either a single value or a list whose items are
+            converted in turn. Note that bool() of any non-empty str is True, so '0' and 'False' both become True.
+
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[bool], List[List[bool]], self. A failing cell re-raises after its index is printed.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                if isinstance(cell, list):
+                    new_contents.append([bool(value) for value in cell])
+                else:
+                    new_contents.append(bool(cell))
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+
+        return self._after_process(new_contents, inplace=inplace)
+
+    def lower(self, inplace=True):
+        """
+        Call ``.lower()`` on the cells of this field. Two layouts are supported: (1) ['A', 'b', ...], one str per
+            cell, and (2) [['A', 'b', ..], ['C', ..], ...], where each cell is a list whose items are lowered in turn.
+
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[str], List[List[str]], self. A failing cell re-raises after its index is printed.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                if isinstance(cell, list):
+                    new_contents.append([value.lower() for value in cell])
+                else:
+                    new_contents.append(cell.lower())
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+        return self._after_process(new_contents, inplace=inplace)
+
+    def upper(self, inplace=True):
+        """
+        Call ``.upper()`` on the cells of this field. Two layouts are supported: (1) ['a', 'B', ...], one str per
+            cell, and (2) [['a', 'B', ..], ['c', ..], ...], where each cell is a list whose items are uppered in turn.
+
+        :param inplace: if True, replace this field's content and return self; otherwise return the new list.
+        :return: List[str], List[List[str]], self. A failing cell re-raises after its index is printed.
+        """
+        new_contents = []
+        for index, cell in enumerate(self.content):
+            try:
+                if isinstance(cell, list):
+                    new_contents.append([value.upper() for value in cell])
+                else:
+                    new_contents.append(cell.upper())
+            except Exception as e:
+                print(f"Exception happens when process value in index {index}.")
+                raise e
+        return self._after_process(new_contents, inplace=inplace)
+
+    def value_count(self):
+        """
+        Tally how often each distinct value occurs in this field; mostly used to count labels.
+
+        :return: Counter, mapping each value to its number of occurrences.
+        """
+        # Counter counts every item of the iterator as one key, so cells have to be hashable.
+        # Going through iter() keeps the "one cell, one key" reading whatever container holds the cells.
+        tally = Counter(iter(self.content))
+        return tally
+
+    def _after_process(self, new_contents, inplace):
+        """
+        Decide what to do with the values produced by a processing method such as split() or int().
+
+        :param new_contents: list holding the freshly processed cells.
+        :param inplace: if True, install new_contents as self.content; otherwise leave the field untouched.
+        :return: self when inplace is True, else new_contents. Re-raises SetInputOrTargetException.
+        """
+        if not inplace:
+            return new_contents
+        self.content = new_contents
+        try:
+            # Assign each flag to itself; presumably this re-runs the property setters against the new content.
+            self.is_input = self.is_input
+            self.is_target = self.is_target
+        except SetInputOrTargetException as e:
+            print("The newly generated field cannot be set as input or target.")
+            raise e
+        return self
+
 
 def _get_ele_type_and_dim(cell:Any, dim=0):
     """
@@ -242,6 +393,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
         dims = set([j for i,j in res])
         if len(types)>1:
             raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types)))
+        elif len(types)==0:
+            raise SetInputOrTargetException("Empty value encountered.")
         if len(dims)>1:
             raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims)))
         return types.pop(), dims.pop()
@@ -257,6 +410,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0):
         dims = set([j for i,j in res])
         if len(types)>1:
             raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types)))
+        elif len(types)==0:
+            raise SetInputOrTargetException("Empty value encountered.")
         if len(dims)>1:
             raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims)))
         return types.pop(), dims.pop()
diff --git a/fastNLP/core/metrics.py b/fastNLP/core/metrics.py
index 868d67b1..19c33c86 100644
--- a/fastNLP/core/metrics.py
+++ b/fastNLP/core/metrics.py
@@ -22,7 +22,7 @@ from .utils import _check_arg_dict_list
 from .utils import _get_func_signature
 from .utils import seq_len_to_mask
 from .vocabulary import Vocabulary
-
+from abc import abstractmethod
 
 class MetricBase(object):
     """
@@ -117,10 +117,12 @@ class MetricBase(object):
     def __init__(self):
         self.param_map = {}  # key is param in function, value is input param.
         self._checked = False
-    
+
+    @abstractmethod
     def evaluate(self, *args, **kwargs):
         raise NotImplementedError
-    
+
+    @abstractmethod
     def get_metric(self, reset=True):
-        raise NotImplemented
+        raise NotImplementedError  # NotImplemented is a comparison sentinel, not an exception; raising it is a TypeError
     
diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py
index 9dab47b5..1eb2b70e 100644
--- a/fastNLP/core/utils.py
+++ b/fastNLP/core/utils.py
@@ -285,6 +285,7 @@ def _get_model_device(model):
     :param model: nn.Module
     :return: torch.device,None 如果返回值为None，说明这个模型没有任何参数。
     """
+    # TODO 这个函数存在一定的风险，因为同一个模型可能存在某些parameter不在显卡中，比如BertEmbedding
     assert isinstance(model, nn.Module)
     
     parameters = list(model.parameters())
@@ -295,6 +296,13 @@ def _get_model_device(model):
 
 
 def _build_args(func, **kwargs):
+    """
+    根据func的初始化参数，从kwargs中选择func需要的参数
+
+    :param func: callable
+    :param kwargs: 参数
+    :return:dict. func中用到的参数
+    """
     spect = inspect.getfullargspec(func)
     if spect.varkw is not None:
         return kwargs
diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py
index bca28e10..1d5d6f32 100644
--- a/fastNLP/core/vocabulary.py
+++ b/fastNLP/core/vocabulary.py
@@ -148,7 +148,7 @@ class Vocabulary(object):
         self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)})
         self.build_reverse_vocab()
         self.rebuild = False
-    
+
     def build_reverse_vocab(self):
         """
         基于 "word to index" dict, 构建 "index to word" dict.
@@ -359,5 +359,7 @@ class Vocabulary(object):
     def __repr__(self):
         return "Vocabulary({}...)".format(list(self.word_count.keys())[:5])
     
+    @_check_build_vocab
     def __iter__(self):
-        return iter(list(self.word_count.keys()))
+        # Delegate to the dict view: each step yields one (word, index) pair of word2idx.
+        yield from self.word2idx.items()
diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py
index 4119d93f..5237a8a7 100644
--- a/fastNLP/io/embed_loader.py
+++ b/fastNLP/io/embed_loader.py
@@ -26,6 +26,7 @@ class EmbeddingOption(Option):
             error=error
         )
 
+
 class EmbedLoader(BaseLoader):
     """
     别名：:class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader`
@@ -35,9 +36,9 @@ class EmbedLoader(BaseLoader):
     
     def __init__(self):
         super(EmbedLoader, self).__init__()
-    
+
     @staticmethod
-    def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'):
+    def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, error='ignore'):
         """
         从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是
         word2vec(第一行只有两个元素)还是glove格式的数据。
@@ -46,6 +47,8 @@ class EmbedLoader(BaseLoader):
         :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型，读取出现在vocab中的词的embedding。
             没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来，以使得整个Embedding是同分布的。
         :param dtype: 读出的embedding的类型
+        :param str padding: 词表中padding的token
+        :param str unknown: 词表中unknown的token
         :param bool normalize: 是否将每个vector归一化到norm为1
         :param str error: `ignore` , `strict` ; 如果 `ignore` ，错误将自动跳过; 如果 `strict` , 错误将抛出。
             这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。
@@ -69,8 +72,14 @@ class EmbedLoader(BaseLoader):
             for idx, line in enumerate(f, start_idx):
                 try:
                     parts = line.strip().split()
-                    if parts[0] in vocab:
-                        index = vocab.to_index(parts[0])
+                    word = parts[0]
+                    # 对齐unk与pad
+                    if word==padding and vocab.padding is not None:
+                        word = vocab.padding
+                    elif word==unknown and vocab.unknown is not None:
+                        word = vocab.unknown
+                    if word in vocab:
+                        index = vocab.to_index(word)
                         matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
                         hit_flags[index] = True
                 except Exception as e:
@@ -102,8 +111,8 @@ class EmbedLoader(BaseLoader):
 
         :param str embed_filepath: 预训练的embedding的路径。
         :param dtype: 读出的embedding的类型
-        :param str padding: the padding tag for vocabulary.
-        :param str unknown: the unknown tag for vocabulary.
+        :param str padding: 词表中的padding的token. 并以此用做vocab的padding。
+        :param str unknown: 词表中的unknown的token. 并以此用做vocab的unknown。
         :param bool normalize: 是否将每个vector归一化到norm为1
         :param str error: `ignore` , `strict` ; 如果 `ignore` ，错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地
             方在于词表有空行或者词表出现了维度不一致。
@@ -134,7 +143,7 @@ class EmbedLoader(BaseLoader):
                     vocab.add_word(word)
                     if unknown is not None and unknown == word:
                         found_unknown = True
-                    if found_pad is not None and padding == word:
+                    if padding is not None and padding == word:
                         found_pad = True
                 except Exception as e:
                     if error == 'ignore':
diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py
new file mode 100644
index 00000000..11c7ab64
--- /dev/null
+++ b/fastNLP/io/file_utils.py
@@ -0,0 +1,255 @@
+
+import os
+from pathlib import Path
+from urllib.parse import urlparse
+import re
+import requests
+import tempfile
+from tqdm import tqdm
+import shutil
+import hashlib
+
+
+def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path:
+    """
+    Resolve ``url_or_filename`` against the cache: an http(s) URL is fetched through get_from_cache(); anything
+    without a scheme is treated as a path relative to the cache directory and must already exist there.
+
+    :param url_or_filename: http(s) URL, or a file/directory name relative to the cache directory.
+    :param cache_dir: cache directory; None falls back to get_defalt_path().
+    :return: path of the cached resource. Raises FileNotFoundError or ValueError as noted below.
+    """
+    dataset_cache = Path(get_defalt_path()) if cache_dir is None else cache_dir
+    parsed = urlparse(url_or_filename)
+
+    if parsed.scheme in ("http", "https"):
+        # URL, so get it from the cache (downloading if necessary)
+        return get_from_cache(url_or_filename, dataset_cache)
+    if parsed.scheme != "":
+        # Neither http(s) nor a plain path: nothing we know how to resolve.
+        raise ValueError(
+            "unable to parse {} as a URL or as a local path".format(url_or_filename)
+        )
+
+    # Plain name: return the very path whose existence is checked, i.e. the one inside the cache directory.
+    local_path = Path(os.path.join(dataset_cache, url_or_filename))
+    if not local_path.exists():
+        raise FileNotFoundError("file {} not found".format(url_or_filename))
+    return local_path
+
+def get_filepath(filepath):
+    """
+    Collapse a directory that holds exactly one entry into the full path of that entry.
+
+    :param filepath: path of a file or directory.
+    :return: the joined path of the sole entry when filepath is a directory with one entry; otherwise
+        filepath itself, unchanged.
+    """
+    if not os.path.isdir(filepath):
+        return filepath
+    entries = os.listdir(filepath)
+    # Only a single-entry directory is unwrapped; empty or multi-entry directories are returned as-is.
+    return os.path.join(filepath, entries[0]) if len(entries) == 1 else filepath
+
+def get_defalt_path():
+    """
+    Return the fastNLP cache directory taken from the FASTNLP_CACHE_DIR environment variable, which lets
+    several users share one download location.
+
+    :return: str, the value of FASTNLP_CACHE_DIR.
+    :raises RuntimeError: if the variable is unset, or if it names a path that does not exist.
+    """
+    # There is no fallback location: an unset variable is reported as an error.
+    if 'FASTNLP_CACHE_DIR' not in os.environ:
+        raise RuntimeError("There function is not available right now.")
+    fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR')
+    if not os.path.exists(fastnlp_cache_dir):
+        raise RuntimeError("Some errors happens on cache directory.")
+    return fastnlp_cache_dir
+
+def _get_base_url(name):
+    # The URL configured in FASTNLP_BASE_URL is expected to end with '/'; ``name`` is not consulted here.
+    base_url = os.environ.get('FASTNLP_BASE_URL')
+    if base_url is None:
+        raise RuntimeError("There function is not available right now.")
+    return base_url
+
+def split_filename_suffix(filepath):
+    """
+    Split the final component of filepath into (name, suffix), keeping '.tar.gz' together as one suffix.
+
+    :param filepath: path or bare file name; any directory part is discarded.
+    :return: filename, suffix. For names not ending in '.tar.gz' this is os.path.splitext() of the basename.
+    """
+    base = os.path.basename(filepath)
+    tarball_suffix = '.tar.gz'
+    return (base[:-len(tarball_suffix)], tarball_suffix) if base.endswith(tarball_suffix) else os.path.splitext(base)
+
+def get_from_cache(url: str, cache_dir: Path = None) -> Path:
+    """
+    Look for the resource named by ``url`` inside ``cache_dir``; if it is absent, download it, unpack .zip / .tar.gz
+    archives, and store the result in ``cache_dir`` under a name derived from the last component of the url.
+
+    The file name is expected to look like ``<name>-<checksum><suffix>``: everything after the last '-' is removed
+    from the cache name, and when it is an 8-character lowercase hex string it is compared with the first 8 hex
+    digits of the SHA-256 of the downloaded bytes.
+
+    :param url: http(s) address of the resource.
+    :param cache_dir: directory holding cached resources; created if missing.
+    :return: path of the cached file, or of the cache directory when it holds more than one entry
+        (see get_filepath()).
+    :raises IOError: if the HEAD request does not answer 200 or the checksum of the download does not match.
+    """
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    filename = re.sub(r".+/", "", url)
+    dir_name, suffix = split_filename_suffix(filename)
+    # rpartition never raises: without a '-' the separator is empty and the whole name is kept.
+    head, sep, tail = dir_name.rpartition('-')
+    check_sum = tail if sep and re.fullmatch(r"[0-9a-f]{8}", tail) else None
+    if sep:
+        dir_name = head
+
+    # Look for an existing cache entry with this name, regardless of its suffix.
+    match_dir_name = match_file(dir_name, cache_dir)
+    if match_dir_name:
+        dir_name = match_dir_name
+    cache_path = cache_dir / dir_name
+
+    if cache_path.exists():
+        return get_filepath(cache_path)
+
+    # make HEAD request to check ETag. TODO: use the ETag to detect that the remote resource has been updated.
+    response = requests.head(url, headers={"User-Agent": "fastNLP"})
+    if response.status_code != 200:
+        raise IOError(
+            f"HEAD request failed for url {url} with status code {response.status_code}."
+        )
+
+    # Download to temporary file, then copy to cache dir once finished.
+    # Otherwise you get corrupt cache entries if the download gets interrupted.
+    fd, temp_filename = tempfile.mkstemp()
+    os.close(fd)  # the file is reopened by name below, so the raw descriptor is not needed
+    delete_temp_dir = None
+    success = False
+    try:
+        print("%s not found in cache, downloading to %s"%(url, temp_filename))
+        # GET file object
+        req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"})
+        content_length = req.headers.get("Content-Length")
+        total = int(content_length) if content_length is not None else None
+        sha256 = hashlib.sha256()
+        with tqdm(unit="B", total=total) as progress, open(temp_filename, "wb") as temp_file:
+            for chunk in req.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    progress.update(len(chunk))
+                    temp_file.write(chunk)
+                    sha256.update(chunk)
+        # check sum: only the first 8 hex digits of the digest are encoded in the file name
+        digit = sha256.hexdigest()[:8]
+        if check_sum is not None and digit != check_sum:
+            raise IOError("File corrupted when download.")
+        print(f"Finish download from {url}.")
+
+        # Unpack archives into a scratch directory first.
+        if suffix in ('.zip', '.tar.gz'):
+            uncompress_temp_dir = tempfile.mkdtemp()
+            delete_temp_dir = uncompress_temp_dir
+            print(f"Start to uncompress file to {uncompress_temp_dir}.")
+            if suffix == '.zip':
+                unzip_file(Path(temp_filename), Path(uncompress_temp_dir))
+            else:
+                untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir))
+            filenames = os.listdir(uncompress_temp_dir)
+            # An archive that wraps everything in one top-level directory is flattened by one level.
+            if len(filenames)==1 and os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])):
+                uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0])
+            cache_path.mkdir(parents=True, exist_ok=True)
+            print("Finish un-compressing file.")
+        else:
+            uncompress_temp_dir = temp_filename
+            # Stay a Path (not str) so the cleanup code below can call .exists() / .is_file() on it.
+            cache_path = Path(str(cache_path) + suffix)
+
+        # Copy to the final location.
+        print(f"Copy file to {cache_path}.")
+        if os.path.isdir(uncompress_temp_dir):
+            for filename in os.listdir(uncompress_temp_dir):
+                shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename)
+        else:
+            shutil.copyfile(uncompress_temp_dir, cache_path)
+        success = True
+    except Exception as e:
+        print(e)
+        raise e
+    finally:
+        if not success:
+            if cache_path.exists():
+                if cache_path.is_file():
+                    os.remove(cache_path)
+                else:
+                    shutil.rmtree(cache_path)
+        if delete_temp_dir:
+            shutil.rmtree(delete_temp_dir)
+        os.remove(temp_filename)
+
+    return get_filepath(cache_path)
+
+def unzip_file(file: Path, to: Path):
+    """Extract every member of the zip archive ``file`` into the directory ``to``."""
+    from zipfile import ZipFile
+
+    archive = ZipFile(file, "r")
+    with archive:
+        archive.extractall(to)
+
+def untar_gz_file(file:Path, to:Path):
+    """Extract every member of the gzip-compressed tar archive ``file`` into the directory ``to``."""
+    import tarfile
+    with tarfile.open(name=file, mode='r:gz') as archive:
+        archive.extractall(path=to)
+
+def match_file(dir_name:str, cache_dir:str)->str:
+    """
+    Find the entry of cache_dir that is either named exactly dir_name or is dir_name followed by a '.'-suffix.
+    dir_name is matched literally (regex metacharacters in it carry no special meaning).
+
+    :param dir_name: name to look for.
+    :param cache_dir: directory whose entries are examined.
+    :return: str. Name of the single matching entry, or '' when nothing matches.
+    :raises RuntimeError: if more than one entry matches.
+    """
+    # Plain string comparison instead of a regex built from dir_name: the name is matched literally, so
+    # characters such as '.', '+' or '(' in it can neither match unrelated entries nor raise re.error.
+    prefix = dir_name + '.'
+    matched_filenames = [
+        file_name for file_name in os.listdir(cache_dir)
+        if file_name == dir_name or file_name.startswith(prefix)
+    ]
+    if len(matched_filenames) > 1:
+        raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.")
+    return matched_filenames[0] if matched_filenames else ''
+
+if __name__ == '__main__':
+    cache_dir = Path('caches')
+    cache_dir = None  # overrides the Path above, so cached_path() falls back to get_defalt_path()
+    # cache_dir itself still needs to be tested
+    base_url = 'http://0.0.0.0:8888/file/download'
+    # if True:
+    #     for filename in os.listdir(cache_dir):
+    #         if os.path.isdir(os.path.join(cache_dir, filename)):
+    #             shutil.rmtree(os.path.join(cache_dir, filename))
+    #         else:
+    #             os.remove(os.path.join(cache_dir, filename))
+    # 1. test a .txt file
+    print(cached_path(base_url + '/{}'.format('txt_test-bcb4fe65.txt'), cache_dir))
+    # 2. test a .zip file (containing a single file)
+    print(cached_path(base_url + '/{}'.format('zip_test-40966d39.zip'), cache_dir))
+    # 3. test a .zip file (containing several files)
+    print(cached_path(base_url + '/{}'.format('zip_pack_test-70c0b20d.zip'), cache_dir))
+    # 4. test a .tar.gz file
+    print(cached_path(base_url + '/{}'.format('tar_gz_test-3e2679cf.tar.gz'), cache_dir))
+    # 5. test a .tar.gz file with several members
+    print(cached_path(base_url + '/{}'.format('tar_gz_pack_test-08dfdccd.tar.gz'), cache_dir))
+
+    # 6. test a .pkl file
diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py
index 02227c0d..4846c7fa 100644
--- a/fastNLP/models/bert.py
+++ b/fastNLP/models/bert.py
@@ -30,7 +30,7 @@ class BertConfig:
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
         self.num_attention_heads = num_attention_heads
-        self.intermediate = intermediate_size
+        self.intermediate_size = intermediate_size
         self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob
         self.attention_probs_dropout_prob = attention_probs_dropout_prob
diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py
index 3a71a80a..081dd510 100644
--- a/fastNLP/models/cnn_text_classification.py
+++ b/fastNLP/models/cnn_text_classification.py
@@ -7,6 +7,7 @@ import torch.nn as nn
 
 from ..core.const import Const as C
 from ..modules import encoder
+from fastNLP import seq_len_to_mask
 
 
 class CNNText(torch.nn.Module):
@@ -21,15 +22,13 @@ class CNNText(torch.nn.Module):
     :param int num_classes: 一共有多少类
     :param int,tuple(int) out_channels: 输出channel的数量。如果为list，则需要与kernel_sizes的数量保持一致
     :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。
-    :param int padding: 对句子前后的pad的大小, 用0填充。
     :param float dropout: Dropout的大小
     """
     
     def __init__(self, init_embed,
                  num_classes,
-                 kernel_nums=(3, 4, 5),
-                 kernel_sizes=(3, 4, 5),
-                 padding=0,
+                 kernel_nums=(30, 40, 50),
+                 kernel_sizes=(1, 3, 5),
                  dropout=0.5):
         super(CNNText, self).__init__()
         
@@ -38,8 +37,7 @@ class CNNText(torch.nn.Module):
         self.conv_pool = encoder.ConvMaxpool(
             in_channels=self.embed.embedding_dim,
             out_channels=kernel_nums,
-            kernel_sizes=kernel_sizes,
-            padding=padding)
+            kernel_sizes=kernel_sizes)
         self.dropout = nn.Dropout(dropout)
         self.fc = nn.Linear(sum(kernel_nums), num_classes)
     
@@ -51,7 +49,11 @@ class CNNText(torch.nn.Module):
         :return output: dict of torch.LongTensor, [batch_size, num_classes]
         """
         x = self.embed(words)  # [N,L] -> [N,L,C]
-        x = self.conv_pool(x)  # [N,L,C] -> [N,C]
+        if seq_len is not None:
+            mask = seq_len_to_mask(seq_len)
+            x = self.conv_pool(x, mask)
+        else:
+            x = self.conv_pool(x)  # [N,L,C] -> [N,C]
         x = self.dropout(x)
         x = self.fc(x)  # [N,C] -> [N, N_class]
         return {C.OUTPUT: x}
diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py
new file mode 100644
index 00000000..fc62ea9c
--- /dev/null
+++ b/fastNLP/modules/encoder/_bert.py
@@ -0,0 +1,625 @@
+
+
+
+"""
+这个页面的代码很大程度上参考了https://github.com/huggingface/pytorch-pretrained-BERT的代码
+"""
+
+
+import torch
+from torch import nn
+
+from ... import Vocabulary
+import collections
+
+import os
+import unicodedata
+from ...io.file_utils import _get_base_url, cached_path
+from .bert import BertModel
+import numpy as np
+from itertools import chain
+
+def whitespace_tokenize(text):
+    """Split ``text`` on runs of whitespace and return the non-empty pieces."""
+    # ``str.split()`` without a separator already ignores leading/trailing
+    # whitespace and yields an empty list for blank input, so no explicit
+    # strip or emptiness check is needed.
+    pieces = text.split()
+    return pieces
+
+
+class WordpieceTokenizer(object):
+    """Splits whitespace-separated tokens into WordPiece sub-tokens using a fixed vocabulary."""
+
+    def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example (given a vocabulary that contains these three pieces):
+          input = "unaffable"
+          output = ["un", "##aff", "##able"]
+
+        Args:
+          text: A single token or whitespace separated tokens. This should have
+            already been passed through `BasicTokenizer`.
+
+        Returns:
+          A list of wordpiece tokens; over-long or unmatchable tokens become `unk_token`.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+            # Repeatedly take the longest vocabulary entry that starts at ``start``.
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+            # A single unmatched span turns the whole token into ``unk_token``.
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+def load_vocab(vocab_file):
+    """Read a one-token-per-line vocabulary file into an ordered token -> index mapping.
+
+    Each line is stripped of surrounding whitespace and mapped to its
+    zero-based line number; when a token occurs more than once, the line
+    number of its last occurrence is stored.
+    """
+    token_to_index = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as handle:
+        # Iterating the file object yields one line at a time until EOF.
+        for line_number, raw_line in enumerate(handle):
+            token_to_index[raw_line.strip()] = line_number
+    return token_to_index
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self,
+                 do_lower_case=True,
+                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+        """Constructs a BasicTokenizer.
+
+        Args:
+          do_lower_case: Whether to lower case the input (accents are stripped as well).
+          never_split: Tokens that are neither lower-cased nor split on punctuation.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split
+
+    def tokenize(self, text):
+        """Cleans ``text`` and returns its list of basic (pre-WordPiece) tokens."""
+        text = self._clean_text(text)
+        # Isolate every CJK character as its own token. This step was added on
+        # November 1st, 2018 for the multilingual and Chinese models; it is
+        # applied to the English models as well, which is harmless because they
+        # were not trained on Chinese data (their vocabulary contains Chinese
+        # characters only because English Wikipedia has some Chinese words).
+        text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case and token not in self.never_split:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents: NFD-decomposes ``text`` and drops all "Mn" (nonspacing mark) characters."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text):
+        """Splits ``text`` so that every punctuation character becomes a token of its own."""
+        if text in self.never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Drops NUL, U+FFFD and control characters and turns every whitespace character into a space."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+def _is_whitespace(char):
+    """Return True when ``char`` should be treated as whitespace."""
+    # Tab, newline and carriage return are formally control characters, but
+    # they are handled as whitespace here because they are generally
+    # regarded as such.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    # Otherwise only members of the Unicode "Zs" (space separator) category
+    # count as whitespace.
+    return unicodedata.category(char) == "Zs"
+
+
+def _is_control(char):
+    """Return True when ``char`` is a control character."""
+    # Tab, newline and carriage return are control characters by category,
+    # but they are deliberately counted as whitespace instead.
+    if char in ("\t", "\n", "\r"):
+        return False
+    # Any other character whose Unicode general category starts with "C"
+    # is reported as a control character.
+    category = unicodedata.category(char)
+    return category.startswith("C")
+
+
+def _is_punctuation(char):
+    """Return True when ``char`` is treated as a punctuation character."""
+    codepoint = ord(char)
+    # Every ASCII character that is neither a letter, a digit nor a space is
+    # treated as punctuation. Characters such as "^", "$", and "`" are not in
+    # the Unicode Punctuation class, but they are included here anyway for
+    # consistency.
+    ascii_punctuation_ranges = ((33, 47), (58, 64), (91, 96), (123, 126))
+    for low, high in ascii_punctuation_ranges:
+        if low <= codepoint <= high:
+            return True
+    # Everything else is decided by the Unicode general category ("P*").
+    category = unicodedata.category(char)
+    return category.startswith("P")
+
+
+class BertTokenizer(object):
+    """Runs end-to-end tokenization: punctuation splitting + wordpiece, plus token <-> id conversion."""
+
+    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
+                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
+        """Constructs a BertTokenizer.
+
+        Args:
+          vocab_file: Path to a one-wordpiece-per-line vocabulary file
+          do_lower_case: Whether to lower case the input
+                         Only has an effect when do_basic_tokenize=True
+          do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+          max_len: An artificial maximum length to truncate tokenized sequences to;
+                         Effective maximum length is always the minimum of this
+                         value (if specified) and the underlying BERT model's
+                         sequence length.
+          never_split: List of tokens which will never be split during tokenization.
+                         Only has an effect when do_basic_tokenize=True
+        """
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                  never_split=never_split)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+        self.max_len = max_len if max_len is not None else int(1e12)
+
+    def tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                    split_tokens.append(sub_token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab; only warns when longer than max_len."""
+        ids = []
+        for token in tokens:
+            ids.append(self.vocab[token])
+        if len(ids) > self.max_len:
+            print(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this BERT model ({} > {}). Running this"
+                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids into wordpiece tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.ids_to_tokens[i])
+        return tokens
+
+    def save_vocabulary(self, vocab_path):
+        """Write the vocabulary, one token per line in index order, and return the path of the written file."""
+        index = 0
+        # A directory receives the default file name; any other path is taken as the target file itself.
+        vocab_file = os.path.join(vocab_path, VOCAB_NAME) if os.path.isdir(vocab_path) else vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    print("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+        return vocab_file
+
+    @classmethod
+    def from_pretrained(cls, model_dir, *inputs, **kwargs):
+        """
+        Build a tokenizer from the vocabulary file (VOCAB_NAME) located inside ``model_dir``.
+        The ``max_len`` keyword argument, when given, is capped at 512.
+        """
+        pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME)
+        print("loading vocabulary file {}".format(pretrained_model_name_or_path))
+        max_len = 512
+        kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
+        # Instantiate tokenizer.
+        tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs)
+        return tokenizer
+
+VOCAB_NAME = 'vocab.txt'
+
+class _WordBertModel(nn.Module):
+    """Encodes batches of word indices with BERT and pools the word pieces back into one vector per word."""
+    def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False):
+        super().__init__()
+        self.tokenzier = BertTokenizer.from_pretrained(model_dir)
+        self.encoder = BertModel.from_pretrained(model_dir)
+        # Check that every requested layer index exists in the encoder.
+        encoder_layer_number = len(self.encoder.encoder.layer)
+        self.layers = list(map(int, layers.split(',')))
+        for layer in self.layers:
+            if layer<0:
+                assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
+            else:
+                assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
+
+        assert pool_method in ('avg', 'max', 'first', 'last')
+        self.pool_method = pool_method
+
+        self.include_cls_sep = include_cls_sep
+
+        # Pre-compute the word-piece ids of every word in vocab; [CLS] and [SEP] are appended afterwards.
+        print("Start to generating word pieces for word.")
+        word_to_wordpieces = []
+        word_pieces_lengths = []
+        for word, index in vocab:
+            if index == vocab.padding_idx:  # padding and unknown map to BERT's own special tokens
+                word = '[PAD]'
+            elif index == vocab.unknown_idx:
+                word = '[UNK]'
+            word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
+            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
+            word_to_wordpieces.append(word_pieces)
+            word_pieces_lengths.append(len(word_pieces))
+        # forward() writes these two straight into the word-piece tensor, so they must be BERT word-piece ids.
+        self._cls_index = self.tokenzier.convert_tokens_to_ids(['[CLS]'])[0]
+        self._sep_index = self.tokenzier.convert_tokens_to_ids(['[SEP]'])[0]
+        self._pad_index = vocab.padding_idx
+        self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0]  # pad value of the word-piece tensor
+        word_to_wordpieces.append([self._cls_index])
+        word_to_wordpieces.append([self._sep_index])
+        self.word_to_wordpieces = np.array(word_to_wordpieces, dtype=object)  # one (possibly ragged) id list per word
+        self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False)
+        print("Successfully generate word pieces.")
+
+    def forward(self, words):
+        """
+        Run BERT over the word pieces of ``words`` and pool them back to one vector per word.
+
+        :param words: torch.LongTensor, batch_size x max_len
+        :return: num_layers x batch_size x max_len x hidden_size, or num_layers x batch_size x (max_len+2) x hidden_size
+        """
+        batch_size, max_word_len = words.size()
+        batch_indexes = torch.arange(batch_size, device=words.device)
+        seq_len = words.ne(self._pad_index).sum(dim=-1)
+        batch_word_pieces_length = self.word_pieces_lengths[words]  # batch_size x max_len
+        word_pieces_lengths = batch_word_pieces_length.sum(dim=-1)
+        max_word_piece_length = word_pieces_lengths.max().item()
+        # +2 leaves room for [CLS] and [SEP]
+        word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index)
+        word_pieces[:, 0].fill_(self._cls_index)
+        sep_positions = word_pieces_lengths + 1  # column of [SEP] in each row
+        word_pieces[batch_indexes, sep_positions] = self._sep_index  # pair rows with columns: one [SEP] per row
+        attn_masks = torch.zeros_like(word_pieces)
+        # 1. Fill in the word-piece ids of every sentence and mark the attended positions.
+        word_indexes = words.tolist()
+        for i in range(batch_size):
+            word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]]))
+            word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i)
+            attn_masks[i, :len(word_pieces_i)+2].fill_(1)
+        # 2. Run the encoder, then pool the word-piece states that belong to each word.
+        bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks,
+                                           output_all_encoded_layers=True)
+        if self.include_cls_sep:
+            outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2,
+                                                 bert_outputs[-1].size(-1))
+            s_shift = 1
+        else:
+            outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len,
+                                                 bert_outputs[-1].size(-1))
+            s_shift = 0
+        batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1)
+        batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1)  # batch_size x max_len
+        for l_index, l in enumerate(self.layers):
+            output_layer = bert_outputs[l]
+            truncate_output_layer = output_layer[:, 1:-1]  # drop the first and last position; batch_size x len x hidden_size
+            outputs_seq_len = seq_len + s_shift
+            if self.pool_method == 'first':
+                for i in range(batch_size):
+                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]]  # start of each word
+                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
+            elif self.pool_method == 'last':
+                for i in range(batch_size):
+                    i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1  # end of each word
+                    outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length]
+            elif self.pool_method == 'max':
+                for i in range(batch_size):
+                    for j in range(seq_len[i]):
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
+                        outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2)
+            else:
+                for i in range(batch_size):
+                    for j in range(seq_len[i]):
+                        start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1]
+                        outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2)
+            if self.include_cls_sep:
+                outputs[l_index, :, 0] = output_layer[:, 0]  # only this layer's slice, not every layer
+                outputs[l_index, batch_indexes, outputs_seq_len] = output_layer[batch_indexes, sep_positions]
+        return outputs
+
+
+class _WordPieceBertModel(nn.Module):
+    """
+    Runs BERT directly on word-piece ids and returns the hidden states of the selected layers.
+    Use index_dataset to add the word-piece field to a DataSet first.
+    """
+    def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'):
+        super().__init__()
+
+        self.tokenzier = BertTokenizer.from_pretrained(model_dir)
+        self.encoder = BertModel.from_pretrained(model_dir)
+        # Check that every requested layer index exists in the encoder.
+        encoder_layer_number = len(self.encoder.encoder.layer)
+        self.layers = list(map(int, layers.split(',')))
+        for layer in self.layers:
+            if layer<0:
+                assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
+            else:
+                assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \
+                    f"a bert model with {encoder_layer_number} layers."
+
+        # Pre-compute the word-piece ids of every word in vocab; [CLS] and [SEP] are appended afterwards.
+        print("Start to generating word pieces for word.")
+        self.word_to_wordpieces = []
+        self.word_pieces_length = []
+        for word, index in vocab:
+            if index == vocab.padding_idx:  # padding and unknown map to BERT's own special tokens
+                word = '[PAD]'
+            elif index == vocab.unknown_idx:
+                word = '[UNK]'
+            word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word)
+            word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces)
+            self.word_to_wordpieces.append(word_pieces)
+            self.word_pieces_length.append(len(word_pieces))
+        self._cls_index = self.tokenzier.convert_tokens_to_ids(['[CLS]'])[0]  # BERT id: index_dataset emits it verbatim
+        self._sep_index = self.tokenzier.convert_tokens_to_ids(['[SEP]'])[0]  # BERT id: index_dataset emits it verbatim
+        self._pad_index = vocab.padding_idx
+        self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0]  # pad value of the word-piece field
+        self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]']))
+        self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]']))
+        self.word_to_wordpieces = np.array(self.word_to_wordpieces, dtype=object)  # id lists may differ in length
+        print("Successfully generate word pieces.")
+
+    def index_dataset(self, *datasets):
+        """
+        Add a ``word_pieces`` input field to every dataset by mapping its ``words`` field to BERT word-piece ids.
+            Each sequence is wrapped in [CLS] ... [SEP] and the field's pad value is set to BERT's pad id.
+
+        :param datasets: DataSet objects
+        :return:
+        """
+        def convert_words_to_word_pieces(words):
+            word_pieces = list(chain(*self.word_to_wordpieces[words].tolist()))
+            word_pieces = [self._cls_index] + word_pieces + [self._sep_index]
+            return word_pieces
+
+        for index, dataset in enumerate(datasets):
+            try:
+                dataset.apply_field(convert_words_to_word_pieces, field_name='words', new_field_name='word_pieces',
+                                    is_input=True)
+                dataset.set_pad_val('word_pieces', self._wordpiece_pad_index)
+            except Exception as e:
+                print(f"Exception happens when processing the {index} dataset.")
+                raise e
+
+    def forward(self, word_pieces, token_type_ids=None):
+        """
+        Encode word-piece ids with BERT and stack the hidden states of the selected layers.
+        :param word_pieces: torch.LongTensor, batch_size x max_len
+        :param token_type_ids: torch.LongTensor, batch_size x max_len
+        :return: num_layers x batch_size x max_len x hidden_size
+        """
+        batch_size, max_len = word_pieces.size()
+        # word_pieces holds BERT ids, so padding is recognised by BERT's pad id, not the word vocabulary's.
+        attn_masks = word_pieces.ne(self._wordpiece_pad_index)
+        bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks,
+                                           output_all_encoded_layers=True)
+        # Copy each requested layer into outputs[l_index].
+        outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1)))
+        for l_index, l in enumerate(self.layers):
+            outputs[l_index] = bert_outputs[l]
+        return outputs
+
+class BertWordPieceEncoder(nn.Module):
+    """
+    BERT encoder built from a Vocabulary. Pass in vocab, then call index_datasets to add the word-piece field.
+
+    :param vocab: Vocabulary.
+    :param model_dir_or_name: a key of PRETRAINED_BERT_MODEL_DIR ('en-base', 'cn') or an existing model directory
+    :param layers: comma-separated layer indices whose outputs are returned
+    :param requires_grad: value assigned to ``requires_grad`` of every parameter
+    """
+    def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1',
+                 requires_grad:bool=False):
+        super().__init__()
+        PRETRAIN_URL = _get_base_url('bert')
+        # TODO: revise. NOTE(review): 'elmo_cn.zip' looks like an ELMo archive rather than a BERT one - confirm.
+        PRETRAINED_BERT_MODEL_DIR = {'en-base': 'bert_en-80f95ea7.tar.gz',
+                                     'cn': 'elmo_cn.zip'}
+
+        if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR:
+            model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
+            model_url = PRETRAIN_URL + model_name
+            model_dir = cached_path(model_url)
+            # TODO(review): check that the resolved model directory exists
+        elif os.path.isdir(model_dir_or_name):
+            model_dir = model_dir_or_name
+        else:
+            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
+
+        self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers)
+        self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size
+        self.requires_grad = requires_grad
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the parameters may be optimized. True: all of them; False: none of them; None: only some of them.
+        :return:
+        """
+        requires_grads = set([param.requires_grad for name, param in self.named_parameters()])
+        if len(requires_grads)==1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            param.requires_grad = value
+
+    @property
+    def embed_size(self):
+        return self._embed_size
+
+    def index_datasets(self, *datasets):
+        """
+        Index the datasets into word pieces by delegating to the wrapped model's index_dataset.
+
+        Example::
+
+        :param datasets:
+        :return:
+        """
+        self.model.index_dataset(*datasets)
+
+    def forward(self, words, token_type_ids=None):
+        """
+        Compute the BERT representation and concatenate the selected layers along the last dimension.
+            NOTE(review): ``words`` is forwarded unchanged as the wrapped model's ``word_pieces`` argument.
+
+        :param words: batch_size x max_len
+        :param token_type_ids: batch_size x max_len, used to distinguish the first sentence from the second
+        :return: torch.FloatTensor. batch_size x max_len x (hidden_size*len(self.layers))
+        """
+        outputs = self.model(words, token_type_ids)
+        outputs = torch.cat([*outputs], dim=-1)
+
+        return outputs
\ No newline at end of file
diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py
new file mode 100644
index 00000000..1f400f1d
--- /dev/null
+++ b/fastNLP/modules/encoder/_elmo.py
@@ -0,0 +1,774 @@
+
+"""
+The code in this module borrows heavily from https://github.com/HIT-SCIR/ELMoForManyLangs/tree/master/elmoformanylangs
+"""
+
+
+from typing import Optional, Tuple, List, Callable
+
+import os
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence
+from ...core.vocabulary import Vocabulary
+import json
+
+from ..utils import get_dropout_mask
+import codecs
+
+class LstmCellWithProjection(torch.nn.Module):
+    """
+    An LSTM with Recurrent Dropout and a projected and clipped hidden state and
+    memory. Note: this implementation is slower than the native Pytorch LSTM because
+    it cannot make use of CUDNN optimizations for stacked RNNs due to
+    variational dropout and the custom nature of the cell state.
+    Parameters
+    ----------
+    input_size : ``int``, required.
+        The dimension of the inputs to the LSTM.
+    hidden_size : ``int``, required.
+        The dimension of the outputs of the LSTM.
+    cell_size : ``int``, required.
+        The dimension of the memory cell used for the LSTM.
+    go_forward: ``bool``, optional (default = True)
+        The direction in which the LSTM is applied to the sequence.
+        Forwards by default, or backwards if False.
+    recurrent_dropout_probability: ``float``, optional (default = 0.0)
+        The dropout probability to be used in a dropout scheme as stated in
+        `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks
+        <https://arxiv.org/abs/1512.05287>`_ . Implementation wise, this simply
+        applies a fixed dropout mask per sequence to the recurrent connection of the
+        LSTM.
+    state_projection_clip_value: ``float``, optional, (default = None)
+        The magnitude with which to clip the hidden_state after projecting it.
+    memory_cell_clip_value: ``float``, optional, (default = None)
+        The magnitude with which to clip the memory cell.
+    Returns
+    -------
+    output_accumulator : ``torch.FloatTensor``
+        The outputs of the LSTM for each timestep. A tensor of shape
+        (batch_size, max_timesteps, hidden_size) where for a given batch
+        element, all outputs past the sequence length for that batch are
+        zero tensors.
+    final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
+        The final (state, memory) states of the LSTM, with shape
+        (1, batch_size, hidden_size) and  (1, batch_size, cell_size)
+        respectively. The first dimension is 1 in order to match the Pytorch
+        API for returning stacked LSTM states.
+    """
+    def __init__(self,
+                 input_size: int,
+                 hidden_size: int,
+                 cell_size: int,
+                 go_forward: bool = True,
+                 recurrent_dropout_probability: float = 0.0,
+                 memory_cell_clip_value: Optional[float] = None,
+                 state_projection_clip_value: Optional[float] = None) -> None:
+        super(LstmCellWithProjection, self).__init__()
+        # Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`.
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.cell_size = cell_size
+
+        self.go_forward = go_forward
+        self.state_projection_clip_value = state_projection_clip_value
+        self.memory_cell_clip_value = memory_cell_clip_value
+        self.recurrent_dropout_probability = recurrent_dropout_probability
+
+        # We do the projections for all the gates all at once.
+        self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False)
+        self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True)
+
+        # Additional projection matrix for making the hidden state smaller.
+        self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        # Use sensible default initializations for parameters.
+        nn.init.orthogonal_(self.input_linearity.weight.data)
+        nn.init.orthogonal_(self.state_linearity.weight.data)
+
+        self.state_linearity.bias.data.fill_(0.0)
+        # Initialize forget gate biases to 1.0 as per An Empirical
+        # Exploration of Recurrent Network Architectures, (Jozefowicz, 2015).
+        self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0)
+
+    def forward(self,  # pylint: disable=arguments-differ
+                inputs: torch.FloatTensor,
+                batch_lengths: List[int],
+                initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None):
+        """
+        Parameters
+        ----------
+        inputs : ``torch.FloatTensor``, required.
+            A tensor of shape (batch_size, num_timesteps, input_size)
+            to apply the LSTM over.
+        batch_lengths : ``List[int]``, required.
+            A list of length batch_size containing the lengths of the sequences in batch.
+        initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
+            A tuple (state, memory) representing the initial hidden state and memory
+            of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
+            ``memory`` has shape (1, batch_size, cell_size).
+        Returns
+        -------
+        output_accumulator : ``torch.FloatTensor``
+            The outputs of the LSTM for each timestep. A tensor of shape
+            (batch_size, max_timesteps, hidden_size) where for a given batch
+            element, all outputs past the sequence length for that batch are
+            zero tensors.
+        final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]``
+            A tuple (state, memory) representing the final hidden state and memory
+            of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the
+            ``memory`` has shape (1, batch_size, cell_size).
+        """
+        batch_size = inputs.size()[0]
+        total_timesteps = inputs.size()[1]
+
+        # We have to use this '.data.new().fill_' pattern to create tensors with the correct
+        # type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors.
+        output_accumulator = inputs.data.new(batch_size,
+                                                      total_timesteps,
+                                                      self.hidden_size).fill_(0)
+        if initial_state is None:
+            full_batch_previous_memory = inputs.data.new(batch_size,
+                                                                  self.cell_size).fill_(0)
+            full_batch_previous_state = inputs.data.new(batch_size,
+                                                                 self.hidden_size).fill_(0)
+        else:
+            full_batch_previous_state = initial_state[0].squeeze(0)
+            full_batch_previous_memory = initial_state[1].squeeze(0)
+
+        current_length_index = batch_size - 1 if self.go_forward else 0
+        if self.recurrent_dropout_probability > 0.0 and self.training:
+            dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
+                                            full_batch_previous_state)
+        else:
+            dropout_mask = None
+
+        for timestep in range(total_timesteps):
+            # The index depends on which end we start.
+            index = timestep if self.go_forward else total_timesteps - timestep - 1
+
+            # What we are doing here is finding the index into the batch dimension
+            # which we need to use for this timestep, because the sequences have
+            # variable length, so once the index is greater than the length of this
+            # particular batch sequence, we no longer need to do the computation for
+            # this sequence. The key thing to recognise here is that the batch inputs
+            # must be _ordered_ by length from longest (first in batch) to shortest
+            # (last) so initially, we are going forwards with every sequence and as we
+            # pass the index at which the shortest elements of the batch finish,
+            # we stop picking them up for the computation.
+            if self.go_forward:
+                while batch_lengths[current_length_index] <= index:
+                    current_length_index -= 1
+            # If we're going backwards, we are _picking up_ more indices.
+            else:
+                # First conditional: Are we already at the maximum number of elements in the batch?
+                # Second conditional: Does the next shortest sequence beyond the current batch
+                # index require computation use this timestep?
+                while current_length_index < (len(batch_lengths) - 1) and \
+                                batch_lengths[current_length_index + 1] > index:
+                    current_length_index += 1
+
+            # Actually get the slices of the batch which we
+            # need for the computation at this timestep.
+            # Shape (current_length_index + 1, cell_size)
+            previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone()
+            # Shape (current_length_index + 1, hidden_size)
+            previous_state = full_batch_previous_state[0: current_length_index + 1].clone()
+            # Shape (current_length_index + 1, input_size)
+            timestep_input = inputs[0: current_length_index + 1, index]
+
+            # Do the projections for all the gates all at once.
+            # Both have shape (current_length_index + 1, 4 * cell_size)
+            projected_input = self.input_linearity(timestep_input)
+            projected_state = self.state_linearity(previous_state)
+
+            # Main LSTM equations using relevant chunks of the big linear
+            # projections of the hidden state and inputs.
+            input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] +
+                                       projected_state[:, (0 * self.cell_size):(1 * self.cell_size)])
+            forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] +
+                                        projected_state[:, (1 * self.cell_size):(2 * self.cell_size)])
+            memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] +
+                                     projected_state[:, (2 * self.cell_size):(3 * self.cell_size)])
+            output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] +
+                                        projected_state[:, (3 * self.cell_size):(4 * self.cell_size)])
+            memory = input_gate * memory_init + forget_gate * previous_memory
+
+            # Here is the non-standard part of this LSTM cell; first, we clip the
+            # memory cell, then we project the output of the timestep to a smaller size
+            # and again clip it.
+
+            if self.memory_cell_clip_value:
+                # pylint: disable=invalid-unary-operand-type
+                memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value)
+
+            # shape (current_length_index + 1, cell_size)
+            pre_projection_timestep_output = output_gate * torch.tanh(memory)
+
+            # shape (current_length_index + 1, hidden_size)
+            timestep_output = self.state_projection(pre_projection_timestep_output)
+            if self.state_projection_clip_value:
+                # pylint: disable=invalid-unary-operand-type
+                timestep_output = torch.clamp(timestep_output,
+                                              -self.state_projection_clip_value,
+                                              self.state_projection_clip_value)
+
+            # Only do dropout if the dropout prob is > 0.0 and we are in training mode.
+            if dropout_mask is not None:
+                timestep_output = timestep_output * dropout_mask[0: current_length_index + 1]
+
+            # We've been doing computation with less than the full batch, so here we create a new
+            # variable for the whole batch at this timestep and insert the result for the
+            # relevant elements of the batch into it.
+            full_batch_previous_memory = full_batch_previous_memory.data.clone()
+            full_batch_previous_state = full_batch_previous_state.data.clone()
+            full_batch_previous_memory[0:current_length_index + 1] = memory
+            full_batch_previous_state[0:current_length_index + 1] = timestep_output
+            output_accumulator[0:current_length_index + 1, index] = timestep_output
+
+        # Mimic the pytorch API by returning state in the following shape:
+        # (num_layers * num_directions, batch_size, ...). As this
+        # LSTM cell cannot be stacked, the first dimension here is just 1.
+        final_state = (full_batch_previous_state.unsqueeze(0),
+                       full_batch_previous_memory.unsqueeze(0))
+
+        return output_accumulator, final_state
+
+
+class LstmbiLm(nn.Module):
+    """Bidirectional nn.LSTM language-model encoder whose two directions share one output projection."""
+    def __init__(self, config):
+        super(LstmbiLm, self).__init__()
+        self.config = config
+        enc = config['encoder']
+        self.encoder = nn.LSTM(enc['projection_dim'], enc['dim'], num_layers=enc['n_layers'],
+                               bidirectional=True, batch_first=True, dropout=config['dropout'])
+        self.projection = nn.Linear(enc['dim'], enc['projection_dim'], bias=True)
+
+    def forward(self, inputs, seq_len):
+        """Encode ``inputs`` (batch x max_len x projection_dim) given the lengths in ``seq_len`` (batch)."""
+        # pack_padded_sequence requires the batch to be ordered from longest to shortest.
+        sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
+        packed = nn.utils.rnn.pack_padded_sequence(inputs[sort_idx], sort_lens, batch_first=True)
+        output, _ = self.encoder(packed, None)
+        # The encoder is built with batch_first=True, so unpack with the same layout -> [N, L, 2 * dim].
+        output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True)
+        # Undo the length sort so rows line up with the caller's batch order again.
+        output = output[torch.sort(sort_idx, dim=0, descending=False)[1]]
+        forward, backward = output.split(self.config['encoder']['dim'], 2)
+        return torch.cat([self.projection(forward), self.projection(backward)], dim=2)
+
+
+class ElmobiLm(torch.nn.Module):  # stacked forward/backward LstmCellWithProjection layers with skip connections
+    def __init__(self, config):
+        super(ElmobiLm, self).__init__()
+        self.config = config
+        input_size = config['encoder']['projection_dim']
+        hidden_size = config['encoder']['projection_dim']
+        cell_size = config['encoder']['dim']
+        num_layers = config['encoder']['n_layers']
+        memory_cell_clip_value = config['encoder']['cell_clip']
+        state_projection_clip_value = config['encoder']['proj_clip']
+        recurrent_dropout_probability = config['dropout']
+
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.cell_size = cell_size
+
+        forward_layers = []
+        backward_layers = []
+
+        lstm_input_size = input_size
+        go_forward = True
+        for layer_index in range(num_layers):
+            forward_layer = LstmCellWithProjection(lstm_input_size,
+                                                   hidden_size,
+                                                   cell_size,
+                                                   go_forward,
+                                                   recurrent_dropout_probability,
+                                                   memory_cell_clip_value,
+                                                   state_projection_clip_value)
+            backward_layer = LstmCellWithProjection(lstm_input_size,
+                                                    hidden_size,
+                                                    cell_size,
+                                                    not go_forward,
+                                                    recurrent_dropout_probability,
+                                                    memory_cell_clip_value,
+                                                    state_projection_clip_value)
+            lstm_input_size = hidden_size
+
+            self.add_module('forward_layer_{}'.format(layer_index), forward_layer)
+            self.add_module('backward_layer_{}'.format(layer_index), backward_layer)
+            forward_layers.append(forward_layer)
+            backward_layers.append(backward_layer)
+        self.forward_layers = forward_layers
+        self.backward_layers = backward_layers
+
+    def forward(self, inputs, seq_len):
+        """
+        Sort the batch by length, run the stacked bidirectional layers, and restore the original batch order.
+        :param inputs: batch_size x max_len x embed_size
+        :param seq_len: batch_size
+        :return: torch.FloatTensor. num_layers x batch_size x max_len x (2 * hidden_size)
+        """
+        sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
+        inputs = inputs[sort_idx]
+        inputs = nn.utils.rnn.pack_padded_sequence(inputs, sort_lens, batch_first=True)
+        output, _ = self._lstm_forward(inputs, None)
+        _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
+        output = output[:, unsort_idx]
+
+        return output
+
+    def _lstm_forward(self,
+                      inputs: PackedSequence,
+                      initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \
+            Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """
+        Parameters
+        ----------
+        inputs : ``PackedSequence``, required.
+          A batch first ``PackedSequence`` to run the stacked LSTM over.
+        initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None)
+          A tuple (state, memory) representing the initial hidden state and memory
+          of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and
+          (num_layers, batch_size, 2 * cell_size) respectively.
+        Returns
+        -------
+        output_sequence : ``torch.FloatTensor``
+          The encoded sequence of shape (num_layers, batch_size, sequence_length, 2 * hidden_size)
+        final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]``
+          The per-layer final (state, memory) states of the LSTM, with shape
+          (num_layers, batch_size, 2 * hidden_size) and  (num_layers, batch_size, 2 * cell_size)
+          respectively. The last dimension is duplicated because it contains the state/memory
+          for both the forward and backward layers.
+        """
+
+        if initial_state is None:
+            hidden_states: List[Optional[Tuple[torch.Tensor,
+                                               torch.Tensor]]] = [None] * len(self.forward_layers)
+        elif initial_state[0].size()[0] != len(self.forward_layers):
+            raise Exception("Initial states were passed to forward() but the number of "
+                            "initial states does not match the number of layers.")
+        else:
+            hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0)))
+
+        inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True)
+        forward_output_sequence = inputs
+        backward_output_sequence = inputs
+
+        final_states = []
+        sequence_outputs = []
+        for layer_index, state in enumerate(hidden_states):
+            forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index))
+            backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index))
+
+            forward_cache = forward_output_sequence
+            backward_cache = backward_output_sequence
+
+            if state is not None:
+                forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2)
+                forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2)
+                forward_state = (forward_hidden_state, forward_memory_state)
+                backward_state = (backward_hidden_state, backward_memory_state)
+            else:
+                forward_state = None
+                backward_state = None
+
+            forward_output_sequence, forward_state = forward_layer(forward_output_sequence,
+                                                                   batch_lengths,
+                                                                   forward_state)
+            backward_output_sequence, backward_state = backward_layer(backward_output_sequence,
+                                                                      batch_lengths,
+                                                                      backward_state)
+            # Skip connections, just adding the input to the output.
+            if layer_index != 0:
+                forward_output_sequence += forward_cache
+                backward_output_sequence += backward_cache
+
+            sequence_outputs.append(torch.cat([forward_output_sequence,
+                                               backward_output_sequence], -1))
+            # Append the state tuples in a list, so that we can return
+            # the final states for all the layers.
+            final_states.append((torch.cat([forward_state[0], backward_state[0]], -1),
+                                 torch.cat([forward_state[1], backward_state[1]], -1)))
+
+        stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs)
+        # Stack the hidden state and memory for each layer into 2 tensors of shape
+        # (num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size)
+        # respectively.
+        final_hidden_states, final_memory_states = zip(*final_states)
+        final_state_tuple: Tuple[torch.FloatTensor,
+                                 torch.FloatTensor] = (torch.cat(final_hidden_states, 0),
+                                                       torch.cat(final_memory_states, 0))
+        return stacked_sequence_outputs, final_state_tuple
+
+
+class LstmTokenEmbedder(nn.Module):
+    """Token embedder that concatenates word embeddings with a char-level BiLSTM encoding, then projects."""
+    def __init__(self, config, word_emb_layer, char_emb_layer):
+        super(LstmTokenEmbedder, self).__init__()
+        self.config = config
+        self.word_emb_layer = word_emb_layer
+        self.char_emb_layer = char_emb_layer
+        self.output_dim = config['encoder']['projection_dim']
+        emb_dim = 0
+        if word_emb_layer is not None:
+            emb_dim += word_emb_layer.n_d
+
+        if char_emb_layer is not None:
+            emb_dim += char_emb_layer.n_d * 2
+            self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True,
+                                     batch_first=True, dropout=config['dropout'])
+
+        self.projection = nn.Linear(emb_dim, self.output_dim, bias=True)
+
+    def forward(self, words, chars):
+        """Return projected token embeddings (last dim ``output_dim``); ``chars`` is batch x seq_len x max_chars."""
+        embs = []
+        if self.word_emb_layer is not None:
+            embs.append(self.word_emb_layer(words))
+
+        if self.char_emb_layer is not None:
+            batch_size, seq_len, _ = chars.shape
+            chars_emb = self.char_emb_layer(chars.view(batch_size * seq_len, -1))
+            # TODO: the per-word character lengths (padding) should be taken into account here.
+            _, (chars_outputs, __) = self.char_lstm(chars_emb)
+            # h_n is (2, batch_size * seq_len, n_d): move the direction axis next to the features before
+            # flattening, and restore the (batch, seq_len) axes so it can be concatenated along dim=2.
+            chars_outputs = chars_outputs.transpose(0, 1).contiguous().view(batch_size, seq_len, -1)
+            embs.append(chars_outputs)
+
+        return self.projection(torch.cat(embs, dim=2))
+
+
+class ConvTokenEmbedder(nn.Module):
+    """
+    Token embedder combining optional word embeddings with a character CNN
+    (convolutions of several widths, max-pooled over characters, followed by
+    highway layers), projected to ``config['encoder']['projection_dim']``.
+    """
+    def __init__(self, config, word_emb_layer, char_emb_layer):
+        super(ConvTokenEmbedder, self).__init__()
+        self.config = config
+        self.word_emb_layer = word_emb_layer
+        self.char_emb_layer = char_emb_layer
+
+        self.output_dim = config['encoder']['projection_dim']
+        self.emb_dim = 0
+        if word_emb_layer is not None:
+            self.emb_dim += word_emb_layer.weight.size(1)
+
+        if char_emb_layer is not None:
+            cnn_config = config['token_embedder']
+            filters = cnn_config['filters']
+            char_embed_dim = cnn_config['char_dim']
+
+            # One Conv1d per (kernel width, number of filters) pair.
+            self.convolutions = nn.ModuleList([
+                torch.nn.Conv1d(in_channels=char_embed_dim, out_channels=num, kernel_size=width, bias=True)
+                for width, num in filters
+            ])
+
+            self.n_filters = sum(f[1] for f in filters)
+            self.n_highway = cnn_config['n_highway']
+
+            self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu)
+            self.emb_dim += self.n_filters
+
+        self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True)
+
+    def forward(self, words, chars):
+        """
+        Embed a batch of tokens.
+
+        :param words: word indices fed to ``word_emb_layer``; ignored when that layer is None.
+        :param chars: batch_size x seq_len x max_chars character indices; ignored when ``char_emb_layer`` is None.
+        :return: projected token embeddings whose last dimension is ``output_dim``.
+        :raises Exception: if ``config['token_embedder']['activation']`` is neither 'tanh' nor 'relu'.
+        """
+        embs = []
+        if self.word_emb_layer is not None:
+            embs.append(self.word_emb_layer(words))
+
+        if self.char_emb_layer is not None:
+            batch_size, seq_len, _ = chars.size()
+            chars = chars.view(batch_size * seq_len, -1)
+            # Conv1d expects channels first: (batch_size * seq_len, char_dim, max_chars).
+            character_embedding = torch.transpose(self.char_emb_layer(chars), 1, 2)
+
+            activation_name = self.config['token_embedder']['activation']
+            if activation_name == 'tanh':
+                # torch.nn.functional.tanh is deprecated in favour of torch.tanh.
+                activation = torch.tanh
+            elif activation_name == 'relu':
+                activation = torch.nn.functional.relu
+            else:
+                raise Exception("Unknown activation")
+
+            # Max-pool each convolution over the character axis, then apply the activation.
+            convs = [activation(torch.max(conv(character_embedding), dim=-1)[0]) for conv in self.convolutions]
+            char_emb = self.highways(torch.cat(convs, dim=-1))
+            embs.append(char_emb.view(batch_size, -1, self.n_filters))
+
+        token_embedding = torch.cat(embs, dim=2)
+
+        return self.projection(token_embedding)
+
+
+class Highway(torch.nn.Module):
+    """
+    Stack of `highway layers <https://arxiv.org/abs/1505.00387>`_. Each layer mixes its input
+    with a non-linear transformation of it through a learned gate: :math:`y = g * x + (1 - g) *
+    f(A(x))`, where :math:`A` is linear, :math:`f` is an element-wise non-linearity and the
+    element-wise gate is :math:`g = sigmoid(B(x))`. A single ``Linear`` of width ``2 * input_dim``
+    produces both :math:`A(x)` (first half) and :math:`B(x)` (second half). The layers are
+    applied one after another and the final result is returned.
+    Parameters
+    ----------
+    input_dim : ``int``
+        The dimensionality of :math:`x`.  The input is expected to have shape ``(batch_size,
+        input_dim)``.
+    num_layers : ``int``, optional (default=``1``)
+        How many highway layers are stacked.
+    activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``)
+        The non-linearity :math:`f`.
+    """
+    def __init__(self,
+                 input_dim: int,
+                 num_layers: int = 1,
+                 activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None:
+        super(Highway, self).__init__()
+        self._input_dim = input_dim
+        self._activation = activation
+        # Each Linear yields A(x) and B(x) side by side, hence the doubled width.
+        gated_projections = [torch.nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
+        self._layers = torch.nn.ModuleList(gated_projections)
+        # Start out close to the identity: a positive bias on `B(x)` (the second half of each
+        # Linear's bias vector) pushes the gate `g` towards 1, so the input is mostly carried
+        # forward unchanged.
+        for gated_projection in self._layers:
+            gated_projection.bias.data[input_dim:].fill_(1)
+
+    def forward(self, inputs: torch.Tensor) -> torch.Tensor:  # pylint: disable=arguments-differ
+        """Apply every highway layer in turn to ``inputs`` and return the result."""
+        dim = self._input_dim
+        carried = inputs
+        for gated_projection in self._layers:
+            projected = gated_projection(carried)
+            # NOTE: the gate lives in the second half of the projection; keep this in sync with
+            # the bias initialization in __init__.
+            transformed = self._activation(projected[:, :dim])
+            gate = torch.sigmoid(projected[:, dim:2 * dim])
+            # Gated mix of the untouched input and its transformation.
+            carried = gate * carried + (1 - gate) * transformed
+        return carried
+
+class _ElmoModel(nn.Module):
+    """
+    该Module是ElmoEmbedding中进行所有的heavy lifting的地方。做的工作，包括
+        (1) 根据配置，加载模型;
+        (2) 根据vocab，对模型中的embedding进行调整. 并将其正确初始化
+        (3) 保存一个words与chars的对应转换，获取时自动进行相应的转换
+        (4) 设计一个保存token的embedding，允许缓存word的表示。
+
+    """
+    def __init__(self, model_dir:str, vocab:Vocabulary=None, cache_word_reprs:bool=False):
+        super(_ElmoModel, self).__init__()
+        config = json.load(open(os.path.join(model_dir, 'structure_config.json'), 'r'))
+
+        self.config = config
+
+        OOV_TAG = '<oov>'
+        PAD_TAG = '<pad>'
+        BOS_TAG = '<bos>'
+        EOS_TAG = '<eos>'
+        BOW_TAG = '<bow>'
+        EOW_TAG = '<eow>'
+
+        # 将加载embedding放到这里
+        token_embedder_states = torch.load(os.path.join(model_dir, 'token_embedder.pkl'), map_location='cpu')
+
+        # For the model trained with word form word encoder.
+        if config['token_embedder']['word_dim'] > 0:
+            word_lexicon = {}
+            with codecs.open(os.path.join(model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi:
+                for line in fpi:
+                    tokens = line.strip().split('\t')
+                    if len(tokens) == 1:
+                        tokens.insert(0, '\u3000')
+                    token, i = tokens
+                    word_lexicon[token] = int(i)
+            # 做一些sanity check
+            for special_word in [PAD_TAG, OOV_TAG, BOS_TAG, EOS_TAG]:
+                assert special_word in word_lexicon, f"{special_word} not found in word.dic."
+            # 根据vocab调整word_embedding
+            pre_word_embedding = token_embedder_states.pop('word_emb_layer.embedding.weight')
+            word_emb_layer = nn.Embedding(len(vocab)+2, config['token_embedder']['word_dim'])  #多增加两个是为了<bos>与<eos>
+            found_word_count = 0
+            for word, index in vocab:
+                if index == vocab.unknown_idx:  # 因为fastNLP的unknow是<unk> 而在这里是<oov>所以ugly强制适配一下
+                    index_in_pre = word_lexicon[OOV_TAG]
+                    found_word_count += 1
+                elif index == vocab.padding_idx:  # 需要pad对齐
+                    index_in_pre = word_lexicon[PAD_TAG]
+                    found_word_count += 1
+                elif word in word_lexicon:
+                    index_in_pre = word_lexicon[word]
+                    found_word_count += 1
+                else:
+                    index_in_pre = word_lexicon[OOV_TAG]
+                word_emb_layer.weight.data[index] = pre_word_embedding[index_in_pre]
+            print(f"{found_word_count} out of {len(vocab)} words were found in pretrained elmo embedding.")
+            word_emb_layer.weight.data[-1] = pre_word_embedding[word_lexicon[EOS_TAG]]
+            word_emb_layer.weight.data[-2] = pre_word_embedding[word_lexicon[BOS_TAG]]
+            self.word_vocab = vocab
+        else:
+            word_emb_layer = None
+
+        # For the model trained with character-based word encoder.
+        if config['token_embedder']['char_dim'] > 0:
+            char_lexicon = {}
+            with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi:
+                for line in fpi:
+                    tokens = line.strip().split('\t')
+                    if len(tokens) == 1:
+                        tokens.insert(0, '\u3000')
+                    token, i = tokens
+                    char_lexicon[token] = int(i)
+            # 做一些sanity check
+            for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]:
+                assert special_word in char_lexicon, f"{special_word} not found in char.dic."
+            # 从vocab中构建char_vocab
+            char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG)
+            # 需要保证<bow>与<eow>在里面
+            char_vocab.add_word(BOW_TAG)
+            char_vocab.add_word(EOW_TAG)
+            for word, index in vocab:
+                char_vocab.add_word_lst(list(word))
+            # 保证<eos>, <bos>也在
+            char_vocab.add_word_lst(list(BOS_TAG))
+            char_vocab.add_word_lst(list(EOS_TAG))
+            # 根据char_lexicon调整
+            char_emb_layer = nn.Embedding(len(char_vocab), int(config['token_embedder']['char_dim']))
+            pre_char_embedding = token_embedder_states.pop('char_emb_layer.embedding.weight')
+            found_char_count = 0
+            for char, index in char_vocab:  # 调整character embedding
+                if char in char_lexicon:
+                    index_in_pre = char_lexicon.get(char)
+                    found_char_count += 1
+                else:
+                    index_in_pre = char_lexicon[OOV_TAG]
+                char_emb_layer.weight.data[index] = pre_char_embedding[index_in_pre]
+            print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.")
+            # 生成words到chars的映射
+            if config['token_embedder']['name'].lower() == 'cnn':
+                max_chars = config['token_embedder']['max_characters_per_token']
+            elif config['token_embedder']['name'].lower() == 'lstm':
+                max_chars = max(map(lambda x: len(x[0]), vocab)) + 2 # 需要补充两个<bow>与<eow>
+            else:
+                raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name']))
+            # 增加<bos>, <eos>所以加2.
+            self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab)+2, max_chars),
+                                                                    fill_value=char_vocab.to_index(PAD_TAG), dtype=torch.long),
+                                                         requires_grad=False)
+            for word, index in vocab:
+                if len(word)+2>max_chars:
+                    word = word[:max_chars-2]
+                if index==vocab.padding_idx:  # 如果是pad的话，需要和给定的对齐
+                    word = PAD_TAG
+                elif index==vocab.unknown_idx:
+                    word = OOV_TAG
+                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)]
+                char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids))
+                self.words_to_chars_embedding[index] = torch.LongTensor(char_ids)
+            for index, word in enumerate([BOS_TAG, EOS_TAG]):  # 加上<eos>, <bos>
+                if len(word)+2>max_chars:
+                    word = word[:max_chars-2]
+                char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)]
+                char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids))
+                self.words_to_chars_embedding[index+len(vocab)] = torch.LongTensor(char_ids)
+            self.char_vocab = char_vocab
+        else:
+            char_emb_layer = None
+
+        if config['token_embedder']['name'].lower() == 'cnn':
+            self.token_embedder = ConvTokenEmbedder(
+                config, word_emb_layer, char_emb_layer)
+        elif config['token_embedder']['name'].lower() == 'lstm':
+            self.token_embedder = LstmTokenEmbedder(
+                config, word_emb_layer, char_emb_layer)
+        self.token_embedder.load_state_dict(token_embedder_states, strict=False)
+
+        self.output_dim = config['encoder']['projection_dim']
+
+        if config['encoder']['name'].lower() == 'elmo':
+            self.encoder = ElmobiLm(config)
+        elif config['encoder']['name'].lower() == 'lstm':
+            self.encoder = LstmbiLm(config)
+        self.encoder.load_state_dict(torch.load(os.path.join(model_dir, 'encoder.pkl'),
+                                                map_location='cpu'))
+
+        self.bos_index = len(vocab)
+        self.eos_index = len(vocab) + 1
+        self._pad_index = vocab.padding_idx
+
+        if cache_word_reprs:
+            if config['token_embedder']['char_dim']>0:  # 只有在使用了chars的情况下有用
+                print("Start to generate cache word representations.")
+                batch_size = 320
+                num_batches = self.words_to_chars_embedding.size(0)//batch_size + \
+                              int(self.words_to_chars_embedding.size(0)%batch_size!=0)
+                self.cached_word_embedding = nn.Embedding(self.words_to_chars_embedding.size(0),
+                                                          config['encoder']['projection_dim'])
+                with torch.no_grad():
+                    for i in range(num_batches):
+                        words = torch.arange(i*batch_size, min((i+1)*batch_size, self.words_to_chars_embedding.size(0))).long()
+                        chars = self.words_to_chars_embedding[words].unsqueeze(1)  # batch_size x 1 x max_chars
+                        word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach()  # batch_size x 1 x config['encoder']['projection_dim']
+                        self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1)
+                    print("Finish generating cached word representations. Going to delete the character encoder.")
+                del self.token_embedder, self.words_to_chars_embedding
+            else:
+                print("There is no need to cache word representations, since no character information is used.")
+
+    def forward(self, words):
+        """Encode word indices with the loaded ELMo model; <bos>/<eos> are added around each sentence internally.
+
+        :param words: batch_size x max_len
+        :return: num_layers x batch_size x max_len x hidden_size
+        """
+        # Make room for <bos> and <eos> around every sentence.
+        batch_size, max_len = words.size()
+        expanded_words = words.new_zeros(batch_size, max_len + 2)  # zero-filled, i.e. assumes the padding index is 0
+        seq_len = words.ne(self._pad_index).sum(dim=-1)
+        expanded_words[:, 1:-1] = words
+        expanded_words[:, 0].fill_(self.bos_index)
+        expanded_words[torch.arange(batch_size).to(words), seq_len+1] = self.eos_index
+        seq_len = seq_len + 2
+        if hasattr(self, 'cached_word_embedding'):
+            token_embedding = self.cached_word_embedding(expanded_words)
+        else:
+            has_chars = hasattr(self, 'words_to_chars_embedding')
+            chars = self.words_to_chars_embedding[expanded_words] if has_chars else None
+            token_embedding = self.token_embedder(expanded_words, chars)
+        # Compare case-insensitively, exactly like the constructor does when it builds self.encoder.
+        encoder_name = self.config['encoder']['name'].lower()
+        if encoder_name == 'elmo':
+            encoder_output = self.encoder(token_embedding, seq_len)
+            sz = encoder_output.size()
+            token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3])
+            encoder_output = torch.cat([token_embedding, encoder_output], dim=0)
+        elif encoder_name == 'lstm':
+            encoder_output = self.encoder(token_embedding, seq_len)
+        else:
+            raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name']))
+
+        # Drop <bos>/<eos>. The removal is not exact (the last position is cut), but it should not affect the result.
+        encoder_output = encoder_output[:, :, 1:-1]
+
+        return encoder_output
diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py
index e123fda6..38a35fc9 100644
--- a/fastNLP/modules/encoder/bert.py
+++ b/fastNLP/modules/encoder/bert.py
@@ -269,8 +269,9 @@ class BertModel(nn.Module):
                  attention_probs_dropout_prob=0.1,
                  max_position_embeddings=512,
                  type_vocab_size=2,
-                 initializer_range=0.02, **kwargs):
+                 initializer_range=0.02):
         super(BertModel, self).__init__()
+        self.hidden_size = hidden_size
         self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
                                          type_vocab_size, hidden_dropout_prob)
         self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py
index 481ad7ad..6ce63d1a 100644
--- a/fastNLP/modules/encoder/char_encoder.py
+++ b/fastNLP/modules/encoder/char_encoder.py
@@ -22,10 +22,10 @@ class ConvolutionCharEncoder(nn.Module):
     :param initial_method: 初始化参数的方式, 默认为`xavier normal`
     """
     
-    def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None):
+    def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None):
         super(ConvolutionCharEncoder, self).__init__()
         self.convs = nn.ModuleList([
-            nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4))
+            nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, kernels[i]//2))
             for i in range(len(kernels))])
         
         initial_parameter(self, initial_method)
diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py
index ae6bea04..68605c98 100644
--- a/fastNLP/modules/encoder/conv_maxpool.py
+++ b/fastNLP/modules/encoder/conv_maxpool.py
@@ -5,9 +5,6 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from ..utils import initial_parameter
-
-
 class ConvMaxpool(nn.Module):
     """
     别名：:class:`fastNLP.modules.ConvMaxpool`   :class:`fastNLP.modules.encoder.conv_maxpool.ConvMaxpool`
@@ -19,20 +16,15 @@ class ConvMaxpool(nn.Module):
     :param int in_channels: 输入channel的大小，一般是embedding的维度; 或encoder的output维度
     :param int,tuple(int) out_channels: 输出channel的数量。如果为list，则需要与kernel_sizes的数量保持一致
     :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。
-    :param int stride: 见pytorch Conv1D文档。所有kernel共享一个stride。
-    :param int padding: 见pytorch Conv1D文档。所有kernel共享一个padding。
-    :param int dilation: 见pytorch Conv1D文档。所有kernel共享一个dilation。
-    :param int groups: 见pytorch Conv1D文档。所有kernel共享一个groups。
-    :param bool bias: 见pytorch Conv1D文档。所有kernel共享一个bias。
     :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh
-    :param str initial_method: str。
     """
     
-    def __init__(self, in_channels, out_channels, kernel_sizes,
-                 stride=1, padding=0, dilation=1,
-                 groups=1, bias=True, activation="relu", initial_method=None):
+    def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"):
         super(ConvMaxpool, self).__init__()
-        
+
+        for kernel_size in kernel_sizes:
+            assert kernel_size%2==1, "kernel size has to be odd numbers."
+
         # convolution
         if isinstance(kernel_sizes, (list, tuple, int)):
             if isinstance(kernel_sizes, int) and isinstance(out_channels, int):
@@ -49,11 +41,11 @@ class ConvMaxpool(nn.Module):
                 in_channels=in_channels,
                 out_channels=oc,
                 kernel_size=ks,
-                stride=stride,
-                padding=padding,
-                dilation=dilation,
-                groups=groups,
-                bias=bias)
+                stride=1,
+                padding=ks//2,
+                dilation=1,
+                groups=1,
+                bias=None)
                 for oc, ks in zip(out_channels, kernel_sizes)])
         
         else:
@@ -70,9 +62,7 @@ class ConvMaxpool(nn.Module):
         else:
             raise Exception(
                 "Undefined activation function: choose from: relu, tanh, sigmoid")
-        
-        initial_parameter(self, initial_method)
-    
+
     def forward(self, x, mask=None):
         """
 
@@ -86,7 +76,7 @@ class ConvMaxpool(nn.Module):
         xs = [self.activation(conv(x)) for conv in self.convs]  # [[N,C,L], ...]
         if mask is not None:
             mask = mask.unsqueeze(1)  # B x 1 x L
-            xs = [x.masked_fill_(mask, float('-inf')) for x in xs]
+            xs = [x.masked_fill_(mask.eq(0), float('-inf')) for x in xs]
         # max-pooling
         xs = [F.max_pool1d(input=i, kernel_size=i.size(2)).squeeze(2)
               for i in xs]  # [[N, C], ...]
diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py
index c2dfab65..d8d6f533 100644
--- a/fastNLP/modules/encoder/embedding.py
+++ b/fastNLP/modules/encoder/embedding.py
@@ -3,48 +3,821 @@ __all__ = [
 ]
 import torch.nn as nn
 from ..utils import get_embeddings
+from .lstm import LSTM
+from ... import Vocabulary
+from abc import abstractmethod
+import torch
+from ...io import EmbedLoader
+import torch.nn.functional as F
+import os
+from ._elmo import _ElmoModel
+from ...io.file_utils import cached_path, _get_base_url
+from ._bert import _WordBertModel
+from typing import List
 
+from ... import DataSet, Batch, SequentialSampler
+from ...core.utils import _move_model_to_device, _get_model_device
 
-class Embedding(nn.Embedding):
+
+class Embedding(nn.Module):
     """
     别名：:class:`fastNLP.modules.Embedding`   :class:`fastNLP.modules.encoder.embedding.Embedding`
 
     Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度"""
     
-    def __init__(self, init_embed, padding_idx=None, dropout=0.0, sparse=False, max_norm=None, norm_type=2,
-                 scale_grad_by_freq=False):
+    def __init__(self, init_embed, dropout=0.0):
         """
 
         :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int),
-            第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding
-        :param None,int padding_idx: 该index的Embedding将一直为0.
+            第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding;
+            也可以传入TokenEmbedding对象
         :param float dropout: 对Embedding的输出的dropout。
-        :param bool sparse: 如果为True，则对Embedding的梯度将是sparse的，参考Pytorch Embedding获取更多信息。
-        :param None,float max_norm: 每个vector最大的norm能为多大
-        :param int norm_type: norm的类型
-        :param bool scale_grad_by_freq: 如果为True，将会把梯度除以这个词出现的次数.
         """
-        embed = get_embeddings(init_embed)
-        num_embeddings, embedding_dim = embed.weight.size()
-        
-        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx,
-                         max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq,
-                         sparse=sparse, _weight=embed.weight.data)
-        del embed
+        super(Embedding, self).__init__()
+
+        self.embed = get_embeddings(init_embed)
         
         self.dropout = nn.Dropout(dropout)
+        if not isinstance(self.embed, TokenEmbedding):
+            self._embed_size = self.embed.weight.size(1)
+        else:
+            self._embed_size = self.embed.embed_size
     
     def forward(self, x):
         """
         :param torch.LongTensor x: [batch, seq_len]
         :return: torch.Tensor : [batch, seq_len, embed_dim]
         """
-        x = super().forward(x)
+        x = self.embed(x)
         return self.dropout(x)
 
+    @property
+    def embed_size(self) -> int:
+        return self._embed_size
+
+    @property
+    def embedding_dim(self) -> int:
+        return self._embed_size
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters may be optimized. True: all parameters are trainable; False: no parameter is
+        trainable; None: some parameters are trainable and some are not.
+        """
+        if not isinstance(self.embed, TokenEmbedding):
+            return self.embed.weight.requires_grad
+        else:
+            return self.embed.requires_grad
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        if not isinstance(self.embed, TokenEmbedding):
+            self.embed.weight.requires_grad = value
+        else:
+            self.embed.requires_grad = value
+
+    @property
+    def size(self):
+        """Size of the embedding as a ``torch.Size``; for a TokenEmbedding this is (len(vocab), embed_size)."""
+        if isinstance(self.embed, TokenEmbedding):
+            return torch.Size((len(self.embed._word_vocab), self.embed.embed_size))
+        return self.embed.weight.size()
+
+
+class TokenEmbedding(nn.Module):
+    """Base class of the embeddings that are built on a word ``Vocabulary`` which defines a padding index."""
+
+    def __init__(self, vocab):
+        super(TokenEmbedding, self).__init__()
+        assert vocab.padding_idx is not None, "You vocabulary must have padding."
+        self._word_vocab = vocab
+        self._word_pad_index = vocab.padding_idx
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters may be optimized. True: all parameters are trainable; False: no parameter is
+        trainable; None: some parameters are trainable and some are not (or there are no parameters at all).
+        """
+        requires_grads = set([param.requires_grad for param in self.parameters()])
+        return requires_grads.pop() if len(requires_grads) == 1 else None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for param in self.parameters():
+            param.requires_grad = value
+
+    @abstractmethod
+    def get_original_vocab(self):
+        pass
+
+    @property
+    def embed_size(self) -> int:
+        return self._embed_size
+
+    def get_word_vocab(self):
+        """
+        Return the vocabulary of this embedding.
+
+        :return: Vocabulary
+        """
+        return self._word_vocab
+
+    @property
     def size(self):
+        """Size of the embedding as ``torch.Size((len(vocab), embed_size))``."""
+        return torch.Size((len(self._word_vocab), self._embed_size))
+
+
+class StaticEmbedding(TokenEmbedding):
+    """
+    Alias: :class:`fastNLP.modules.StaticEmbedding`   :class:`fastNLP.modules.encoder.embedding.StaticEmbedding`
+
+    StaticEmbedding component. Given the name (or file) of a pretrained embedding, the vectors of the words in
+    ``vocab`` are extracted from it; afterwards the module can be used like an ordinary embedding.
+
+    :param vocab: Vocabulary; it must define a padding index.
+    :param model_dir_or_name: a pretrained static embedding can be selected in two ways: by passing the path of an
+        embedding file, or by passing the name of an embedding. Names are matched case-insensitively; the supported
+        names include {`en` or `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d,
+        `en-word2vec-300` : GoogleNews-vectors-negative300}. A name is resolved through ``cached_path``, which is
+        expected to look the file up in the cache and download it when it is missing.
+    :param requires_grad: whether the embedding weights need gradients
+
+    """
+
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False):
+        super(StaticEmbedding, self).__init__(vocab)
+
+        # Static embeddings that can be downloaded by name. We probably need to host these on our own server.
+        PRETRAIN_URL = _get_base_url('static')
+        PRETRAIN_STATIC_FILES = {
+            'en': 'glove.840B.300d-cc1ad5e1.tar.gz',
+            'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz',
+            'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz",
+            'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz",
+            'en-fasttext': "cc.en.300.vec-d53187b2.gz",
+            'cn': "tencent_cn-dab24577.tar.gz",
+            'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz",
+        }
+
+        # Resolve the cache path; the lookup key is lower-cased to match the membership test.
+        if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES:
+            model_name = PRETRAIN_STATIC_FILES[model_dir_or_name.lower()]
+            model_url = PRETRAIN_URL + model_name
+            model_path = cached_path(model_url)
+            # check whether it exists
+        elif os.path.isfile(model_dir_or_name):
+            model_path = model_dir_or_name
+        else:
+            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
+
+        # Load the embedding restricted to vocab.
+        embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab)
+        embedding = torch.tensor(embedding)
+        self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1],
+                                      padding_idx=vocab.padding_idx,
+                                      max_norm=None, norm_type=2, scale_grad_by_freq=False,
+                                      sparse=False, _weight=embedding)
+        self._embed_size = self.embedding.weight.size(1)
+        self.requires_grad = requires_grad
+
+    def forward(self, words):
+        """
+        Look up the embeddings of the given word indices.
+
+        :param words: torch.LongTensor, [batch_size, max_len]
+        :return: torch.FloatTensor, [batch_size, max_len, embed_size]
+        """
+        return self.embedding(words)
+
+
+class ContextualEmbedding(TokenEmbedding):  # adds an optional per-sentence cache of computed embeddings
+    def __init__(self, vocab: Vocabulary):
+        super(ContextualEmbedding, self).__init__(vocab)
+
+    def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True):
+        """
+        Generating contextual embeddings is slow, so the embedding of every sentence can be cached here; afterwards
+        the generation does not have to run again for those sentences.
+
+        Example::
+
+            >>>
+
+        :param datasets: DataSet objects; each must have ``words`` set as an input field
+        :param batch_size: int, batch size used while computing the cached sentence representations
+        :param device: see the ``device`` argument of :class:`fastNLP.Trainer`
+        :param delete_weights: whether to delete the weights once the cache is built; when the contextual model does
+            not need to be fine-tuned, deleting the weights greatly reduces memory usage.
+        """
+        for index, dataset in enumerate(datasets):
+            try:
+                assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed."
+                assert 'words' in dataset.get_input_name(), "`words` field has to be set as input."
+            except Exception as e:
+                print(f"Exception happens at {index} dataset.")
+                raise e
+
+        sent_embeds = {}
+        _move_model_to_device(self, device=device)
+        device = _get_model_device(self)
+        pad_index = self._word_vocab.padding_idx
+        print("Start to calculate sentence representations.")
+        with torch.no_grad():
+            for index, dataset in enumerate(datasets):
+                try:
+                    batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), prefetch=False)
+                    for batch_x, batch_y in batch:
+                        words = batch_x['words'].to(device)
+                        words_list = words.tolist()
+                        seq_len = words.ne(pad_index).sum(dim=-1)
+                        max_len = words.size(1)
+                        # The output may contain CLS/SEP in some cases, so the padding is cut off counting from the end.
+                        seq_len_from_behind =(max_len - seq_len).tolist()
+                        word_embeds = self(words).detach().cpu().numpy()
+                        for b in range(words.size(0)):
+                            length = seq_len_from_behind[b]
+                            if length==0:
+                                sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b]
+                            else:
+                                sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length]
+                except Exception as e:
+                    print(f"Exception happens at {index} dataset.")
+                    raise e
+        print("Finish calculating sentence representations.")
+        self.sent_embeds = sent_embeds
+        if delete_weights:
+            self._delete_model_weights()
+
+    def _get_sent_reprs(self, words):
+        """
+        Return the cached representations of the sentences if a cache exists; return None when there is no cache.
+
+        :param words: torch.LongTensor
+        :return: zero-padded float tensor (batch x longest cached sentence x embed_size), or None
+        """
+        if hasattr(self, 'sent_embeds'):
+            words_list = words.tolist()
+            seq_len = words.ne(self._word_pad_index).sum(dim=-1)
+            _embeds = []
+            for b in range(len(words)):
+                words_i = tuple(words_list[b][:seq_len[b]])
+                embed = self.sent_embeds[words_i]
+                _embeds.append(embed)
+            max_sent_len = max(map(len, _embeds))
+            embeds = words.new_zeros(len(_embeds), max_sent_len, self.embed_size, dtype=torch.float,
+                                     device=words.device)
+            for i, embed in enumerate(_embeds):
+                embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device)
+            return embeds
+        return None
+
+    @abstractmethod
+    def _delete_model_weights(self):
+        """Delete the model that computes the representations, to save resources."""
+        raise NotImplementedError
+
+    def remove_sentence_cache(self):
+        """
+        Delete the cached sentence representations. Afterwards, if the model weights were not deleted, the
+        representations are computed dynamically again.
+        :return:
+        """
+        del self.sent_embeds
+
+
+class ElmoEmbedding(ContextualEmbedding):
+    """
+    Alias: :class:`fastNLP.modules.ElmoEmbedding`   :class:`fastNLP.modules.encoder.embedding.ElmoEmbedding`
+
+    Embedding backed by ELMo. After initialization, passing in words is enough to obtain their embeddings.
+    The pretrained ELMo models we provide come from https://github.com/HIT-SCIR/ELMoForManyLangs
+
+    Example::
+
+        >>>
+        >>>
+
+    :param vocab: the vocabulary
+    :param model_dir_or_name: a pretrained ELMo model can be selected in two ways: by the directory that holds the
+        ELMo weights, or by name (case-insensitive, resolved through ``cached_path``): {`en` : English, `cn` : Chinese}.
+    :param layers: str, the layers to return, separated by ','. '2' returns the second layer, '1,2' returns the last
+        two layers. The results of the layers are concatenated in the given order. Default: '2'.
+    :param requires_grad: bool, whether this layer needs gradients. Default: False
+    :param cache_word_reprs: if True, a representation is generated for every word at initialization time, the
+        character encoder is deleted and the cached embedding is used from then on. Default: False.
+    """
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en',
+                 layers: str='2', requires_grad: bool=False, cache_word_reprs: bool=False):
+        super(ElmoEmbedding, self).__init__(vocab)
+        layers = list(map(int, layers.split(',')))
+        assert len(layers) > 0, "Must choose one output"
+        for layer in layers:
+            assert 0 <= layer <= 2, "Layer index should be in range [0, 2]."
+        self.layers = layers
+
+        # Resolve model_dir_or_name: either a known model name (fetched through the cache) or a local directory.
+        PRETRAIN_URL = _get_base_url('elmo')
+        # TODO add the models hosted on Baidu cloud
+        PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz',
+                                     'cn': 'elmo_cn-5e9b34e2.tar.gz'}
+
+        if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR:
+            model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name.lower()]
+            model_url = PRETRAIN_URL + model_name
+            model_dir = cached_path(model_url)
+            # check whether it exists
+        elif os.path.isdir(model_dir_or_name):
+            model_dir = model_dir_or_name
+        else:
+            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
+        self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs)
+        self.requires_grad = requires_grad
+        self._embed_size = len(self.layers) * self.model.config['encoder']['projection_dim'] * 2
+
+    def forward(self, words: torch.LongTensor):
+        """
+        Compute the ELMo embedding of words. The ELMo paper describes 2L+1 layers of results; to make the result easy
+            to split, the token embedding is repeated once, so layer=0 is [token_embedding;token_embedding] and
+            layer=1 is [forward_hiddens;backward_hiddens].
+
+        :param words: batch_size x max_len
+        :return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers))
+        """
+        outputs = self._get_sent_reprs(words)
+        if outputs is not None:
+            return outputs
+        outputs = self.model(words)
+        if len(self.layers) == 1:
+            outputs = outputs[self.layers[0]]
+        else:
+            outputs = torch.cat([*outputs[self.layers]], dim=-1)
+
+        return outputs
+
+    def _delete_model_weights(self):
+        del self.layers, self.model
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters may be optimized. True: all parameters are trainable; False: no parameter is
+        trainable; None: some parameters are trainable and some are not. ``words_to_chars_embedding`` is ignored
+        here because it is not considered a weight.
+        """
+        requires_grads = set([param.requires_grad for name, param in self.named_parameters()
+                             if 'words_to_chars_embedding' not in name])
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            if 'words_to_chars_embedding' in name:  # integer index table: it must never be asked to require gradients
+                continue
+            param.requires_grad = value
+
+
+class BertEmbedding(ContextualEmbedding):
+    """
+    别名：:class:`fastNLP.modules.BertEmbedding`   :class:`fastNLP.modules.encoder.embedding.BertEmbedding`
+
+    使用BERT对words进行encode的Embedding。
+
+    Example::
+
+        >>>
+
+
+    :param fastNLP.Vocabulary vocab: 词表
+    :param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased``
+    :param str layers:最终结果中的表示。以','隔开层数，可以以负数去索引倒数几层
+    :param str pool_method: 因为在bert中，每个word会被表示为多个word pieces, 当获取一个word的表示的时候，怎样从它的word pieces
+        中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``.
+    :param bool include_cls_sep: bool，在bert计算句子的表示的时候，需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样
+        会使得word embedding的结果比输入的结果长两个token。在使用 :class::StackEmbedding 可能会遇到问题。
+    :param bool requires_grad: 是否需要gradient。
+    """
+    def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1',
+                 pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False):
+        super(BertEmbedding, self).__init__(vocab)
+        # 根据model_dir_or_name检查是否存在并下载
+        PRETRAIN_URL = _get_base_url('bert')
+        PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip',
+                                     'en-base-uncased': 'bert-base-uncased-3413b23c.zip',
+                                     'en-base-cased': 'bert-base-cased-f89bfe08.zip',
+                                     'en-large-uncased': 'bert-large-uncased-20939f45.zip',
+                                     'en-large-cased': 'bert-large-cased-e0cf90fc.zip',
+
+                                     'cn': 'bert-base-chinese-29d0a84a.zip',
+                                     'cn-base': 'bert-base-chinese-29d0a84a.zip',
+
+                                     'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip',
+                                     'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip',
+                                     'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip',
+                                     }
+
+        if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR:
+            model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name]
+            model_url = PRETRAIN_URL + model_name
+            model_dir = cached_path(model_url)
+            # 检查是否存在
+        elif os.path.isdir(model_dir_or_name):
+            model_dir = model_dir_or_name
+        else:
+            raise ValueError(f"Cannot recognize {model_dir_or_name}.")
+
+        self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers,
+                                    pool_method=pool_method, include_cls_sep=include_cls_sep)
+
+        self.requires_grad = requires_grad
+        self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size
+
+    def _delete_model_weights(self):
+        del self.model
+
+    def forward(self, words):
+        """
+        Compute the BERT embedding of ``words``. Per the original note, [CLS]/[SEP] are added around each
+        sentence and kept or removed according to include_cls_sep (that flag is given to the wrapped model).
+
+        :param words: batch_size x max_len
+        :return: torch.FloatTensor; the outputs of self.model concatenated along the last dimension
+        """
+        cached = self._get_sent_reprs(words)  # presumably precomputed sentence representations -- confirm
+        if cached is None:
+            # nothing precomputed: run the model and join its outputs on the feature dimension
+            per_layer = self.model(words)
+            return torch.cat(list(per_layer), dim=-1)
+
+        return cached
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters are trainable; names containing 'word_pieces_lengths' are ignored.
+        :return: True / False when all considered parameters agree, otherwise None
+        """
+        flags = {param.requires_grad for name, param in self.named_parameters()
+                 if 'word_pieces_lengths' not in name}
+        # exactly one distinct flag -> uniform setting; anything else (mixed or no parameters) -> None
+        if len(flags) != 1:
+            return None
+        return flags.pop()
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        # Apply `value` to every parameter except those whose name contains 'word_pieces_lengths'.
+        for name, param in self.named_parameters():
+            if 'word_pieces_lengths' not in name:  # excluded: must not take part in requires_grad
+                param.requires_grad = value
+
+
+def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1):
+    """
+    Build a character-level Vocabulary from the words of a word-level Vocabulary.
+
+    :param vocab: word vocabulary; iterating it yields (word, index) pairs
+    :param min_freq: forwarded to the constructor of the character Vocabulary
+    :return: the character Vocabulary
+    """
+    characters = Vocabulary(min_freq=min_freq)
+    for token, _ in vocab:
+        characters.add_word_lst([ch for ch in token])
+    return characters
+
+
+class CNNCharEmbedding(TokenEmbedding):
+    """
+    Alias: :class:`fastNLP.modules.CNNCharEmbedding`   :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding`
+
+    Character-level word embedding computed with CNNs: conv(x) -> activation -> pool over characters -> fc.
+    The outputs of the differently sized kernels are concatenated.
+
+    Example::
+
+        >>> embed = CNNCharEmbedding(vocab, embed_size=50)  # vocab: an already built Vocabulary
+
+
+    :param vocab: word vocabulary
+    :param embed_size: size of the produced word embedding. Default: 50.
+    :param char_emb_size: size of the character embedding; characters are collected from vocab. Default: 50.
+    :param filter_nums: number of filters per kernel; same length as kernel_sizes. Default: (40, 30, 20).
+    :param kernel_sizes: kernel widths, odd values only. Default: (5, 3, 1).
+    :param pool_method: how the character features are pooled into one word vector, 'max' or 'avg'.
+    :param activation: applied after the CNN: 'relu', 'sigmoid', 'tanh', None (identity) or a callable.
+    :param min_char_freq: minimum character frequency given to the character vocabulary. Default: 2.
+    """
+    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50,
+                 filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max',
+                 activation='relu', min_char_freq: int=2):
+        super(CNNCharEmbedding, self).__init__(vocab)
+
+        for kernel in kernel_sizes:
+            assert kernel % 2 == 1, "Only odd kernel is allowed."
+        assert len(filter_nums) == len(kernel_sizes), "filter_nums and kernel_sizes must have the same length."
+        assert pool_method in ('max', 'avg')
+        self.pool_method = pool_method
+        # activation function: None -> identity, a callable is used as is, a string must be a known name
+        if activation is None:
+            self.activation = lambda x: x
+        elif callable(activation):
+            self.activation = activation
+        elif isinstance(activation, str) and activation.lower() in ('relu', 'sigmoid', 'tanh'):
+            self.activation = {'relu': F.relu, 'sigmoid': torch.sigmoid, 'tanh': torch.tanh}[activation.lower()]
+        else:
+            # also reached for unknown strings, which previously left self.activation unset
+            raise Exception(
+                "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
+
+        print("Start constructing character vocabulary.")
+        # build the character vocabulary from the words of vocab
+        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+        self.char_pad_index = self.char_vocab.padding_idx
+        print(f"In total, there are {len(self.char_vocab)} distinct characters.")
+        # Lookup tables indexed by word id: the character ids of the word (padded with char_pad_index up to
+        # the longest word) and the length of the word. Kept as frozen Parameters, presumably so that they
+        # follow the module across devices and into the state dict -- TODO confirm.
+        self.max_word_len = max(map(lambda x: len(x[0]), vocab))
+        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len),
+                                                                fill_value=self.char_pad_index, dtype=torch.long),
+                                                     requires_grad=False)
+        self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
+        for word, index in vocab:
+            # the padding word is not special-cased: it is spelled out like any other word, so every <pad>
+            # token shares one regular embedding
+            self.words_to_chars_embedding[index, :len(word)] = \
+                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
+            self.word_lengths[index] = len(word)
+        self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
+
+        # one Conv1d per kernel size; padding of kernel // 2 keeps the character dimension unchanged
+        self.convs = nn.ModuleList([nn.Conv1d(
+            char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2)
+            for i in range(len(kernel_sizes))])
+        self._embed_size = embed_size
+        self.fc = nn.Linear(sum(filter_nums), embed_size)
+
+    def forward(self, words):
         """
-        Embedding的大小
-        :return: torch.Size()
+        Map word indices to their character-CNN based representations.
+
+        :param words: [batch_size, max_len]
+        :return: [batch_size, max_len, embed_size]
         """
-        return self.weight.size()
+        batch_size, max_len = words.size()
+        chars = self.words_to_chars_embedding[words]  # batch_size x max_len x max_word_len
+        word_lengths = self.word_lengths[words] # batch_size x max_len
+        max_word_len = word_lengths.max()
+        chars = chars[:, :, :max_word_len]
+        # the mask is True at padded character positions
+        chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len
+        chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x char_emb_size
+
+        reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1)
+        reshaped_chars = reshaped_chars.transpose(1, 2)  # B' x E x M
+        conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1)
+                      for conv in self.convs]
+        conv_chars = torch.cat(conv_chars, dim=-1).contiguous()  # B x max_len x max_word_len x sum(filters)
+        conv_chars = self.activation(conv_chars)
+        if self.pool_method == 'max':
+            conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
+            chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters)
+        else:
+            conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
+            chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
+        chars = self.fc(chars)
+        return chars
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters are trainable. True / False: all agree; None: mixed settings.
+        :return: the two lookup tables (words_to_chars_embedding, word_lengths) are not considered
+        """
+        params = []
+        for name, param in self.named_parameters():
+            if 'words_to_chars_embedding' not in name and 'word_lengths' not in name:
+                params.append(param.requires_grad)
+        requires_grads = set(params)
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # integer lookup tables stay frozen
+                continue
+            param.requires_grad = value
+
+
+class LSTMCharEmbedding(TokenEmbedding):
+    """
+    Alias: :class:`fastNLP.modules.LSTMCharEmbedding`   :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding`
+
+    Character-level word embedding that encodes the characters of each word with an LSTM.
+
+    Example::
+
+        >>> embed = LSTMCharEmbedding(vocab, embed_size=50)  # vocab: an already built Vocabulary
+
+    :param vocab: word vocabulary
+    :param embed_size: size of the produced word embedding. Default: 50.
+    :param char_emb_size: size of the character embedding. Default: 50.
+    :param hidden_size: total LSTM hidden size; halved per direction when bidirectional. Default: 50.
+    :param pool_method: 'max' or 'avg'
+    :param activation: 'relu', 'sigmoid', 'tanh', None (identity) or a callable.
+    :param min_char_freq: minimum character frequency given to the character vocabulary. Default: 2.
+    :param bidirectional: whether a bidirectional LSTM is used. Default: True.
+    """
+    def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50,
+                 pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True):
+        super(LSTMCharEmbedding, self).__init__(vocab)
+
+        assert hidden_size % 2 == 0, "Only even hidden_size is allowed."
+
+        assert pool_method in ('max', 'avg')
+        self.pool_method = pool_method
+
+        # activation function: None -> identity, a callable is used as is, a string must be a known name
+        if activation is None:
+            self.activation = lambda x: x
+        elif callable(activation):
+            self.activation = activation
+        elif isinstance(activation, str) and activation.lower() in ('relu', 'sigmoid', 'tanh'):
+            self.activation = {'relu': F.relu, 'sigmoid': torch.sigmoid, 'tanh': torch.tanh}[activation.lower()]
+        else:
+            # also reached for unknown strings, which previously left self.activation unset
+            raise Exception(
+                "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]")
+
+        print("Start constructing character vocabulary.")
+        # build the character vocabulary from the words of vocab
+        self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq)
+        self.char_pad_index = self.char_vocab.padding_idx
+        print(f"In total, there are {len(self.char_vocab)} distinct characters.")
+        # Lookup tables indexed by word id: character ids of the word (padded with char_pad_index up to the
+        # longest word) and the length of the word; both are frozen integer Parameters.
+        self.max_word_len = max(map(lambda x: len(x[0]), vocab))
+        self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len),
+                                                                fill_value=self.char_pad_index, dtype=torch.long),
+                                                     requires_grad=False)
+        self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False)
+        for word, index in vocab:
+            # the padding word is not special-cased: it is spelled out character by character like any
+            # other word
+            self.words_to_chars_embedding[index, :len(word)] = \
+                torch.LongTensor([self.char_vocab.to_index(c) for c in word])
+            self.word_lengths[index] = len(word)
+        self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size)
+
+        # fc consumes the full hidden_size; each direction of a bidirectional LSTM only gets half of it,
+        # so the concatenated LSTM output still has hidden_size features
+        self.fc = nn.Linear(hidden_size, embed_size)
+        hidden_size = hidden_size // 2 if bidirectional else hidden_size
+
+        self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True)
+        self._embed_size = embed_size
+        self.bidirectional = bidirectional
+
+    def forward(self, words):
+        """
+        Map word indices to their character-LSTM based representations.
+
+        :param words: [batch_size, max_len]
+        :return: [batch_size, max_len, embed_size]
+        """
+        batch_size, max_len = words.size()
+        chars = self.words_to_chars_embedding[words]  # batch_size x max_len x max_word_len
+        word_lengths = self.word_lengths[words]  # batch_size x max_len
+        max_word_len = word_lengths.max()
+        chars = chars[:, :, :max_word_len]
+        # the mask is True at padded character positions
+        chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len
+        chars = self.char_embedding(chars)  # batch_size x max_len x max_word_len x char_emb_size
+
+        reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1)
+        char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len)
+        lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1)
+        # batch_size x max_len x max_word_len x H
+
+        lstm_chars = self.activation(lstm_chars)
+        if self.pool_method == 'max':
+            lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf'))
+            chars, _ = torch.max(lstm_chars, dim=-2)  # batch_size x max_len x H
+        else:
+            lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0)
+            chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float()
+
+        chars = self.fc(chars)
+
+        return chars
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the embedding parameters are trainable. True / False: all agree; None: mixed settings.
+        :return: the two lookup tables (words_to_chars_embedding, word_lengths) are not considered
+        """
+        params = []
+        for name, param in self.named_parameters():
+            if 'words_to_chars_embedding' not in name and 'word_lengths' not in name:
+                params.append(param.requires_grad)  # collect the flag, not the parameter itself
+        requires_grads = set(params)
+        if len(requires_grads) == 1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for name, param in self.named_parameters():
+            if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # integer lookup tables stay frozen
+                continue
+            param.requires_grad = value
+
+
+class StackEmbedding(TokenEmbedding):
+    """
+    Alias: :class:`fastNLP.modules.StackEmbedding`   :class:`fastNLP.modules.encoder.embedding.StackEmbedding`
+
+    Combines several embeddings into one by concatenating their outputs.
+
+    Example::
+
+        >>> embed = StackEmbedding([embed_1, embed_2])  # TokenEmbeddings built on the same vocabulary
+
+
+    :param embeds: list of TokenEmbedding objects; all of them must use the same word vocabulary
+
+    """
+    def __init__(self, embeds: List[TokenEmbedding]):
+        # validate the input before touching it, so that wrong types fail with the intended messages
+        assert isinstance(embeds, list)
+        for embed in embeds:
+            assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported."
+        vocabs = [embed.get_word_vocab() for embed in embeds]
+        _vocab = vocabs[0]
+        for vocab in vocabs[1:]:
+            assert vocab == _vocab, "All embeddings should use the same word vocabulary."
+
+        super(StackEmbedding, self).__init__(_vocab)
+        self.embeds = nn.ModuleList(embeds)
+        # size of the concatenated output produced by forward()
+        self._embed_size = sum([embed.embed_size for embed in self.embeds])
+
+    def append(self, embed: TokenEmbedding):
+        """
+        Append an embedding at the end and grow embed_size accordingly.
+        :param embed: the TokenEmbedding to add
+        """
+        assert isinstance(embed, TokenEmbedding)
+        self._embed_size += embed.embed_size
+        self.embeds.append(embed)
+
+    def pop(self):
+        """Remove the last embedding, shrink embed_size accordingly and return the removed embedding."""
+        embed = self.embeds[-1]
+        del self.embeds[-1]  # index-based removal; does not rely on an argument-less ModuleList.pop
+        self._embed_size -= embed.embed_size
+        return embed
+
+    @property
+    def embed_size(self):
+        return self._embed_size
+
+    @property
+    def requires_grad(self):
+        """
+        Whether the stacked embeddings are trainable. True / False: all agree; None: mixed settings.
+        :return:
+        """
+        requires_grads = set([embed.requires_grad for embed in self.embeds])
+        if len(requires_grads)==1:
+            return requires_grads.pop()
+        else:
+            return None
+
+    @requires_grad.setter
+    def requires_grad(self, value):
+        for embed in self.embeds:
+            embed.requires_grad = value
+
+    def forward(self, words):
+        """
+        Run every stacked embedding on ``words`` and concatenate the results in order on the last dimension.
+
+        :param words: batch_size x max_len
+        :return: tensor whose last dimension depends on the embeddings currently in the stack
+        """
+        outputs = []
+        for embed in self.embeds:
+            outputs.append(embed(words))
+        return torch.cat(outputs, dim=-1)
+
diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py
index b4f960e7..b4d3aff2 100644
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
@@ -20,7 +20,7 @@ class LSTM(nn.Module):
     LSTM 模块, 轻量封装的Pytorch LSTM
 
     :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态 `h` 的特征维度
+    :param hidden_size: 隐状态 `h` 的特征维度.
     :param num_layers: rnn的层数. Default: 1
     :param dropout: 层间dropout概率. Default: 0
     :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py
index 741429bb..c87f3a68 100644
--- a/fastNLP/modules/utils.py
+++ b/fastNLP/modules/utils.py
@@ -82,7 +82,7 @@ def get_embeddings(init_embed):
     if isinstance(init_embed, tuple):
         res = nn.Embedding(
             num_embeddings=init_embed[0], embedding_dim=init_embed[1])
-    elif isinstance(init_embed, nn.Embedding):
+    elif isinstance(init_embed, nn.Module):
         res = init_embed
     elif isinstance(init_embed, torch.Tensor):
         res = nn.Embedding.from_pretrained(init_embed, freeze=False)
@@ -130,3 +130,17 @@ def summary(model: nn.Module):
     strings = [bar] + strings + [bar]
     print('\n'.join(strings))
     return total, total_train, total_nontrain
+
+
+def get_dropout_mask(drop_p: float, tensor: torch.Tensor):
+    """
+    Build a dropout mask shaped like ``tensor``: entries are 0 with probability drop_p, else 1/(1-drop_p).
+
+    :param drop_p: float, probability of an entry being set to 0.
+    :param tensor: torch.Tensor the mask is created from via torch.ones_like
+    :return: mask with the same shape as tensor; it is always random, callers decide when to apply it
+    """
+    mask_x = torch.ones_like(tensor)
+    # training=True is required: with training=False dropout is the identity and the mask stays all ones
+    nn.functional.dropout(mask_x, p=drop_p, training=True, inplace=True)
+    return mask_x
\ No newline at end of file
diff --git a/reproduction/seqence_labelling/cws/model/module.py b/reproduction/seqence_labelling/cws/model/module.py
index 6cd8b5e3..86149f39 100644
--- a/reproduction/seqence_labelling/cws/model/module.py
+++ b/reproduction/seqence_labelling/cws/model/module.py
@@ -1,11 +1,10 @@
 from torch import nn
 import torch
-from fastNLP.modules import Embedding
 import numpy as np
 
 class SemiCRFShiftRelay(nn.Module):
     """
-    该模块是一个decoder，但
+    该模块是一个decoder，但当前不支持含有tag的decode。
 
     """
     def __init__(self, L):
diff --git a/reproduction/seqence_labelling/cws/train_shift_relay.py b/reproduction/seqence_labelling/cws/train_shift_relay.py
index ed512252..805521e7 100644
--- a/reproduction/seqence_labelling/cws/train_shift_relay.py
+++ b/reproduction/seqence_labelling/cws/train_shift_relay.py
@@ -32,11 +32,11 @@ lr = 0.02
 #########hyper
 device = 0
 
-# !!!!这里前往不要放完全路径，因为这样会暴露你们在服务器上的用户名，比较危险。所以一定要使用相对路径，最好把数据放到
+# !!!!这里千万不要放完全路径，因为这样会暴露你们在服务器上的用户名，比较危险。所以一定要使用相对路径，最好把数据放到
 #   你们的reproduction路径下，然后设置.gitignore
-file_dir = '/path/to/pku'
-char_embed_path = '/path/to/1grams_t3_m50_corpus.txt'
-bigram_embed_path = 'path/to/2grams_t3_m50_corpus.txt'
+file_dir = '/path/to/'
+char_embed_path = '/pretrain/vectors/1grams_t3_m50_corpus.txt'
+bigram_embed_path = '/pretrain/vectors/2grams_t3_m50_corpus.txt'
 bigram_vocab_opt = VocabularyOption(min_freq=3)
 char_embed_opt = EmbeddingOption(embed_filepath=char_embed_path)
 bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path)
@@ -44,7 +44,7 @@ bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path)
 data_name = os.path.basename(file_dir)
 cache_fp = 'caches/{}.pkl'.format(data_name)
 
-data = prepare_data(_cache_fp=cache_fp, _refresh=False)
+data = prepare_data(_cache_fp=cache_fp, _refresh=True)
 
 model = ShiftRelayCWSModel(char_embed=data.embeddings['chars'], bigram_embed=data.embeddings['bigrams'],
                            hidden_size=hidden_size, num_layers=num_layers,
diff --git a/requirements.txt b/requirements.txt
index dfd2b16e..7ea8fdac 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 numpy
 torch>=0.4.0
 tqdm
-nltk
\ No newline at end of file
+nltk
+requests
diff --git a/test/core/test_vocabulary.py b/test/core/test_vocabulary.py
index df0ebb1a..c161ad9d 100644
--- a/test/core/test_vocabulary.py
+++ b/test/core/test_vocabulary.py
@@ -100,13 +100,14 @@ class TestIndexing(unittest.TestCase):
         self.assertEqual(text, [vocab.to_word(idx) for idx in [vocab[w] for w in text]])
     
     def test_iteration(self):
-        vocab = Vocabulary()
+        vocab = Vocabulary(padding=None, unknown=None)  # presumably no pad/unk entries, so only words of `text` appear
         text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well", "in",
                 "works", "well", "in", "most", "cases", "scales", "well"]
         vocab.update(text)
         text = set(text)
-        for word in vocab:
+        for word, idx in vocab:  # iterating the vocabulary is unpacked as (word, index) pairs
             self.assertTrue(word in text)
+            self.assertTrue(idx < len(vocab))
 
 
 class TestOther(unittest.TestCase):
diff --git a/test/models/test_cnn_text_classification.py b/test/models/test_cnn_text_classification.py
index b83b7bad..2ea48220 100644
--- a/test/models/test_cnn_text_classification.py
+++ b/test/models/test_cnn_text_classification.py
@@ -12,7 +12,6 @@ class TestCNNText(unittest.TestCase):
         model = CNNText(init_emb,
                         NUM_CLS,
                         kernel_nums=(1, 3, 5),
-                        kernel_sizes=(2, 2, 2),
-                        padding=0,
+                        kernel_sizes=(1, 3, 5),
                         dropout=0.5)
         RUNNER.run_model_with_task(TEXT_CLS, model)
diff --git a/test/test_tutorials.py b/test/test_tutorials.py
index 128e4235..a38d5ae1 100644
--- a/test/test_tutorials.py
+++ b/test/test_tutorials.py
@@ -70,7 +70,7 @@ class TestTutorial(unittest.TestCase):
             break
 
         from fastNLP.models import CNNText
-        model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)
+        model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)
 
         from fastNLP import Trainer
         from copy import deepcopy
@@ -143,7 +143,7 @@ class TestTutorial(unittest.TestCase):
                        is_input=True)
 
         from fastNLP.models import CNNText
-        model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1)
+        model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)
 
         from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam