@@ -49,7 +49,7 @@ | |||||
.. code-block:: python | .. code-block:: python | ||||
from fastNLP.models import CNNText | from fastNLP.models import CNNText | ||||
model = CNNText((len(vocab),50), num_classes=5, padding=2, dropout=0.1) | |||||
model = CNNText((len(vocab),50), num_classes=5, dropout=0.1) | |||||
:class:`~fastNLP.models.CNNText` 的网络结构如下:: | :class:`~fastNLP.models.CNNText` 的网络结构如下:: | ||||
@@ -121,4 +121,4 @@ | |||||
In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8 | In Epoch:6/Step:12, got best dev performance:AccuracyMetric: acc=0.8 | ||||
Reloaded the best model. | Reloaded the best model. | ||||
这份教程只是简单地介绍了使用 fastNLP 工作的流程,具体的细节分析见 :doc:`/user/tutorial_one`
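A minimal sketch of the training flow behind the log above, assuming ``train_data`` and ``dev_data`` are prepared :class:`~fastNLP.DataSet` objects and using fastNLP's ``Trainer``, ``CrossEntropyLoss`` and ``AccuracyMetric``:

.. code-block:: python

    from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric

    trainer = Trainer(train_data=train_data, model=model,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                      dev_data=dev_data)
    trainer.train()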
@@ -554,6 +554,7 @@ class DataSet(object): | |||||
self.field_arrays[new_name].name = new_name | self.field_arrays[new_name].name = new_name | ||||
else: | else: | ||||
raise KeyError("DataSet has no field named {}.".format(old_name)) | raise KeyError("DataSet has no field named {}.".format(old_name)) | ||||
return self | |||||
def set_target(self, *field_names, flag=True): | def set_target(self, *field_names, flag=True): | ||||
""" | """ | ||||
@@ -593,7 +594,7 @@ class DataSet(object): | |||||
try: | try: | ||||
self.field_arrays[name].is_input = flag | self.field_arrays[name].is_input = flag | ||||
except SetInputOrTargetException as e: | except SetInputOrTargetException as e: | ||||
print(f"Cannot set field:{name} as input.") | |||||
print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") | |||||
raise e | raise e | ||||
else: | else: | ||||
raise KeyError("{} is not a valid field name.".format(name)) | raise KeyError("{} is not a valid field name.".format(name)) | ||||
@@ -761,7 +762,20 @@ class DataSet(object): | |||||
self._add_apply_field(results, new_field_name, kwargs) | self._add_apply_field(results, new_field_name, kwargs) | ||||
return results | return results | ||||
def add_seq_len(self, field_name:str, new_field_name='seq_len'): | |||||
""" | |||||
将使用len()直接对field_name中每个元素作用,将其结果作为sequence length, 并放入new_field_name这个field。
:param str field_name: 需要计算长度的field的名称
:param str new_field_name: 保存长度结果的field名称,默认为'seq_len'
:return: self
""" | |||||
if self.has_field(field_name=field_name): | |||||
self.apply_field(len, field_name, new_field_name=new_field_name) | |||||
else: | |||||
raise KeyError(f"Field:{field_name} not found.") | |||||
return self | |||||
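# Usage sketch (hypothetical `ds`): assume `ds` is a DataSet whose 'words' field holds token
# lists. add_seq_len() applies len() to every cell, stores the result in a new 'seq_len'
# field, and returns self so calls can be chained:
#
#     ds.add_seq_len('words').set_input('words', 'seq_len')
#     print(ds[0]['seq_len'])   # number of tokens in the first instance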
def drop(self, func, inplace=True): | def drop(self, func, inplace=True): | ||||
""" | """ | ||||
func接受一个Instance,返回bool值。返回值为True时,该Instance会被移除或者加入到返回的DataSet中。 | func接受一个Instance,返回bool值。返回值为True时,该Instance会被移除或者加入到返回的DataSet中。 | ||||
@@ -6,6 +6,7 @@ import numpy as np | |||||
from typing import Any | from typing import Any | ||||
from abc import abstractmethod | from abc import abstractmethod | ||||
from copy import deepcopy | from copy import deepcopy | ||||
from collections import Counter | |||||
class SetInputOrTargetException(Exception): | class SetInputOrTargetException(Exception): | ||||
def __init__(self, msg, index=None, field_name=None): | def __init__(self, msg, index=None, field_name=None): | ||||
@@ -61,6 +62,7 @@ class FieldArray: | |||||
if value: | if value: | ||||
self._cell_ndim = None | self._cell_ndim = None | ||||
self.dtype = None | self.dtype = None | ||||
self._ignore_type = value | |||||
@property | @property | ||||
def is_input(self): | def is_input(self): | ||||
@@ -223,6 +225,155 @@ class FieldArray: | |||||
return self | return self | ||||
def split(self, sep:str=None, inplace:bool=True): | |||||
""" | |||||
依次对自身的元素使用.split()方法,应该只有当本field的元素为str时,该方法才有用。
:param sep: 分割符,如果为None则直接调用str.split()。 | |||||
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。 | |||||
:return: List[List[str]] or self | |||||
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
new_contents.append(cell.split(sep)) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def int(self, inplace:bool=True): | |||||
""" | |||||
将本field中的值调用int(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), | |||||
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) | |||||
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。 | |||||
:return: List[int], List[List[int]], self | |||||
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
if isinstance(cell, list): | |||||
new_contents.append([int(value) for value in cell]) | |||||
else: | |||||
new_contents.append(int(cell)) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def float(self, inplace=True): | |||||
""" | |||||
将本field中的值调用float(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), | |||||
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) | |||||
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。 | |||||
:return: | |||||
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
if isinstance(cell, list): | |||||
new_contents.append([float(value) for value in cell]) | |||||
else: | |||||
new_contents.append(float(cell)) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def bool(self, inplace=True): | |||||
""" | |||||
将本field中的值调用bool(cell). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), | |||||
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) | |||||
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。 | |||||
:return: | |||||
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
if isinstance(cell, list): | |||||
new_contents.append([bool(value) for value in cell]) | |||||
else: | |||||
new_contents.append(bool(cell)) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def lower(self, inplace=True): | |||||
""" | |||||
将本field中的值调用cell.lower(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的), | |||||
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。) | |||||
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。 | |||||
:return: List[str], List[List[str]], self
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
if isinstance(cell, list): | |||||
new_contents.append([value.lower() for value in cell]) | |||||
else: | |||||
new_contents.append(cell.lower()) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def upper(self, inplace=True): | |||||
""" | |||||
将本field中的值调用cell.upper(). 支持field中内容为以下两种情况(1)['1', '2', ...](即field中每个值为str的),
(2) [['1', '2', ..], ['3', ..], ...](即field中每个值为一个list,list中的值会被依次转换。)
:param inplace: 如果为True,则将新生成值替换本field。否则返回list。
:return: List[str], List[List[str]], self
""" | |||||
new_contents = [] | |||||
for index, cell in enumerate(self.content): | |||||
try: | |||||
if isinstance(cell, list): | |||||
new_contents.append([value.upper() for value in cell]) | |||||
else: | |||||
new_contents.append(cell.upper()) | |||||
except Exception as e: | |||||
print(f"Exception happens when process value in index {index}.") | |||||
print(e) | |||||
return self._after_process(new_contents, inplace=inplace) | |||||
def value_count(self): | |||||
""" | |||||
返回该field下不同value的数量。多用于统计label数量 | |||||
:return: Counter, key是label,value是出现次数 | |||||
""" | |||||
count = Counter() | |||||
for cell in self.content: | |||||
count[cell] += 1 | |||||
return count | |||||
def _after_process(self, new_contents, inplace): | |||||
""" | |||||
当调用处理函数之后,决定是否要替换field。 | |||||
:param new_contents: | |||||
:param inplace: | |||||
:return: self或者生成的content | |||||
""" | |||||
if inplace: | |||||
self.content = new_contents | |||||
try: | |||||
self.is_input = self.is_input | |||||
self.is_target = self.is_target
except SetInputOrTargetException as e: | |||||
print("The newly generated field cannot be set as input or target.") | |||||
raise e | |||||
return self | |||||
else: | |||||
return new_contents | |||||
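# Usage sketch for the conversion helpers above (illustration only; FieldArray is the class
# defined in this file, constructed with a field name and a list of cells):
#
#     fa = FieldArray('label', ['1', '2', '1', '2', '2'])
#     fa.int()                                    # cells become [1, 2, 1, 2, 2]; inplace=True returns self
#     print(fa.value_count())                     # Counter({2: 3, 1: 2})
#     words = FieldArray('words', ['A b', 'C d e'])
#     print(words.lower().split(inplace=False))   # [['a', 'b'], ['c', 'd', 'e']]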
def _get_ele_type_and_dim(cell:Any, dim=0): | def _get_ele_type_and_dim(cell:Any, dim=0): | ||||
""" | """ | ||||
@@ -242,6 +393,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0): | |||||
dims = set([j for i,j in res]) | dims = set([j for i,j in res]) | ||||
if len(types)>1: | if len(types)>1: | ||||
raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) | raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) | ||||
elif len(types)==0: | |||||
raise SetInputOrTargetException("Empty value encountered.") | |||||
if len(dims)>1: | if len(dims)>1: | ||||
raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) | raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) | ||||
return types.pop(), dims.pop() | return types.pop(), dims.pop() | ||||
@@ -257,6 +410,8 @@ def _get_ele_type_and_dim(cell:Any, dim=0): | |||||
dims = set([j for i,j in res]) | dims = set([j for i,j in res]) | ||||
if len(types)>1: | if len(types)>1: | ||||
raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) | raise SetInputOrTargetException("Mixed types detected: {}.".format(list(types))) | ||||
elif len(types)==0: | |||||
raise SetInputOrTargetException("Empty value encountered.") | |||||
if len(dims)>1: | if len(dims)>1: | ||||
raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) | raise SetInputOrTargetException("Mixed dimension detected: {}.".format(list(dims))) | ||||
return types.pop(), dims.pop() | return types.pop(), dims.pop() | ||||
@@ -22,7 +22,7 @@ from .utils import _check_arg_dict_list | |||||
from .utils import _get_func_signature | from .utils import _get_func_signature | ||||
from .utils import seq_len_to_mask | from .utils import seq_len_to_mask | ||||
from .vocabulary import Vocabulary | from .vocabulary import Vocabulary | ||||
from abc import abstractmethod | |||||
class MetricBase(object): | class MetricBase(object): | ||||
""" | """ | ||||
@@ -117,10 +117,12 @@ class MetricBase(object): | |||||
def __init__(self): | def __init__(self): | ||||
self.param_map = {} # key is param in function, value is input param. | self.param_map = {} # key is param in function, value is input param. | ||||
self._checked = False | self._checked = False | ||||
@abstractmethod | |||||
def evaluate(self, *args, **kwargs): | def evaluate(self, *args, **kwargs): | ||||
raise NotImplementedError | raise NotImplementedError | ||||
@abstractmethod | |||||
def get_metric(self, reset=True): | def get_metric(self, reset=True): | ||||
raise NotImplementedError
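# Sketch of a custom metric following the pattern above (hypothetical subclass; assumes the
# usual MetricBase helper _init_param_map() for mapping input names):
#
#     class BinaryAccuracy(MetricBase):
#         def __init__(self, pred=None, target=None):
#             super().__init__()
#             self._init_param_map(pred=pred, target=target)
#             self.correct, self.total = 0, 0
#
#         def evaluate(self, pred, target):
#             self.correct += (pred.argmax(dim=-1) == target).sum().item()
#             self.total += target.numel()
#
#         def get_metric(self, reset=True):
#             acc = self.correct / max(self.total, 1)
#             if reset:
#                 self.correct, self.total = 0, 0
#             return {'acc': acc}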
@@ -285,6 +285,7 @@ def _get_model_device(model): | |||||
:param model: nn.Module | :param model: nn.Module | ||||
:return: torch.device,None 如果返回值为None,说明这个模型没有任何参数。 | :return: torch.device,None 如果返回值为None,说明这个模型没有任何参数。 | ||||
""" | """ | ||||
# TODO 这个函数存在一定的风险,因为同一个模型可能存在某些parameter不在显卡中,比如BertEmbedding | |||||
assert isinstance(model, nn.Module) | assert isinstance(model, nn.Module) | ||||
parameters = list(model.parameters()) | parameters = list(model.parameters()) | ||||
@@ -295,6 +296,13 @@ def _get_model_device(model): | |||||
def _build_args(func, **kwargs): | def _build_args(func, **kwargs): | ||||
""" | |||||
根据func的初始化参数,从kwargs中选择func需要的参数 | |||||
:param func: callable | |||||
:param kwargs: 参数 | |||||
:return:dict. func中用到的参数 | |||||
""" | |||||
spect = inspect.getfullargspec(func) | spect = inspect.getfullargspec(func) | ||||
if spect.varkw is not None: | if spect.varkw is not None: | ||||
return kwargs | return kwargs | ||||
@@ -148,7 +148,7 @@ class Vocabulary(object): | |||||
self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) | self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) | ||||
self.build_reverse_vocab() | self.build_reverse_vocab() | ||||
self.rebuild = False | self.rebuild = False | ||||
def build_reverse_vocab(self): | def build_reverse_vocab(self): | ||||
""" | """ | ||||
基于 "word to index" dict, 构建 "index to word" dict. | 基于 "word to index" dict, 构建 "index to word" dict. | ||||
@@ -359,5 +359,7 @@ class Vocabulary(object): | |||||
def __repr__(self): | def __repr__(self): | ||||
return "Vocabulary({}...)".format(list(self.word_count.keys())[:5]) | return "Vocabulary({}...)".format(list(self.word_count.keys())[:5]) | ||||
@_check_build_vocab | |||||
def __iter__(self): | def __iter__(self): | ||||
return iter(list(self.word_count.keys())) | |||||
for word, index in self.word2idx.items(): | |||||
yield word, index |
@@ -26,6 +26,7 @@ class EmbeddingOption(Option): | |||||
error=error | error=error | ||||
) | ) | ||||
class EmbedLoader(BaseLoader): | class EmbedLoader(BaseLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` | 别名::class:`fastNLP.io.EmbedLoader` :class:`fastNLP.io.embed_loader.EmbedLoader` | ||||
@@ -35,9 +36,9 @@ class EmbedLoader(BaseLoader): | |||||
def __init__(self): | def __init__(self): | ||||
super(EmbedLoader, self).__init__() | super(EmbedLoader, self).__init__() | ||||
@staticmethod | @staticmethod | ||||
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'): | |||||
def load_with_vocab(embed_filepath, vocab, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True, error='ignore'): | |||||
""" | """ | ||||
从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 | 从embed_filepath这个预训练的词向量中抽取出vocab这个词表的词的embedding。EmbedLoader将自动判断embed_filepath是 | ||||
word2vec(第一行只有两个元素)还是glove格式的数据。 | word2vec(第一行只有两个元素)还是glove格式的数据。 | ||||
@@ -46,6 +47,8 @@ class EmbedLoader(BaseLoader): | |||||
:param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 | :param vocab: 词表 :class:`~fastNLP.Vocabulary` 类型,读取出现在vocab中的词的embedding。 | ||||
没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 | 没有出现在vocab中的词的embedding将通过找到的词的embedding的正态分布采样出来,以使得整个Embedding是同分布的。 | ||||
:param dtype: 读出的embedding的类型 | :param dtype: 读出的embedding的类型 | ||||
:param str padding: 词表中padding的token | |||||
:param str unknown: 词表中unknown的token | |||||
:param bool normalize: 是否将每个vector归一化到norm为1 | :param bool normalize: 是否将每个vector归一化到norm为1 | ||||
:param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 | :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。 | ||||
这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 | 这里主要可能出错的地方在于词表有空行或者词表出现了维度不一致。 | ||||
@@ -69,8 +72,14 @@ class EmbedLoader(BaseLoader): | |||||
for idx, line in enumerate(f, start_idx): | for idx, line in enumerate(f, start_idx): | ||||
try: | try: | ||||
parts = line.strip().split() | parts = line.strip().split() | ||||
if parts[0] in vocab: | |||||
index = vocab.to_index(parts[0]) | |||||
word = parts[0] | |||||
# 对齐unk与pad | |||||
if word==padding and vocab.padding is not None: | |||||
word = vocab.padding | |||||
elif word==unknown and vocab.unknown is not None: | |||||
word = vocab.unknown | |||||
if word in vocab: | |||||
index = vocab.to_index(word) | |||||
matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) | matrix[index] = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim) | ||||
hit_flags[index] = True | hit_flags[index] = True | ||||
except Exception as e: | except Exception as e: | ||||
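# Usage sketch for load_with_vocab with the padding/unknown alignment above (hypothetical
# file path; the file's own '<pad>'/'<unk>' rows are mapped onto vocab.padding/vocab.unknown):
#
#     vocab = Vocabulary()
#     vocab.add_word_lst("this is a test".split())
#     matrix = EmbedLoader.load_with_vocab('pretrain_embedding.txt', vocab,
#                                          padding='<pad>', unknown='<unk>')
#     print(matrix.shape)   # (len(vocab), embedding_dim)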
@@ -102,8 +111,8 @@ class EmbedLoader(BaseLoader): | |||||
:param str embed_filepath: 预训练的embedding的路径。 | :param str embed_filepath: 预训练的embedding的路径。 | ||||
:param dtype: 读出的embedding的类型 | :param dtype: 读出的embedding的类型 | ||||
:param str padding: the padding tag for vocabulary. | |||||
:param str unknown: the unknown tag for vocabulary. | |||||
:param str padding: 词表中的padding的token. 并以此用做vocab的padding。 | |||||
:param str unknown: 词表中的unknown的token. 并以此用做vocab的unknown。 | |||||
:param bool normalize: 是否将每个vector归一化到norm为1 | :param bool normalize: 是否将每个vector归一化到norm为1 | ||||
:param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地 | :param str error: `ignore` , `strict` ; 如果 `ignore` ,错误将自动跳过; 如果 `strict` , 错误将抛出。这里主要可能出错的地 | ||||
方在于词表有空行或者词表出现了维度不一致。 | 方在于词表有空行或者词表出现了维度不一致。 | ||||
@@ -134,7 +143,7 @@ class EmbedLoader(BaseLoader): | |||||
vocab.add_word(word) | vocab.add_word(word) | ||||
if unknown is not None and unknown == word: | if unknown is not None and unknown == word: | ||||
found_unknown = True | found_unknown = True | ||||
if found_pad is not None and padding == word: | |||||
if padding is not None and padding == word: | |||||
found_pad = True | found_pad = True | ||||
except Exception as e: | except Exception as e: | ||||
if error == 'ignore': | if error == 'ignore': | ||||
@@ -0,0 +1,255 @@ | |||||
import os | |||||
from pathlib import Path | |||||
from urllib.parse import urlparse | |||||
import re | |||||
import requests | |||||
import tempfile | |||||
from tqdm import tqdm | |||||
import shutil | |||||
import hashlib | |||||
def cached_path(url_or_filename: str, cache_dir: Path=None) -> Path: | |||||
""" | |||||
给定一个url或者文件名(可以是具体的文件名,也可以是文件),先在cache_dir下寻找该文件是否存在,如果不存在则去下载, 并 | |||||
将文件放入到 | |||||
""" | |||||
if cache_dir is None: | |||||
dataset_cache = Path(get_defalt_path()) | |||||
else: | |||||
dataset_cache = cache_dir | |||||
parsed = urlparse(url_or_filename) | |||||
if parsed.scheme in ("http", "https"): | |||||
# URL, so get it from the cache (downloading if necessary) | |||||
return get_from_cache(url_or_filename, dataset_cache) | |||||
elif parsed.scheme == "" and Path(os.path.join(dataset_cache, url_or_filename)).exists(): | |||||
# File, and it exists. | |||||
return Path(url_or_filename) | |||||
elif parsed.scheme == "": | |||||
# File, but it doesn't exist. | |||||
raise FileNotFoundError("file {} not found".format(url_or_filename)) | |||||
else: | |||||
# Something unknown | |||||
raise ValueError( | |||||
"unable to parse {} as a URL or as a local path".format(url_or_filename) | |||||
) | |||||
def get_filepath(filepath): | |||||
""" | |||||
如果filepath中只有一个文件,则直接返回对应的全路径 | |||||
:param filepath: | |||||
:return: | |||||
""" | |||||
if os.path.isdir(filepath): | |||||
files = os.listdir(filepath) | |||||
if len(files)==1: | |||||
return os.path.join(filepath, files[0]) | |||||
else: | |||||
return filepath | |||||
return filepath | |||||
def get_defalt_path(): | |||||
""" | |||||
获取默认的fastNLP存放路径, 如果将FASTNLP_CACHE_DIR设置在了环境变量中,将使用环境变量的值,使得不用每个用户都去下载。
:return: | |||||
""" | |||||
if 'FASTNLP_CACHE_DIR' in os.environ: | |||||
fastnlp_cache_dir = os.environ.get('FASTNLP_CACHE_DIR') | |||||
if os.path.exists(fastnlp_cache_dir): | |||||
return fastnlp_cache_dir | |||||
raise RuntimeError("Some errors happens on cache directory.") | |||||
else: | |||||
raise RuntimeError("There function is not available right now.") | |||||
fastnlp_cache_dir = os.path.expanduser(os.path.join("~", ".fastNLP")) | |||||
return fastnlp_cache_dir | |||||
def _get_base_url(name): | |||||
# 返回的URL结尾必须是/ | |||||
if 'FASTNLP_BASE_URL' in os.environ: | |||||
fastnlp_base_url = os.environ['FASTNLP_BASE_URL'] | |||||
return fastnlp_base_url | |||||
raise RuntimeError("There function is not available right now.") | |||||
def split_filename_suffix(filepath): | |||||
""" | |||||
给定filepath返回对应的name和suffix | |||||
:param filepath: | |||||
:return: filename, suffix | |||||
""" | |||||
filename = os.path.basename(filepath) | |||||
if filename.endswith('.tar.gz'): | |||||
return filename[:-7], '.tar.gz' | |||||
return os.path.splitext(filename) | |||||
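# Illustration of the behaviour above:
#
#     split_filename_suffix('bert_en-80f95ea7.tar.gz')   # ('bert_en-80f95ea7', '.tar.gz')
#     split_filename_suffix('/tmp/vocab.txt')            # ('vocab', '.txt')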
def get_from_cache(url: str, cache_dir: Path = None) -> Path: | |||||
""" | |||||
尝试在cache_dir中寻找url定义的资源; 如果没有找到。则从url下载并将结果放在cache_dir下,缓存的名称由url的结果推断而来。 | |||||
如果从url中下载的资源解压后有多个文件,则返回directory的路径; 如果只有一个资源,则返回具体的路径 | |||||
""" | |||||
cache_dir.mkdir(parents=True, exist_ok=True) | |||||
filename = re.sub(r".+/", "", url) | |||||
dir_name, suffix = split_filename_suffix(filename) | |||||
# 文件名形如 name-checksum,以最后一个'-'拆分出名称与校验和
sep_index = dir_name.rfind('-')
if sep_index == -1:
    check_sum = None
else:
    check_sum = dir_name[sep_index + 1:]
    dir_name = dir_name[:sep_index]
# 寻找与它名字匹配的内容, 而不关心后缀 | |||||
match_dir_name = match_file(dir_name, cache_dir) | |||||
if match_dir_name: | |||||
dir_name = match_dir_name | |||||
cache_path = cache_dir / dir_name | |||||
# get cache path to put the file | |||||
if cache_path.exists(): | |||||
return get_filepath(cache_path) | |||||
# make HEAD request to check ETag TODO ETag可以用来判断资源是否已经更新了,之后需要加上 | |||||
response = requests.head(url, headers={"User-Agent": "fastNLP"}) | |||||
if response.status_code != 200: | |||||
raise IOError( | |||||
f"HEAD request failed for url {url} with status code {response.status_code}." | |||||
) | |||||
# add ETag to filename if it exists | |||||
# etag = response.headers.get("ETag") | |||||
if not cache_path.exists(): | |||||
# Download to temporary file, then copy to cache dir once finished. | |||||
# Otherwise you get corrupt cache entries if the download gets interrupted. | |||||
fd, temp_filename = tempfile.mkstemp() | |||||
print("%s not found in cache, downloading to %s"%(url, temp_filename)) | |||||
# GET file object | |||||
req = requests.get(url, stream=True, headers={"User-Agent": "fastNLP"}) | |||||
content_length = req.headers.get("Content-Length") | |||||
total = int(content_length) if content_length is not None else None | |||||
progress = tqdm(unit="B", total=total) | |||||
sha256 = hashlib.sha256() | |||||
with open(temp_filename, "wb") as temp_file: | |||||
for chunk in req.iter_content(chunk_size=1024): | |||||
if chunk: # filter out keep-alive new chunks | |||||
progress.update(len(chunk)) | |||||
temp_file.write(chunk) | |||||
sha256.update(chunk) | |||||
# check sum | |||||
digit = sha256.hexdigest()[:8] | |||||
if check_sum:
    assert digit == check_sum, "File corrupted during download."
progress.close() | |||||
print(f"Finish download from {url}.") | |||||
# 开始解压 | |||||
delete_temp_dir = None | |||||
if suffix in ('.zip', '.tar.gz'): | |||||
uncompress_temp_dir = tempfile.mkdtemp() | |||||
delete_temp_dir = uncompress_temp_dir | |||||
print(f"Start to uncompress file to {uncompress_temp_dir}.") | |||||
if suffix == '.zip': | |||||
unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||||
else: | |||||
untar_gz_file(Path(temp_filename), Path(uncompress_temp_dir)) | |||||
filenames = os.listdir(uncompress_temp_dir) | |||||
if len(filenames)==1: | |||||
if os.path.isdir(os.path.join(uncompress_temp_dir, filenames[0])): | |||||
uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) | |||||
cache_path.mkdir(parents=True, exist_ok=True) | |||||
print("Finish un-compressing file.") | |||||
else: | |||||
uncompress_temp_dir = temp_filename | |||||
cache_path = str(cache_path) + suffix | |||||
success = False | |||||
try: | |||||
# 复制到指定的位置 | |||||
print(f"Copy file to {cache_path}.") | |||||
if os.path.isdir(uncompress_temp_dir): | |||||
for filename in os.listdir(uncompress_temp_dir): | |||||
shutil.copyfile(os.path.join(uncompress_temp_dir, filename), cache_path/filename) | |||||
else: | |||||
shutil.copyfile(uncompress_temp_dir, cache_path) | |||||
success = True | |||||
except Exception as e: | |||||
print(e) | |||||
raise e | |||||
finally: | |||||
if not success: | |||||
if cache_path.exists(): | |||||
if cache_path.is_file(): | |||||
os.remove(cache_path) | |||||
else: | |||||
shutil.rmtree(cache_path) | |||||
if delete_temp_dir: | |||||
shutil.rmtree(delete_temp_dir) | |||||
os.close(fd) | |||||
os.remove(temp_filename) | |||||
return get_filepath(cache_path) | |||||
def unzip_file(file: Path, to: Path): | |||||
# unpack and write out in CoNLL column-like format | |||||
from zipfile import ZipFile | |||||
with ZipFile(file, "r") as zipObj: | |||||
# Extract all the contents of zip file in current directory | |||||
zipObj.extractall(to) | |||||
def untar_gz_file(file:Path, to:Path): | |||||
import tarfile | |||||
with tarfile.open(file, 'r:gz') as tar: | |||||
tar.extractall(to) | |||||
def match_file(dir_name:str, cache_dir:str)->str: | |||||
""" | |||||
匹配的原则是,在cache_dir下的文件: (1) 与dir_name完全一致; (2) 除了后缀以外和dir_name完全一致。 | |||||
如果找到了两个匹配的结果将报错. 如果找到了则返回匹配的文件的名称; 没有找到返回空字符串 | |||||
:param dir_name: 需要匹配的名称 | |||||
:param cache_dir: 在该目录下找匹配dir_name是否存在 | |||||
:return: str | |||||
""" | |||||
files = os.listdir(cache_dir) | |||||
matched_filenames = [] | |||||
for file_name in files: | |||||
if re.match(dir_name+'$', file_name) or re.match(dir_name+'\\..*', file_name): | |||||
matched_filenames.append(file_name) | |||||
if len(matched_filenames)==0: | |||||
return '' | |||||
elif len(matched_filenames)==1: | |||||
return matched_filenames[-1] | |||||
else: | |||||
raise RuntimeError(f"Duplicate matched files:{matched_filenames}, this should be caused by a bug.") | |||||
if __name__ == '__main__': | |||||
cache_dir = Path('caches') | |||||
cache_dir = None | |||||
# 需要对cache_dir进行测试 | |||||
base_url = 'http://0.0.0.0:8888/file/download' | |||||
# if True: | |||||
# for filename in os.listdir(cache_dir): | |||||
# if os.path.isdir(os.path.join(cache_dir, filename)): | |||||
# shutil.rmtree(os.path.join(cache_dir, filename)) | |||||
# else: | |||||
# os.remove(os.path.join(cache_dir, filename)) | |||||
# 1. 测试.txt文件 | |||||
print(cached_path(base_url + '/{}'.format('txt_test-bcb4fe65.txt'), cache_dir)) | |||||
# 2. 测试.zip文件(只有一个文件) | |||||
print(cached_path(base_url + '/{}'.format('zip_test-40966d39.zip'), cache_dir)) | |||||
# 3. 测试.zip文件(有多个文件) | |||||
print(cached_path(base_url + '/{}'.format('zip_pack_test-70c0b20d.zip'), cache_dir)) | |||||
# 4. 测试.tar.gz文件 | |||||
print(cached_path(base_url + '/{}'.format('tar_gz_test-3e2679cf.tar.gz'), cache_dir)) | |||||
# 5. 测试.tar.gz多个文件 | |||||
print(cached_path(base_url + '/{}'.format('tar_gz_pack_test-08dfdccd.tar.gz'), cache_dir)) | |||||
# 6. 测试.pkl文件 |
@@ -30,7 +30,7 @@ class BertConfig: | |||||
self.hidden_size = hidden_size | self.hidden_size = hidden_size | ||||
self.num_hidden_layers = num_hidden_layers | self.num_hidden_layers = num_hidden_layers | ||||
self.num_attention_heads = num_attention_heads | self.num_attention_heads = num_attention_heads | ||||
self.intermediate = intermediate_size | |||||
self.intermediate_size = intermediate_size | |||||
self.hidden_act = hidden_act | self.hidden_act = hidden_act | ||||
self.hidden_dropout_prob = hidden_dropout_prob | self.hidden_dropout_prob = hidden_dropout_prob | ||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob | self.attention_probs_dropout_prob = attention_probs_dropout_prob | ||||
@@ -7,6 +7,7 @@ import torch.nn as nn | |||||
from ..core.const import Const as C | from ..core.const import Const as C | ||||
from ..modules import encoder | from ..modules import encoder | ||||
from fastNLP import seq_len_to_mask | |||||
class CNNText(torch.nn.Module): | class CNNText(torch.nn.Module): | ||||
@@ -21,15 +22,13 @@ class CNNText(torch.nn.Module): | |||||
:param int num_classes: 一共有多少类 | :param int num_classes: 一共有多少类 | ||||
:param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 | :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 | ||||
:param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | ||||
:param int padding: 对句子前后的pad的大小, 用0填充。 | |||||
:param float dropout: Dropout的大小 | :param float dropout: Dropout的大小 | ||||
""" | """ | ||||
def __init__(self, init_embed, | def __init__(self, init_embed, | ||||
num_classes, | num_classes, | ||||
kernel_nums=(3, 4, 5), | |||||
kernel_sizes=(3, 4, 5), | |||||
padding=0, | |||||
kernel_nums=(30, 40, 50), | |||||
kernel_sizes=(1, 3, 5), | |||||
dropout=0.5): | dropout=0.5): | ||||
super(CNNText, self).__init__() | super(CNNText, self).__init__() | ||||
@@ -38,8 +37,7 @@ class CNNText(torch.nn.Module): | |||||
self.conv_pool = encoder.ConvMaxpool( | self.conv_pool = encoder.ConvMaxpool( | ||||
in_channels=self.embed.embedding_dim, | in_channels=self.embed.embedding_dim, | ||||
out_channels=kernel_nums, | out_channels=kernel_nums, | ||||
kernel_sizes=kernel_sizes, | |||||
padding=padding) | |||||
kernel_sizes=kernel_sizes) | |||||
self.dropout = nn.Dropout(dropout) | self.dropout = nn.Dropout(dropout) | ||||
self.fc = nn.Linear(sum(kernel_nums), num_classes) | self.fc = nn.Linear(sum(kernel_nums), num_classes) | ||||
@@ -51,7 +49,11 @@ class CNNText(torch.nn.Module): | |||||
:return output: dict of torch.LongTensor, [batch_size, num_classes] | :return output: dict of torch.LongTensor, [batch_size, num_classes] | ||||
""" | """ | ||||
x = self.embed(words) # [N,L] -> [N,L,C] | x = self.embed(words) # [N,L] -> [N,L,C] | ||||
x = self.conv_pool(x) # [N,L,C] -> [N,C] | |||||
if seq_len is not None: | |||||
mask = seq_len_to_mask(seq_len) | |||||
x = self.conv_pool(x, mask) | |||||
else: | |||||
x = self.conv_pool(x) # [N,L,C] -> [N,C] | |||||
x = self.dropout(x) | x = self.dropout(x) | ||||
x = self.fc(x) # [N,C] -> [N, N_class] | x = self.fc(x) # [N,C] -> [N, N_class] | ||||
return {C.OUTPUT: x} | return {C.OUTPUT: x} | ||||
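# Usage sketch (hypothetical tensors): `words` holds token indices and `seq_len` the true
# lengths; when seq_len is given, padded positions are masked out before max-pooling.
#
#     import torch
#     model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)
#     words = torch.randint(0, len(vocab), (4, 20))    # batch_size=4, max_len=20
#     seq_len = torch.LongTensor([20, 18, 15, 9])
#     out = model(words, seq_len)
#     print(out[C.OUTPUT].shape)                       # torch.Size([4, 5])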
@@ -0,0 +1,625 @@ | |||||
""" | |||||
这个页面的代码很大程度上参考了https://github.com/huggingface/pytorch-pretrained-BERT的代码 | |||||
""" | |||||
import torch | |||||
from torch import nn | |||||
from ... import Vocabulary | |||||
import collections | |||||
import os | |||||
import unicodedata | |||||
from ...io.file_utils import _get_base_url, cached_path | |||||
from .bert import BertModel | |||||
import numpy as np | |||||
from itertools import chain | |||||
def whitespace_tokenize(text): | |||||
"""Runs basic whitespace cleaning and splitting on a piece of text.""" | |||||
text = text.strip() | |||||
if not text: | |||||
return [] | |||||
tokens = text.split() | |||||
return tokens | |||||
class WordpieceTokenizer(object): | |||||
"""Runs WordPiece tokenization.""" | |||||
def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): | |||||
self.vocab = vocab | |||||
self.unk_token = unk_token | |||||
self.max_input_chars_per_word = max_input_chars_per_word | |||||
def tokenize(self, text): | |||||
"""Tokenizes a piece of text into its word pieces. | |||||
This uses a greedy longest-match-first algorithm to perform tokenization | |||||
using the given vocabulary. | |||||
For example: | |||||
input = "unaffable" | |||||
output = ["un", "##aff", "##able"] | |||||
Args: | |||||
text: A single token or whitespace separated tokens. This should have | |||||
already been passed through `BasicTokenizer`. | |||||
Returns: | |||||
A list of wordpiece tokens. | |||||
""" | |||||
output_tokens = [] | |||||
for token in whitespace_tokenize(text): | |||||
chars = list(token) | |||||
if len(chars) > self.max_input_chars_per_word: | |||||
output_tokens.append(self.unk_token) | |||||
continue | |||||
is_bad = False | |||||
start = 0 | |||||
sub_tokens = [] | |||||
while start < len(chars): | |||||
end = len(chars) | |||||
cur_substr = None | |||||
while start < end: | |||||
substr = "".join(chars[start:end]) | |||||
if start > 0: | |||||
substr = "##" + substr | |||||
if substr in self.vocab: | |||||
cur_substr = substr | |||||
break | |||||
end -= 1 | |||||
if cur_substr is None: | |||||
is_bad = True | |||||
break | |||||
sub_tokens.append(cur_substr) | |||||
start = end | |||||
if is_bad: | |||||
output_tokens.append(self.unk_token) | |||||
else: | |||||
output_tokens.extend(sub_tokens) | |||||
return output_tokens | |||||
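# Usage sketch mirroring the docstring example (tiny hypothetical vocab):
#
#     vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
#     tokenizer = WordpieceTokenizer(vocab=vocab)
#     print(tokenizer.tokenize("unaffable"))   # ['un', '##aff', '##able']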
def load_vocab(vocab_file): | |||||
"""Loads a vocabulary file into a dictionary.""" | |||||
vocab = collections.OrderedDict() | |||||
index = 0 | |||||
with open(vocab_file, "r", encoding="utf-8") as reader: | |||||
while True: | |||||
token = reader.readline() | |||||
if not token: | |||||
break | |||||
token = token.strip() | |||||
vocab[token] = index | |||||
index += 1 | |||||
return vocab | |||||
class BasicTokenizer(object): | |||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | |||||
def __init__(self, | |||||
do_lower_case=True, | |||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||||
"""Constructs a BasicTokenizer. | |||||
Args: | |||||
do_lower_case: Whether to lower case the input. | |||||
""" | |||||
self.do_lower_case = do_lower_case | |||||
self.never_split = never_split | |||||
def tokenize(self, text): | |||||
"""Tokenizes a piece of text.""" | |||||
text = self._clean_text(text) | |||||
# This was added on November 1st, 2018 for the multilingual and Chinese | |||||
# models. This is also applied to the English models now, but it doesn't | |||||
# matter since the English models were not trained on any Chinese data | |||||
# and generally don't have any Chinese data in them (there are Chinese | |||||
# characters in the vocabulary because Wikipedia does have some Chinese | |||||
# words in the English Wikipedia.). | |||||
text = self._tokenize_chinese_chars(text) | |||||
orig_tokens = whitespace_tokenize(text) | |||||
split_tokens = [] | |||||
for token in orig_tokens: | |||||
if self.do_lower_case and token not in self.never_split: | |||||
token = token.lower() | |||||
token = self._run_strip_accents(token) | |||||
split_tokens.extend(self._run_split_on_punc(token)) | |||||
output_tokens = whitespace_tokenize(" ".join(split_tokens)) | |||||
return output_tokens | |||||
def _run_strip_accents(self, text): | |||||
"""Strips accents from a piece of text.""" | |||||
text = unicodedata.normalize("NFD", text) | |||||
output = [] | |||||
for char in text: | |||||
cat = unicodedata.category(char) | |||||
if cat == "Mn": | |||||
continue | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _run_split_on_punc(self, text): | |||||
"""Splits punctuation on a piece of text.""" | |||||
if text in self.never_split: | |||||
return [text] | |||||
chars = list(text) | |||||
i = 0 | |||||
start_new_word = True | |||||
output = [] | |||||
while i < len(chars): | |||||
char = chars[i] | |||||
if _is_punctuation(char): | |||||
output.append([char]) | |||||
start_new_word = True | |||||
else: | |||||
if start_new_word: | |||||
output.append([]) | |||||
start_new_word = False | |||||
output[-1].append(char) | |||||
i += 1 | |||||
return ["".join(x) for x in output] | |||||
def _tokenize_chinese_chars(self, text): | |||||
"""Adds whitespace around any CJK character.""" | |||||
output = [] | |||||
for char in text: | |||||
cp = ord(char) | |||||
if self._is_chinese_char(cp): | |||||
output.append(" ") | |||||
output.append(char) | |||||
output.append(" ") | |||||
else: | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _is_chinese_char(self, cp): | |||||
"""Checks whether CP is the codepoint of a CJK character.""" | |||||
# This defines a "chinese character" as anything in the CJK Unicode block: | |||||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) | |||||
# | |||||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters, | |||||
# despite its name. The modern Korean Hangul alphabet is a different block, | |||||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write | |||||
# space-separated words, so they are not treated specially and handled | |||||
# like the all of the other languages. | |||||
if ((cp >= 0x4E00 and cp <= 0x9FFF) or # | |||||
(cp >= 0x3400 and cp <= 0x4DBF) or # | |||||
(cp >= 0x20000 and cp <= 0x2A6DF) or # | |||||
(cp >= 0x2A700 and cp <= 0x2B73F) or # | |||||
(cp >= 0x2B740 and cp <= 0x2B81F) or # | |||||
(cp >= 0x2B820 and cp <= 0x2CEAF) or | |||||
(cp >= 0xF900 and cp <= 0xFAFF) or # | |||||
(cp >= 0x2F800 and cp <= 0x2FA1F)): # | |||||
return True | |||||
return False | |||||
def _clean_text(self, text): | |||||
"""Performs invalid character removal and whitespace cleanup on text.""" | |||||
output = [] | |||||
for char in text: | |||||
cp = ord(char) | |||||
if cp == 0 or cp == 0xfffd or _is_control(char): | |||||
continue | |||||
if _is_whitespace(char): | |||||
output.append(" ") | |||||
else: | |||||
output.append(char) | |||||
return "".join(output) | |||||
def _is_whitespace(char): | |||||
"""Checks whether `chars` is a whitespace character.""" | |||||
# \t, \n, and \r are technically contorl characters but we treat them | |||||
# as whitespace since they are generally considered as such. | |||||
if char == " " or char == "\t" or char == "\n" or char == "\r": | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat == "Zs": | |||||
return True | |||||
return False | |||||
def _is_control(char): | |||||
"""Checks whether `chars` is a control character.""" | |||||
# These are technically control characters but we count them as whitespace | |||||
# characters. | |||||
if char == "\t" or char == "\n" or char == "\r": | |||||
return False | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("C"): | |||||
return True | |||||
return False | |||||
def _is_punctuation(char): | |||||
"""Checks whether `chars` is a punctuation character.""" | |||||
cp = ord(char) | |||||
# We treat all non-letter/number ASCII as punctuation. | |||||
# Characters such as "^", "$", and "`" are not in the Unicode | |||||
# Punctuation class but we treat them as punctuation anyways, for | |||||
# consistency. | |||||
if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or | |||||
(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): | |||||
return True | |||||
cat = unicodedata.category(char) | |||||
if cat.startswith("P"): | |||||
return True | |||||
return False | |||||
class BertTokenizer(object): | |||||
"""Runs end-to-end tokenization: punctuation splitting + wordpiece""" | |||||
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True, | |||||
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")): | |||||
"""Constructs a BertTokenizer. | |||||
Args: | |||||
vocab_file: Path to a one-wordpiece-per-line vocabulary file | |||||
do_lower_case: Whether to lower case the input | |||||
Only has an effect when do_wordpiece_only=False | |||||
do_basic_tokenize: Whether to do basic tokenization before wordpiece. | |||||
max_len: An artificial maximum length to truncate tokenized sequences to; | |||||
Effective maximum length is always the minimum of this | |||||
value (if specified) and the underlying BERT model's | |||||
sequence length. | |||||
never_split: List of tokens which will never be split during tokenization. | |||||
Only has an effect when do_wordpiece_only=False | |||||
""" | |||||
if not os.path.isfile(vocab_file): | |||||
raise ValueError( | |||||
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " | |||||
"model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) | |||||
self.vocab = load_vocab(vocab_file) | |||||
self.ids_to_tokens = collections.OrderedDict( | |||||
[(ids, tok) for tok, ids in self.vocab.items()]) | |||||
self.do_basic_tokenize = do_basic_tokenize | |||||
if do_basic_tokenize: | |||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, | |||||
never_split=never_split) | |||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | |||||
self.max_len = max_len if max_len is not None else int(1e12) | |||||
def tokenize(self, text): | |||||
split_tokens = [] | |||||
if self.do_basic_tokenize: | |||||
for token in self.basic_tokenizer.tokenize(text): | |||||
for sub_token in self.wordpiece_tokenizer.tokenize(token): | |||||
split_tokens.append(sub_token) | |||||
else: | |||||
split_tokens = self.wordpiece_tokenizer.tokenize(text) | |||||
return split_tokens | |||||
def convert_tokens_to_ids(self, tokens): | |||||
"""Converts a sequence of tokens into ids using the vocab.""" | |||||
ids = [] | |||||
for token in tokens: | |||||
ids.append(self.vocab[token]) | |||||
if len(ids) > self.max_len: | |||||
print( | |||||
"Token indices sequence length is longer than the specified maximum " | |||||
" sequence length for this BERT model ({} > {}). Running this" | |||||
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len) | |||||
) | |||||
return ids | |||||
def convert_ids_to_tokens(self, ids): | |||||
"""Converts a sequence of ids in wordpiece tokens using the vocab.""" | |||||
tokens = [] | |||||
for i in ids: | |||||
tokens.append(self.ids_to_tokens[i]) | |||||
return tokens | |||||
def save_vocabulary(self, vocab_path): | |||||
"""Save the tokenizer vocabulary to a directory or file.""" | |||||
index = 0 | |||||
if os.path.isdir(vocab_path): | |||||
vocab_file = os.path.join(vocab_path, VOCAB_NAME) | |||||
with open(vocab_file, "w", encoding="utf-8") as writer: | |||||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | |||||
if index != token_index: | |||||
print("Saving vocabulary to {}: vocabulary indices are not consecutive." | |||||
" Please check that the vocabulary is not corrupted!".format(vocab_file)) | |||||
index = token_index | |||||
writer.write(token + u'\n') | |||||
index += 1 | |||||
return vocab_file | |||||
@classmethod | |||||
def from_pretrained(cls, model_dir, *inputs, **kwargs): | |||||
""" | |||||
给定path,直接读取vocab. | |||||
""" | |||||
pretrained_model_name_or_path = os.path.join(model_dir, VOCAB_NAME) | |||||
print("loading vocabulary file {}".format(pretrained_model_name_or_path)) | |||||
max_len = 512 | |||||
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) | |||||
# Instantiate tokenizer. | |||||
tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) | |||||
return tokenizer | |||||
VOCAB_NAME = 'vocab.txt' | |||||
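# Usage sketch (hypothetical model_dir containing a one-wordpiece-per-line vocab.txt):
#
#     tokenizer = BertTokenizer.from_pretrained('/path/to/bert_dir')
#     tokens = tokenizer.tokenize("A quick example.")
#     ids = tokenizer.convert_tokens_to_ids(tokens)
#     assert tokenizer.convert_ids_to_tokens(ids) == tokens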
class _WordBertModel(nn.Module): | |||||
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', include_cls_sep:bool=False): | |||||
super().__init__() | |||||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | |||||
self.encoder = BertModel.from_pretrained(model_dir) | |||||
# 检查encoder_layer_number是否合理 | |||||
encoder_layer_number = len(self.encoder.encoder.layer) | |||||
self.layers = list(map(int, layers.split(','))) | |||||
for layer in self.layers: | |||||
if layer<0: | |||||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
else: | |||||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
assert pool_method in ('avg', 'max', 'first', 'last') | |||||
self.pool_method = pool_method | |||||
self.include_cls_sep = include_cls_sep | |||||
# 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] | |||||
print("Start to generating word pieces for word.") | |||||
word_to_wordpieces = [] | |||||
word_pieces_lengths = [] | |||||
for word, index in vocab: | |||||
if index == vocab.padding_idx: # pad是个特殊的符号 | |||||
word = '[PAD]' | |||||
elif index == vocab.unknown_idx: | |||||
word = '[UNK]' | |||||
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||||
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) | |||||
word_to_wordpieces.append(word_pieces) | |||||
word_pieces_lengths.append(len(word_pieces)) | |||||
self._cls_index = len(vocab) | |||||
self._sep_index = len(vocab) + 1 | |||||
self._pad_index = vocab.padding_idx | |||||
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # 需要用于生成word_piece | |||||
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]'])) | |||||
word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]'])) | |||||
self.word_to_wordpieces = np.array(word_to_wordpieces) | |||||
self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) | |||||
print("Successfully generate word pieces.") | |||||
def forward(self, words): | |||||
""" | |||||
:param words: torch.LongTensor, batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size | |||||
""" | |||||
batch_size, max_word_len = words.size() | |||||
seq_len = words.ne(self._pad_index).sum(dim=-1) | |||||
batch_word_pieces_length = self.word_pieces_lengths[words] # batch_size x max_len | |||||
word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) | |||||
max_word_piece_length = word_pieces_lengths.max().item() | |||||
# +2是由于需要加入[CLS]与[SEP] | |||||
word_pieces = words.new_full((batch_size, max_word_piece_length+2), fill_value=self._wordpiece_pad_index) | |||||
word_pieces[:, 0].fill_(self._cls_index) | |||||
word_pieces[:, word_pieces_lengths+1] = self._sep_index | |||||
attn_masks = torch.zeros_like(word_pieces) | |||||
# 1. 获取words的word_pieces的id,以及对应的span范围 | |||||
word_indexes = words.tolist() | |||||
for i in range(batch_size): | |||||
word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i]])) | |||||
word_pieces[i, 1:len(word_pieces_i)+1] = torch.LongTensor(word_pieces_i) | |||||
attn_masks[i, :len(word_pieces_i)+2].fill_(1) | |||||
# 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 | |||||
# all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] | |||||
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=None, attention_mask=attn_masks, | |||||
output_all_encoded_layers=True) | |||||
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | |||||
if self.include_cls_sep: | |||||
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, | |||||
bert_outputs[-1].size(-1)) | |||||
s_shift = 1 | |||||
else: | |||||
outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len, | |||||
bert_outputs[-1].size(-1)) | |||||
s_shift = 0 | |||||
batch_word_pieces_cum_length = batch_word_pieces_length.new_zeros(batch_size, max_word_len + 1) | |||||
batch_word_pieces_cum_length[:, 1:] = batch_word_pieces_length.cumsum(dim=-1) # batch_size x max_len | |||||
for l_index, l in enumerate(self.layers): | |||||
output_layer = bert_outputs[l] | |||||
# 从word_piece collapse到word的表示 | |||||
truncate_output_layer = output_layer[:, 1:-1] # 删除[CLS]与[SEP] batch_size x len x hidden_size | |||||
outputs_seq_len = seq_len + s_shift | |||||
if self.pool_method == 'first': | |||||
for i in range(batch_size): | |||||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 | |||||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size | |||||
elif self.pool_method == 'last': | |||||
for i in range(batch_size): | |||||
i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end | |||||
outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] | |||||
elif self.pool_method == 'max': | |||||
for i in range(batch_size): | |||||
for j in range(seq_len[i]): | |||||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||||
outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) | |||||
else: | |||||
for i in range(batch_size): | |||||
for j in range(seq_len[i]): | |||||
start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] | |||||
outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) | |||||
if self.include_cls_sep: | |||||
outputs[:, :, 0] = output_layer[:, 0] | |||||
outputs[:, :, seq_len+s_shift] = output_layer[:, seq_len+s_shift] | |||||
# 3. 最终的embedding结果 | |||||
return outputs | |||||
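# Illustration of the 'first' pooling above (standalone sketch, assumed shapes): each word's
# representation is taken from its first word piece, located via cumulative piece lengths.
#
#     import torch
#     piece_lens = torch.LongTensor([[1, 3, 2]])   # one sentence: 3 words with 1/3/2 pieces
#     cum = torch.zeros(1, 4, dtype=torch.long)
#     cum[:, 1:] = piece_lens.cumsum(dim=-1)       # [0, 1, 4] are the start indices of the words
#     hidden = torch.randn(1, 6, 8)                # piece-level states, [CLS]/[SEP] already stripped
#     first_per_word = hidden[0, cum[0, :3]]       # -> shape (3, 8), one vector per word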
class _WordPieceBertModel(nn.Module): | |||||
""" | |||||
这个模块用于直接计算word_piece的结果. | |||||
""" | |||||
def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1'): | |||||
super().__init__() | |||||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | |||||
self.encoder = BertModel.from_pretrained(model_dir) | |||||
# 检查encoder_layer_number是否合理 | |||||
encoder_layer_number = len(self.encoder.encoder.layer) | |||||
self.layers = list(map(int, layers.split(','))) | |||||
for layer in self.layers: | |||||
if layer<0: | |||||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
else: | |||||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | |||||
# 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] | |||||
print("Start to generating word pieces for word.") | |||||
self.word_to_wordpieces = [] | |||||
self.word_pieces_length = [] | |||||
for word, index in vocab: | |||||
if index == vocab.padding_idx: # pad是个特殊的符号 | |||||
word = '[PAD]' | |||||
elif index == vocab.unknown_idx: | |||||
word = '[UNK]' | |||||
word_pieces = self.tokenzier.wordpiece_tokenizer.tokenize(word) | |||||
word_pieces = self.tokenzier.convert_tokens_to_ids(word_pieces) | |||||
self.word_to_wordpieces.append(word_pieces) | |||||
self.word_pieces_length.append(len(word_pieces)) | |||||
self._cls_index = len(vocab) | |||||
self._sep_index = len(vocab) + 1 | |||||
self._pad_index = vocab.padding_idx | |||||
self._wordpiece_pad_index = self.tokenzier.convert_tokens_to_ids(['[PAD]'])[0] # 需要用于生成word_piece | |||||
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[CLS]'])) | |||||
self.word_to_wordpieces.append(self.tokenzier.convert_tokens_to_ids(['[SEP]'])) | |||||
self.word_to_wordpieces = np.array(self.word_to_wordpieces, dtype=int) | |||||
print("Successfully generate word pieces.") | |||||
def index_dataset(self, *datasets): | |||||
""" | |||||
使用bert的tokenizer将word_pieces这一列加入到datasets中,并将其设置为input。加入的word_pieces
已经包含了[CLS]与[SEP], 且将word_pieces这一列的pad value设置为了bert的pad value。
:param datasets: DataSet对象 | |||||
:return: | |||||
""" | |||||
def convert_words_to_word_pieces(words): | |||||
word_pieces = list(chain(*self.word_to_wordpieces[words].tolist())) | |||||
word_pieces = [self._cls_index] + word_pieces + [self._sep_index] | |||||
return word_pieces | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
dataset.apply_field(convert_words_to_word_pieces, field_name='words', new_field_name='word_pieces', | |||||
is_input=True) | |||||
dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) | |||||
except Exception as e: | |||||
print(f"Exception happens when processing the {index} dataset.") | |||||
raise e | |||||
def forward(self, word_pieces, token_type_ids=None): | |||||
""" | |||||
:param word_pieces: torch.LongTensor, batch_size x max_len | |||||
:param token_type_ids: torch.LongTensor, batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size或者num_layers x batch_size x (max_len+2) x hidden_size | |||||
""" | |||||
batch_size, max_len = word_pieces.size() | |||||
attn_masks = word_pieces.ne(self._pad_index) | |||||
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, | |||||
output_all_encoded_layers=True) | |||||
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | |||||
outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) | |||||
for l_index, l in enumerate(self.layers): | |||||
outputs[l_index] = bert_outputs[l] | |||||
return outputs | |||||
class BertWordPieceEncoder(nn.Module): | |||||
""" | |||||
可以通过读取vocabulary使用的Bert的Encoder。传入vocab,然后调用index_datasets方法在vocabulary中生成word piece的表示。 | |||||
:param vocab: Vocabulary对象,词表
:param model_dir_or_name: 预训练模型所在的目录,或者PRETRAINED_BERT_MODEL_DIR中已知的模型名称
:param layers: 输出哪些层的结果,以逗号分隔的层索引字符串,例如'-1'表示最后一层
:param requires_grad: 是否对Bert的参数进行训练(求梯度)
""" | |||||
def __init__(self, vocab:Vocabulary, model_dir_or_name:str='en-base', layers:str='-1', | |||||
requires_grad:bool=False): | |||||
super().__init__() | |||||
PRETRAIN_URL = _get_base_url('bert') | |||||
# TODO 修改 | |||||
PRETRAINED_BERT_MODEL_DIR = {'en-base': 'bert_en-80f95ea7.tar.gz', | |||||
'cn': 'elmo_cn.zip'} | |||||
if model_dir_or_name in PRETRAINED_BERT_MODEL_DIR: | |||||
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isdir(model_dir_or_name): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _WordPieceBertModel(model_dir=model_dir, vocab=vocab, layers=layers) | |||||
self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size | |||||
self.requires_grad = requires_grad | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters()]) | |||||
if len(requires_grads)==1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
param.requires_grad = value | |||||
@property | |||||
def embed_size(self): | |||||
return self._embed_size | |||||
def index_datasets(self, *datasets): | |||||
""" | |||||
使用Bert的tokenizer将datasets中的words转换为word_pieces这一列,并设置为input。
:param datasets: 一个或多个需要处理的DataSet
:return:
""" | |||||
self.model.index_dataset(*datasets) | |||||
def forward(self, words, token_type_ids=None): | |||||
""" | |||||
计算words的bert embedding表示。传入的words应当是由index_datasets生成的word piece的id序列,
其中已经包含了[CLS]与[SEP]对应的id。
:param words: batch_size x max_len | |||||
:param token_type_ids: batch_size x max_len, 用于区分前一句和后一句话 | |||||
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||||
""" | |||||
outputs = self.model(words, token_type_ids) | |||||
outputs = torch.cat([*outputs], dim=-1) | |||||
return outputs |
@@ -0,0 +1,774 @@ | |||||
""" | |||||
这个页面的代码大量参考了https://github.com/HIT-SCIR/ELMoForManyLangs/tree/master/elmoformanylangs | |||||
""" | |||||
from typing import Optional, Tuple, List, Callable | |||||
import os | |||||
import torch | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | |||||
from torch.nn.utils.rnn import PackedSequence, pad_packed_sequence | |||||
from ...core.vocabulary import Vocabulary | |||||
import json | |||||
from ..utils import get_dropout_mask | |||||
import codecs | |||||
class LstmCellWithProjection(torch.nn.Module): | |||||
""" | |||||
An LSTM with Recurrent Dropout and a projected and clipped hidden state and | |||||
memory. Note: this implementation is slower than the native Pytorch LSTM because | |||||
it cannot make use of CUDNN optimizations for stacked RNNs due to the
variational dropout and the custom nature of the cell state.
Parameters | |||||
---------- | |||||
input_size : ``int``, required. | |||||
The dimension of the inputs to the LSTM. | |||||
hidden_size : ``int``, required. | |||||
The dimension of the outputs of the LSTM. | |||||
cell_size : ``int``, required. | |||||
The dimension of the memory cell used for the LSTM. | |||||
go_forward: ``bool``, optional (default = True) | |||||
The direction in which the LSTM is applied to the sequence. | |||||
Forwards by default, or backwards if False. | |||||
recurrent_dropout_probability: ``float``, optional (default = 0.0) | |||||
The dropout probability to be used in a dropout scheme as stated in | |||||
`A Theoretically Grounded Application of Dropout in Recurrent Neural Networks | |||||
<https://arxiv.org/abs/1512.05287>`_ . Implementation wise, this simply | |||||
applies a fixed dropout mask per sequence to the recurrent connection of the | |||||
LSTM. | |||||
state_projection_clip_value: ``float``, optional, (default = None) | |||||
The magnitude with which to clip the hidden_state after projecting it. | |||||
memory_cell_clip_value: ``float``, optional, (default = None) | |||||
The magnitude with which to clip the memory cell. | |||||
Returns | |||||
------- | |||||
output_accumulator : ``torch.FloatTensor`` | |||||
The outputs of the LSTM for each timestep. A tensor of shape | |||||
(batch_size, max_timesteps, hidden_size) where for a given batch | |||||
element, all outputs past the sequence length for that batch are | |||||
zero tensors. | |||||
final_state: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` | |||||
The final (state, memory) states of the LSTM, with shape | |||||
(1, batch_size, hidden_size) and (1, batch_size, cell_size) | |||||
respectively. The first dimension is 1 in order to match the Pytorch | |||||
API for returning stacked LSTM states. | |||||
""" | |||||
def __init__(self, | |||||
input_size: int, | |||||
hidden_size: int, | |||||
cell_size: int, | |||||
go_forward: bool = True, | |||||
recurrent_dropout_probability: float = 0.0, | |||||
memory_cell_clip_value: Optional[float] = None, | |||||
state_projection_clip_value: Optional[float] = None) -> None: | |||||
super(LstmCellWithProjection, self).__init__() | |||||
# Required to be wrapped with a :class:`PytorchSeq2SeqWrapper`. | |||||
self.input_size = input_size | |||||
self.hidden_size = hidden_size | |||||
self.cell_size = cell_size | |||||
self.go_forward = go_forward | |||||
self.state_projection_clip_value = state_projection_clip_value | |||||
self.memory_cell_clip_value = memory_cell_clip_value | |||||
self.recurrent_dropout_probability = recurrent_dropout_probability | |||||
# We do the projections for all the gates all at once. | |||||
self.input_linearity = torch.nn.Linear(input_size, 4 * cell_size, bias=False) | |||||
self.state_linearity = torch.nn.Linear(hidden_size, 4 * cell_size, bias=True) | |||||
# Additional projection matrix for making the hidden state smaller. | |||||
self.state_projection = torch.nn.Linear(cell_size, hidden_size, bias=False) | |||||
self.reset_parameters() | |||||
def reset_parameters(self): | |||||
# Use sensible default initializations for parameters. | |||||
nn.init.orthogonal_(self.input_linearity.weight.data) | |||||
nn.init.orthogonal_(self.state_linearity.weight.data) | |||||
self.state_linearity.bias.data.fill_(0.0) | |||||
# Initialize forget gate biases to 1.0 as per An Empirical | |||||
# Exploration of Recurrent Network Architectures, (Jozefowicz, 2015). | |||||
self.state_linearity.bias.data[self.cell_size:2 * self.cell_size].fill_(1.0) | |||||
def forward(self, # pylint: disable=arguments-differ | |||||
inputs: torch.FloatTensor, | |||||
batch_lengths: List[int], | |||||
initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None): | |||||
""" | |||||
Parameters | |||||
---------- | |||||
inputs : ``torch.FloatTensor``, required. | |||||
A tensor of shape (batch_size, num_timesteps, input_size) | |||||
to apply the LSTM over. | |||||
batch_lengths : ``List[int]``, required. | |||||
A list of length batch_size containing the lengths of the sequences in batch. | |||||
initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) | |||||
A tuple (state, memory) representing the initial hidden state and memory | |||||
of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the | |||||
``memory`` has shape (1, batch_size, cell_size). | |||||
Returns | |||||
------- | |||||
output_accumulator : ``torch.FloatTensor`` | |||||
The outputs of the LSTM for each timestep. A tensor of shape | |||||
(batch_size, max_timesteps, hidden_size) where for a given batch | |||||
element, all outputs past the sequence length for that batch are | |||||
zero tensors. | |||||
final_state : ``Tuple[torch.FloatTensor, torch.FloatTensor]`` | |||||
A tuple (state, memory) representing the final hidden state and memory | |||||
of the LSTM. The ``state`` has shape (1, batch_size, hidden_size) and the | |||||
``memory`` has shape (1, batch_size, cell_size). | |||||
""" | |||||
batch_size = inputs.size()[0] | |||||
total_timesteps = inputs.size()[1] | |||||
# We have to use this '.data.new().fill_' pattern to create tensors with the correct | |||||
# type - forward has no knowledge of whether these are torch.Tensors or torch.cuda.Tensors. | |||||
output_accumulator = inputs.data.new(batch_size, | |||||
total_timesteps, | |||||
self.hidden_size).fill_(0) | |||||
if initial_state is None: | |||||
full_batch_previous_memory = inputs.data.new(batch_size, | |||||
self.cell_size).fill_(0) | |||||
full_batch_previous_state = inputs.data.new(batch_size, | |||||
self.hidden_size).fill_(0) | |||||
else: | |||||
full_batch_previous_state = initial_state[0].squeeze(0) | |||||
full_batch_previous_memory = initial_state[1].squeeze(0) | |||||
current_length_index = batch_size - 1 if self.go_forward else 0 | |||||
if self.recurrent_dropout_probability > 0.0 and self.training: | |||||
dropout_mask = get_dropout_mask(self.recurrent_dropout_probability, | |||||
full_batch_previous_state) | |||||
else: | |||||
dropout_mask = None | |||||
for timestep in range(total_timesteps): | |||||
# The index depends on which end we start. | |||||
index = timestep if self.go_forward else total_timesteps - timestep - 1 | |||||
# What we are doing here is finding the index into the batch dimension | |||||
# which we need to use for this timestep, because the sequences have | |||||
# variable length, so once the index is greater than the length of this | |||||
# particular batch sequence, we no longer need to do the computation for | |||||
# this sequence. The key thing to recognise here is that the batch inputs | |||||
# must be _ordered_ by length from longest (first in batch) to shortest | |||||
# (last) so initially, we are going forwards with every sequence and as we | |||||
# pass the index at which the shortest elements of the batch finish, | |||||
# we stop picking them up for the computation. | |||||
if self.go_forward: | |||||
while batch_lengths[current_length_index] <= index: | |||||
current_length_index -= 1 | |||||
# If we're going backwards, we are _picking up_ more indices. | |||||
else: | |||||
# First conditional: Are we already at the maximum number of elements in the batch? | |||||
# Second conditional: Does the next shortest sequence beyond the current batch | |||||
# index require computation use this timestep? | |||||
while current_length_index < (len(batch_lengths) - 1) and \ | |||||
batch_lengths[current_length_index + 1] > index: | |||||
current_length_index += 1 | |||||
# Actually get the slices of the batch which we | |||||
# need for the computation at this timestep. | |||||
# shape (batch_size, cell_size) | |||||
previous_memory = full_batch_previous_memory[0: current_length_index + 1].clone() | |||||
# Shape (batch_size, hidden_size) | |||||
previous_state = full_batch_previous_state[0: current_length_index + 1].clone() | |||||
# Shape (batch_size, input_size) | |||||
timestep_input = inputs[0: current_length_index + 1, index] | |||||
# Do the projections for all the gates all at once. | |||||
# Both have shape (batch_size, 4 * cell_size) | |||||
projected_input = self.input_linearity(timestep_input) | |||||
projected_state = self.state_linearity(previous_state) | |||||
# Main LSTM equations using relevant chunks of the big linear | |||||
# projections of the hidden state and inputs. | |||||
input_gate = torch.sigmoid(projected_input[:, (0 * self.cell_size):(1 * self.cell_size)] + | |||||
projected_state[:, (0 * self.cell_size):(1 * self.cell_size)]) | |||||
forget_gate = torch.sigmoid(projected_input[:, (1 * self.cell_size):(2 * self.cell_size)] + | |||||
projected_state[:, (1 * self.cell_size):(2 * self.cell_size)]) | |||||
memory_init = torch.tanh(projected_input[:, (2 * self.cell_size):(3 * self.cell_size)] + | |||||
projected_state[:, (2 * self.cell_size):(3 * self.cell_size)]) | |||||
output_gate = torch.sigmoid(projected_input[:, (3 * self.cell_size):(4 * self.cell_size)] + | |||||
projected_state[:, (3 * self.cell_size):(4 * self.cell_size)]) | |||||
memory = input_gate * memory_init + forget_gate * previous_memory | |||||
# Here is the non-standard part of this LSTM cell; first, we clip the | |||||
# memory cell, then we project the output of the timestep to a smaller size | |||||
# and again clip it. | |||||
if self.memory_cell_clip_value: | |||||
# pylint: disable=invalid-unary-operand-type | |||||
memory = torch.clamp(memory, -self.memory_cell_clip_value, self.memory_cell_clip_value) | |||||
# shape (current_length_index, cell_size) | |||||
pre_projection_timestep_output = output_gate * torch.tanh(memory) | |||||
# shape (current_length_index, hidden_size) | |||||
timestep_output = self.state_projection(pre_projection_timestep_output) | |||||
if self.state_projection_clip_value: | |||||
# pylint: disable=invalid-unary-operand-type | |||||
timestep_output = torch.clamp(timestep_output, | |||||
-self.state_projection_clip_value, | |||||
self.state_projection_clip_value) | |||||
# Only do dropout if the dropout prob is > 0.0 and we are in training mode. | |||||
if dropout_mask is not None: | |||||
timestep_output = timestep_output * dropout_mask[0: current_length_index + 1] | |||||
# We've been doing computation with less than the full batch, so here we create a new | |||||
# variable for the the whole batch at this timestep and insert the result for the | |||||
# relevant elements of the batch into it. | |||||
full_batch_previous_memory = full_batch_previous_memory.data.clone() | |||||
full_batch_previous_state = full_batch_previous_state.data.clone() | |||||
full_batch_previous_memory[0:current_length_index + 1] = memory | |||||
full_batch_previous_state[0:current_length_index + 1] = timestep_output | |||||
output_accumulator[0:current_length_index + 1, index] = timestep_output | |||||
# Mimic the pytorch API by returning state in the following shape: | |||||
# (num_layers * num_directions, batch_size, ...). As this | |||||
# LSTM cell cannot be stacked, the first dimension here is just 1. | |||||
final_state = (full_batch_previous_state.unsqueeze(0), | |||||
full_batch_previous_memory.unsqueeze(0)) | |||||
return output_accumulator, final_state | |||||
class LstmbiLm(nn.Module): | |||||
def __init__(self, config): | |||||
super(LstmbiLm, self).__init__() | |||||
self.config = config | |||||
self.encoder = nn.LSTM(self.config['encoder']['projection_dim'], | |||||
self.config['encoder']['dim'], | |||||
num_layers=self.config['encoder']['n_layers'], | |||||
bidirectional=True, | |||||
batch_first=True, | |||||
dropout=self.config['dropout']) | |||||
self.projection = nn.Linear(self.config['encoder']['dim'], self.config['encoder']['projection_dim'], bias=True) | |||||
def forward(self, inputs, seq_len): | |||||
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) | |||||
inputs = inputs[sort_idx] | |||||
inputs = nn.utils.rnn.pack_padded_sequence(inputs, sort_lens, batch_first=True) | |||||
output, hx = self.encoder(inputs, None) # -> [N,L,C] | |||||
output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) | |||||
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) | |||||
output = output[unsort_idx] | |||||
forward, backward = output.split(self.config['encoder']['dim'], 2) | |||||
return torch.cat([self.projection(forward), self.projection(backward)], dim=2) | |||||
class ElmobiLm(torch.nn.Module): | |||||
def __init__(self, config): | |||||
super(ElmobiLm, self).__init__() | |||||
self.config = config | |||||
input_size = config['encoder']['projection_dim'] | |||||
hidden_size = config['encoder']['projection_dim'] | |||||
cell_size = config['encoder']['dim'] | |||||
num_layers = config['encoder']['n_layers'] | |||||
memory_cell_clip_value = config['encoder']['cell_clip'] | |||||
state_projection_clip_value = config['encoder']['proj_clip'] | |||||
recurrent_dropout_probability = config['dropout'] | |||||
self.input_size = input_size | |||||
self.hidden_size = hidden_size | |||||
self.num_layers = num_layers | |||||
self.cell_size = cell_size | |||||
forward_layers = [] | |||||
backward_layers = [] | |||||
lstm_input_size = input_size | |||||
go_forward = True | |||||
for layer_index in range(num_layers): | |||||
forward_layer = LstmCellWithProjection(lstm_input_size, | |||||
hidden_size, | |||||
cell_size, | |||||
go_forward, | |||||
recurrent_dropout_probability, | |||||
memory_cell_clip_value, | |||||
state_projection_clip_value) | |||||
backward_layer = LstmCellWithProjection(lstm_input_size, | |||||
hidden_size, | |||||
cell_size, | |||||
not go_forward, | |||||
recurrent_dropout_probability, | |||||
memory_cell_clip_value, | |||||
state_projection_clip_value) | |||||
lstm_input_size = hidden_size | |||||
self.add_module('forward_layer_{}'.format(layer_index), forward_layer) | |||||
self.add_module('backward_layer_{}'.format(layer_index), backward_layer) | |||||
forward_layers.append(forward_layer) | |||||
backward_layers.append(backward_layer) | |||||
self.forward_layers = forward_layers | |||||
self.backward_layers = backward_layers | |||||
def forward(self, inputs, seq_len): | |||||
""" | |||||
:param inputs: batch_size x max_len x embed_size | |||||
:param seq_len: batch_size | |||||
:return: torch.FloatTensor. num_layers x batch_size x max_len x hidden_size | |||||
""" | |||||
sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True) | |||||
inputs = inputs[sort_idx] | |||||
inputs = nn.utils.rnn.pack_padded_sequence(inputs, sort_lens, batch_first=True) | |||||
output, _ = self._lstm_forward(inputs, None) | |||||
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False) | |||||
output = output[:, unsort_idx] | |||||
return output | |||||
def _lstm_forward(self, | |||||
inputs: PackedSequence, | |||||
initial_state: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> \ | |||||
Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: | |||||
""" | |||||
Parameters | |||||
---------- | |||||
inputs : ``PackedSequence``, required. | |||||
A batch first ``PackedSequence`` to run the stacked LSTM over. | |||||
initial_state : ``Tuple[torch.Tensor, torch.Tensor]``, optional, (default = None) | |||||
A tuple (state, memory) representing the initial hidden state and memory | |||||
of the LSTM, with shape (num_layers, batch_size, 2 * hidden_size) and | |||||
(num_layers, batch_size, 2 * cell_size) respectively. | |||||
Returns | |||||
------- | |||||
output_sequence : ``torch.FloatTensor`` | |||||
The encoded sequence of shape (num_layers, batch_size, sequence_length, hidden_size) | |||||
final_states: ``Tuple[torch.FloatTensor, torch.FloatTensor]`` | |||||
The per-layer final (state, memory) states of the LSTM, with shape | |||||
(num_layers, batch_size, 2 * hidden_size) and (num_layers, batch_size, 2 * cell_size) | |||||
respectively. The last dimension is duplicated because it contains the state/memory | |||||
for both the forward and backward layers. | |||||
""" | |||||
if initial_state is None: | |||||
hidden_states: List[Optional[Tuple[torch.Tensor, | |||||
torch.Tensor]]] = [None] * len(self.forward_layers) | |||||
elif initial_state[0].size()[0] != len(self.forward_layers): | |||||
raise Exception("Initial states were passed to forward() but the number of " | |||||
"initial states does not match the number of layers.") | |||||
else: | |||||
hidden_states = list(zip(initial_state[0].split(1, 0), initial_state[1].split(1, 0))) | |||||
inputs, batch_lengths = pad_packed_sequence(inputs, batch_first=True) | |||||
forward_output_sequence = inputs | |||||
backward_output_sequence = inputs | |||||
final_states = [] | |||||
sequence_outputs = [] | |||||
for layer_index, state in enumerate(hidden_states): | |||||
forward_layer = getattr(self, 'forward_layer_{}'.format(layer_index)) | |||||
backward_layer = getattr(self, 'backward_layer_{}'.format(layer_index)) | |||||
forward_cache = forward_output_sequence | |||||
backward_cache = backward_output_sequence | |||||
if state is not None: | |||||
forward_hidden_state, backward_hidden_state = state[0].split(self.hidden_size, 2) | |||||
forward_memory_state, backward_memory_state = state[1].split(self.cell_size, 2) | |||||
forward_state = (forward_hidden_state, forward_memory_state) | |||||
backward_state = (backward_hidden_state, backward_memory_state) | |||||
else: | |||||
forward_state = None | |||||
backward_state = None | |||||
forward_output_sequence, forward_state = forward_layer(forward_output_sequence, | |||||
batch_lengths, | |||||
forward_state) | |||||
backward_output_sequence, backward_state = backward_layer(backward_output_sequence, | |||||
batch_lengths, | |||||
backward_state) | |||||
# Skip connections, just adding the input to the output. | |||||
if layer_index != 0: | |||||
forward_output_sequence += forward_cache | |||||
backward_output_sequence += backward_cache | |||||
sequence_outputs.append(torch.cat([forward_output_sequence, | |||||
backward_output_sequence], -1)) | |||||
# Append the state tuples in a list, so that we can return | |||||
# the final states for all the layers. | |||||
final_states.append((torch.cat([forward_state[0], backward_state[0]], -1), | |||||
torch.cat([forward_state[1], backward_state[1]], -1))) | |||||
stacked_sequence_outputs: torch.FloatTensor = torch.stack(sequence_outputs) | |||||
# Stack the hidden state and memory for each layer into 2 tensors of shape | |||||
# (num_layers, batch_size, hidden_size) and (num_layers, batch_size, cell_size) | |||||
# respectively. | |||||
final_hidden_states, final_memory_states = zip(*final_states) | |||||
final_state_tuple: Tuple[torch.FloatTensor, | |||||
torch.FloatTensor] = (torch.cat(final_hidden_states, 0), | |||||
torch.cat(final_memory_states, 0)) | |||||
return stacked_sequence_outputs, final_state_tuple | |||||
class LstmTokenEmbedder(nn.Module): | |||||
def __init__(self, config, word_emb_layer, char_emb_layer): | |||||
super(LstmTokenEmbedder, self).__init__() | |||||
self.config = config | |||||
self.word_emb_layer = word_emb_layer | |||||
self.char_emb_layer = char_emb_layer | |||||
self.output_dim = config['encoder']['projection_dim'] | |||||
emb_dim = 0 | |||||
if word_emb_layer is not None: | |||||
emb_dim += word_emb_layer.n_d | |||||
if char_emb_layer is not None: | |||||
emb_dim += char_emb_layer.n_d * 2 | |||||
self.char_lstm = nn.LSTM(char_emb_layer.n_d, char_emb_layer.n_d, num_layers=1, bidirectional=True, | |||||
batch_first=True, dropout=config['dropout']) | |||||
self.projection = nn.Linear(emb_dim, self.output_dim, bias=True) | |||||
def forward(self, words, chars): | |||||
embs = [] | |||||
if self.word_emb_layer is not None: | |||||
word_emb = self.word_emb_layer(words) | |||||
embs.append(word_emb) | |||||
if self.char_emb_layer is not None: | |||||
batch_size, seq_len, _ = chars.shape | |||||
chars = chars.view(batch_size * seq_len, -1) | |||||
chars_emb = self.char_emb_layer(chars) | |||||
# TODO 这里应该要考虑seq_len的问题 | |||||
_, (chars_outputs, __) = self.char_lstm(chars_emb) | |||||
chars_outputs = chars_outputs.contiguous().view(batch_size, seq_len, self.config['token_embedder']['char_dim'] * 2) | |||||
embs.append(chars_outputs) | |||||
token_embedding = torch.cat(embs, dim=2) | |||||
return self.projection(token_embedding) | |||||
class ConvTokenEmbedder(nn.Module): | |||||
def __init__(self, config, word_emb_layer, char_emb_layer): | |||||
super(ConvTokenEmbedder, self).__init__() | |||||
self.config = config | |||||
self.word_emb_layer = word_emb_layer | |||||
self.char_emb_layer = char_emb_layer | |||||
self.output_dim = config['encoder']['projection_dim'] | |||||
self.emb_dim = 0 | |||||
if word_emb_layer is not None: | |||||
self.emb_dim += word_emb_layer.weight.size(1) | |||||
if char_emb_layer is not None: | |||||
self.convolutions = [] | |||||
cnn_config = config['token_embedder'] | |||||
filters = cnn_config['filters'] | |||||
char_embed_dim = cnn_config['char_dim'] | |||||
for i, (width, num) in enumerate(filters): | |||||
conv = torch.nn.Conv1d( | |||||
in_channels=char_embed_dim, | |||||
out_channels=num, | |||||
kernel_size=width, | |||||
bias=True | |||||
) | |||||
self.convolutions.append(conv) | |||||
self.convolutions = nn.ModuleList(self.convolutions) | |||||
self.n_filters = sum(f[1] for f in filters) | |||||
self.n_highway = cnn_config['n_highway'] | |||||
self.highways = Highway(self.n_filters, self.n_highway, activation=torch.nn.functional.relu) | |||||
self.emb_dim += self.n_filters | |||||
self.projection = nn.Linear(self.emb_dim, self.output_dim, bias=True) | |||||
def forward(self, words, chars): | |||||
embs = [] | |||||
if self.word_emb_layer is not None: | |||||
word_emb = self.word_emb_layer(words) | |||||
embs.append(word_emb) | |||||
if self.char_emb_layer is not None: | |||||
batch_size, seq_len, _ = chars.size() | |||||
chars = chars.view(batch_size * seq_len, -1) | |||||
character_embedding = self.char_emb_layer(chars) | |||||
character_embedding = torch.transpose(character_embedding, 1, 2) | |||||
cnn_config = self.config['token_embedder'] | |||||
if cnn_config['activation'] == 'tanh': | |||||
activation = torch.nn.functional.tanh | |||||
elif cnn_config['activation'] == 'relu': | |||||
activation = torch.nn.functional.relu | |||||
else: | |||||
raise Exception("Unknown activation") | |||||
convs = [] | |||||
for i in range(len(self.convolutions)): | |||||
convolved = self.convolutions[i](character_embedding) | |||||
# (batch_size * sequence_length, n_filters for this width) | |||||
convolved, _ = torch.max(convolved, dim=-1) | |||||
convolved = activation(convolved) | |||||
convs.append(convolved) | |||||
char_emb = torch.cat(convs, dim=-1) | |||||
char_emb = self.highways(char_emb) | |||||
embs.append(char_emb.view(batch_size, -1, self.n_filters)) | |||||
token_embedding = torch.cat(embs, dim=2) | |||||
return self.projection(token_embedding) | |||||
class Highway(torch.nn.Module): | |||||
""" | |||||
A `Highway layer <https://arxiv.org/abs/1505.00387>`_ does a gated combination of a linear | |||||
transformation and a non-linear transformation of its input. :math:`y = g * x + (1 - g) * | |||||
f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise | |||||
non-linearity, and :math:`g` is an element-wise gate, computed as :math:`sigmoid(B(x))`. | |||||
This module will apply a fixed number of highway layers to its input, returning the final | |||||
result. | |||||
Parameters | |||||
---------- | |||||
input_dim : ``int`` | |||||
The dimensionality of :math:`x`. We assume the input has shape ``(batch_size, | |||||
input_dim)``. | |||||
num_layers : ``int``, optional (default=``1``) | |||||
The number of highway layers to apply to the input. | |||||
activation : ``Callable[[torch.Tensor], torch.Tensor]``, optional (default=``torch.nn.functional.relu``) | |||||
The non-linearity to use in the highway layers. | |||||
""" | |||||
def __init__(self, | |||||
input_dim: int, | |||||
num_layers: int = 1, | |||||
activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu) -> None: | |||||
super(Highway, self).__init__() | |||||
self._input_dim = input_dim | |||||
self._layers = torch.nn.ModuleList([torch.nn.Linear(input_dim, input_dim * 2) | |||||
for _ in range(num_layers)]) | |||||
self._activation = activation | |||||
for layer in self._layers: | |||||
# We should bias the highway layer to just carry its input forward. We do that by | |||||
# setting the bias on `B(x)` to be positive, because that means `g` will be biased to | |||||
# be high, so we will carry the input forward. The bias on `B(x)` is the second half | |||||
# of the bias vector in each Linear layer. | |||||
layer.bias[input_dim:].data.fill_(1) | |||||
def forward(self, inputs: torch.Tensor) -> torch.Tensor: # pylint: disable=arguments-differ | |||||
current_input = inputs | |||||
for layer in self._layers: | |||||
projected_input = layer(current_input) | |||||
linear_part = current_input | |||||
# NOTE: if you modify this, think about whether you should modify the initialization | |||||
# above, too. | |||||
nonlinear_part = projected_input[:, (0 * self._input_dim):(1 * self._input_dim)] | |||||
gate = projected_input[:, (1 * self._input_dim):(2 * self._input_dim)] | |||||
nonlinear_part = self._activation(nonlinear_part) | |||||
gate = torch.sigmoid(gate) | |||||
current_input = gate * linear_part + (1 - gate) * nonlinear_part | |||||
return current_input | |||||
class _ElmoModel(nn.Module): | |||||
""" | |||||
该Module是ElmoEmbedding中进行所有的heavy lifting的地方。做的工作,包括 | |||||
(1) 根据配置,加载模型; | |||||
(2) 根据vocab,对模型中的embedding进行调整. 并将其正确初始化 | |||||
(3) 保存一个words与chars的对应转换,获取时自动进行相应的转换 | |||||
(4) 设计一个保存token的embedding,允许缓存word的表示。 | |||||
""" | |||||
def __init__(self, model_dir:str, vocab:Vocabulary=None, cache_word_reprs:bool=False): | |||||
super(_ElmoModel, self).__init__() | |||||
config = json.load(open(os.path.join(model_dir, 'structure_config.json'), 'r')) | |||||
self.config = config | |||||
OOV_TAG = '<oov>' | |||||
PAD_TAG = '<pad>' | |||||
BOS_TAG = '<bos>' | |||||
EOS_TAG = '<eos>' | |||||
BOW_TAG = '<bow>' | |||||
EOW_TAG = '<eow>' | |||||
# 将加载embedding放到这里 | |||||
token_embedder_states = torch.load(os.path.join(model_dir, 'token_embedder.pkl'), map_location='cpu') | |||||
# For the model trained with word form word encoder. | |||||
if config['token_embedder']['word_dim'] > 0: | |||||
word_lexicon = {} | |||||
with codecs.open(os.path.join(model_dir, 'word.dic'), 'r', encoding='utf-8') as fpi: | |||||
for line in fpi: | |||||
tokens = line.strip().split('\t') | |||||
if len(tokens) == 1: | |||||
tokens.insert(0, '\u3000') | |||||
token, i = tokens | |||||
word_lexicon[token] = int(i) | |||||
# 做一些sanity check | |||||
for special_word in [PAD_TAG, OOV_TAG, BOS_TAG, EOS_TAG]: | |||||
assert special_word in word_lexicon, f"{special_word} not found in word.dic." | |||||
# 根据vocab调整word_embedding | |||||
pre_word_embedding = token_embedder_states.pop('word_emb_layer.embedding.weight') | |||||
word_emb_layer = nn.Embedding(len(vocab)+2, config['token_embedder']['word_dim']) #多增加两个是为了<bos>与<eos> | |||||
found_word_count = 0 | |||||
for word, index in vocab: | |||||
if index == vocab.unknown_idx: # 因为fastNLP的unknow是<unk> 而在这里是<oov>所以ugly强制适配一下 | |||||
index_in_pre = word_lexicon[OOV_TAG] | |||||
found_word_count += 1 | |||||
elif index == vocab.padding_idx: # 需要pad对齐 | |||||
index_in_pre = word_lexicon[PAD_TAG] | |||||
found_word_count += 1 | |||||
elif word in word_lexicon: | |||||
index_in_pre = word_lexicon[word] | |||||
found_word_count += 1 | |||||
else: | |||||
index_in_pre = word_lexicon[OOV_TAG] | |||||
word_emb_layer.weight.data[index] = pre_word_embedding[index_in_pre] | |||||
print(f"{found_word_count} out of {len(vocab)} words were found in pretrained elmo embedding.") | |||||
word_emb_layer.weight.data[-1] = pre_word_embedding[word_lexicon[EOS_TAG]] | |||||
word_emb_layer.weight.data[-2] = pre_word_embedding[word_lexicon[BOS_TAG]] | |||||
self.word_vocab = vocab | |||||
else: | |||||
word_emb_layer = None | |||||
# For the model trained with character-based word encoder. | |||||
if config['token_embedder']['char_dim'] > 0: | |||||
char_lexicon = {} | |||||
with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: | |||||
for line in fpi: | |||||
tokens = line.strip().split('\t') | |||||
if len(tokens) == 1: | |||||
tokens.insert(0, '\u3000') | |||||
token, i = tokens | |||||
char_lexicon[token] = int(i) | |||||
# 做一些sanity check | |||||
for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: | |||||
assert special_word in char_lexicon, f"{special_word} not found in char.dic." | |||||
# 从vocab中构建char_vocab | |||||
char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) | |||||
# 需要保证<bow>与<eow>在里面 | |||||
char_vocab.add_word(BOW_TAG) | |||||
char_vocab.add_word(EOW_TAG) | |||||
for word, index in vocab: | |||||
char_vocab.add_word_lst(list(word)) | |||||
# 保证<eos>, <bos>也在 | |||||
char_vocab.add_word_lst(list(BOS_TAG)) | |||||
char_vocab.add_word_lst(list(EOS_TAG)) | |||||
# 根据char_lexicon调整 | |||||
char_emb_layer = nn.Embedding(len(char_vocab), int(config['token_embedder']['char_dim'])) | |||||
pre_char_embedding = token_embedder_states.pop('char_emb_layer.embedding.weight') | |||||
found_char_count = 0 | |||||
for char, index in char_vocab: # 调整character embedding | |||||
if char in char_lexicon: | |||||
index_in_pre = char_lexicon.get(char) | |||||
found_char_count += 1 | |||||
else: | |||||
index_in_pre = char_lexicon[OOV_TAG] | |||||
char_emb_layer.weight.data[index] = pre_char_embedding[index_in_pre] | |||||
print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") | |||||
# 生成words到chars的映射 | |||||
if config['token_embedder']['name'].lower() == 'cnn': | |||||
max_chars = config['token_embedder']['max_characters_per_token'] | |||||
elif config['token_embedder']['name'].lower() == 'lstm': | |||||
max_chars = max(map(lambda x: len(x[0]), vocab)) + 2 # 需要补充两个<bow>与<eow> | |||||
else: | |||||
raise ValueError('Unknown token_embedder: {0}'.format(config['token_embedder']['name'])) | |||||
# 增加<bos>, <eos>所以加2. | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab)+2, max_chars), | |||||
fill_value=char_vocab.to_index(PAD_TAG), dtype=torch.long), | |||||
requires_grad=False) | |||||
for word, index in vocab: | |||||
if len(word)+2>max_chars: | |||||
word = word[:max_chars-2] | |||||
if index==vocab.padding_idx: # 如果是pad的话,需要和给定的对齐 | |||||
word = PAD_TAG | |||||
elif index==vocab.unknown_idx: | |||||
word = OOV_TAG | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids)) | |||||
self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) | |||||
for index, word in enumerate([BOS_TAG, EOS_TAG]): # 加上<eos>, <bos> | |||||
if len(word)+2>max_chars: | |||||
word = word[:max_chars-2] | |||||
char_ids = [char_vocab.to_index(BOW_TAG)] + [char_vocab.to_index(c) for c in word] + [char_vocab.to_index(EOW_TAG)] | |||||
char_ids += [char_vocab.to_index(PAD_TAG)]*(max_chars-len(char_ids)) | |||||
self.words_to_chars_embedding[index+len(vocab)] = torch.LongTensor(char_ids) | |||||
self.char_vocab = char_vocab | |||||
else: | |||||
char_emb_layer = None | |||||
if config['token_embedder']['name'].lower() == 'cnn': | |||||
self.token_embedder = ConvTokenEmbedder( | |||||
config, word_emb_layer, char_emb_layer) | |||||
elif config['token_embedder']['name'].lower() == 'lstm': | |||||
self.token_embedder = LstmTokenEmbedder( | |||||
config, word_emb_layer, char_emb_layer) | |||||
self.token_embedder.load_state_dict(token_embedder_states, strict=False) | |||||
self.output_dim = config['encoder']['projection_dim'] | |||||
if config['encoder']['name'].lower() == 'elmo': | |||||
self.encoder = ElmobiLm(config) | |||||
elif config['encoder']['name'].lower() == 'lstm': | |||||
self.encoder = LstmbiLm(config) | |||||
self.encoder.load_state_dict(torch.load(os.path.join(model_dir, 'encoder.pkl'), | |||||
map_location='cpu')) | |||||
self.bos_index = len(vocab) | |||||
self.eos_index = len(vocab) + 1 | |||||
self._pad_index = vocab.padding_idx | |||||
if cache_word_reprs: | |||||
if config['token_embedder']['char_dim']>0: # 只有在使用了chars的情况下有用 | |||||
print("Start to generate cache word representations.") | |||||
batch_size = 320 | |||||
num_batches = self.words_to_chars_embedding.size(0)//batch_size + \ | |||||
int(self.words_to_chars_embedding.size(0)%batch_size!=0) | |||||
self.cached_word_embedding = nn.Embedding(self.words_to_chars_embedding.size(0), | |||||
config['encoder']['projection_dim']) | |||||
with torch.no_grad(): | |||||
for i in range(num_batches): | |||||
words = torch.arange(i*batch_size, min((i+1)*batch_size, self.words_to_chars_embedding.size(0))).long() | |||||
chars = self.words_to_chars_embedding[words].unsqueeze(1) # batch_size x 1 x max_chars | |||||
word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] | |||||
self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) | |||||
print("Finish generating cached word representations. Going to delete the character encoder.") | |||||
del self.token_embedder, self.words_to_chars_embedding | |||||
else: | |||||
print("There is no need to cache word representations, since no character information is used.") | |||||
def forward(self, words): | |||||
""" | |||||
:param words: batch_size x max_len | |||||
:return: num_layers x batch_size x max_len x hidden_size | |||||
""" | |||||
# 扩展<bos>, <eos> | |||||
batch_size, max_len = words.size() | |||||
expanded_words = words.new_zeros(batch_size, max_len + 2) # 因为pad一定为0, | |||||
seq_len = words.ne(self._pad_index).sum(dim=-1) | |||||
expanded_words[:, 1:-1] = words | |||||
expanded_words[:, 0].fill_(self.bos_index) | |||||
expanded_words[torch.arange(batch_size).to(words), seq_len+1] = self.eos_index | |||||
seq_len = seq_len + 2 | |||||
if hasattr(self, 'cached_word_embedding'): | |||||
token_embedding = self.cached_word_embedding(expanded_words) | |||||
else: | |||||
if hasattr(self, 'words_to_chars_embedding'): | |||||
chars = self.words_to_chars_embedding[expanded_words] | |||||
else: | |||||
chars = None | |||||
token_embedding = self.token_embedder(expanded_words, chars) | |||||
if self.config['encoder']['name'] == 'elmo': | |||||
encoder_output = self.encoder(token_embedding, seq_len) | |||||
sz = encoder_output.size() | |||||
token_embedding = torch.cat([token_embedding, token_embedding], dim=2).view(1, sz[1], sz[2], sz[3]) | |||||
encoder_output = torch.cat([token_embedding, encoder_output], dim=0) | |||||
elif self.config['encoder']['name'] == 'lstm': | |||||
encoder_output = self.encoder(token_embedding, seq_len) | |||||
else: | |||||
raise ValueError('Unknown encoder: {0}'.format(self.config['encoder']['name'])) | |||||
# 删除<eos>, <bos>. 这里没有精确地删除,但应该也不会影响最后的结果了。 | |||||
encoder_output = encoder_output[:, :, 1:-1] | |||||
return encoder_output |
@@ -269,8 +269,9 @@ class BertModel(nn.Module): | |||||
attention_probs_dropout_prob=0.1, | attention_probs_dropout_prob=0.1, | ||||
max_position_embeddings=512, | max_position_embeddings=512, | ||||
type_vocab_size=2, | type_vocab_size=2, | ||||
initializer_range=0.02, **kwargs): | |||||
initializer_range=0.02): | |||||
super(BertModel, self).__init__() | super(BertModel, self).__init__() | ||||
self.hidden_size = hidden_size | |||||
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, | self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings, | ||||
type_vocab_size, hidden_dropout_prob) | type_vocab_size, hidden_dropout_prob) | ||||
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads, | self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads, | ||||
@@ -22,10 +22,10 @@ class ConvolutionCharEncoder(nn.Module): | |||||
:param initial_method: 初始化参数的方式, 默认为`xavier normal` | :param initial_method: 初始化参数的方式, 默认为`xavier normal` | ||||
""" | """ | ||||
def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None): | |||||
def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None): | |||||
super(ConvolutionCharEncoder, self).__init__() | super(ConvolutionCharEncoder, self).__init__() | ||||
self.convs = nn.ModuleList([ | self.convs = nn.ModuleList([ | ||||
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) | |||||
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, kernels[i]//2)) | |||||
for i in range(len(kernels))]) | for i in range(len(kernels))]) | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
@@ -5,9 +5,6 @@ import torch | |||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from ..utils import initial_parameter | |||||
class ConvMaxpool(nn.Module): | class ConvMaxpool(nn.Module): | ||||
""" | """ | ||||
别名::class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.conv_maxpool.ConvMaxpool` | 别名::class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.conv_maxpool.ConvMaxpool` | ||||
@@ -19,20 +16,15 @@ class ConvMaxpool(nn.Module): | |||||
:param int in_channels: 输入channel的大小,一般是embedding的维度; 或encoder的output维度 | :param int in_channels: 输入channel的大小,一般是embedding的维度; 或encoder的output维度 | ||||
:param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 | :param int,tuple(int) out_channels: 输出channel的数量。如果为list,则需要与kernel_sizes的数量保持一致 | ||||
:param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | ||||
:param int stride: 见pytorch Conv1D文档。所有kernel共享一个stride。 | |||||
:param int padding: 见pytorch Conv1D文档。所有kernel共享一个padding。 | |||||
:param int dilation: 见pytorch Conv1D文档。所有kernel共享一个dilation。 | |||||
:param int groups: 见pytorch Conv1D文档。所有kernel共享一个groups。 | |||||
:param bool bias: 见pytorch Conv1D文档。所有kernel共享一个bias。 | |||||
:param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh | :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh | ||||
:param str initial_method: str。 | |||||
""" | """ | ||||
def __init__(self, in_channels, out_channels, kernel_sizes, | |||||
stride=1, padding=0, dilation=1, | |||||
groups=1, bias=True, activation="relu", initial_method=None): | |||||
def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"): | |||||
super(ConvMaxpool, self).__init__() | super(ConvMaxpool, self).__init__() | ||||
for kernel_size in kernel_sizes: | |||||
assert kernel_size%2==1, "kernel size has to be odd numbers." | |||||
# convolution | # convolution | ||||
if isinstance(kernel_sizes, (list, tuple, int)): | if isinstance(kernel_sizes, (list, tuple, int)): | ||||
if isinstance(kernel_sizes, int) and isinstance(out_channels, int): | if isinstance(kernel_sizes, int) and isinstance(out_channels, int): | ||||
@@ -49,11 +41,11 @@ class ConvMaxpool(nn.Module): | |||||
in_channels=in_channels, | in_channels=in_channels, | ||||
out_channels=oc, | out_channels=oc, | ||||
kernel_size=ks, | kernel_size=ks, | ||||
stride=stride, | |||||
padding=padding, | |||||
dilation=dilation, | |||||
groups=groups, | |||||
bias=bias) | |||||
stride=1, | |||||
padding=ks//2, | |||||
dilation=1, | |||||
groups=1, | |||||
bias=None) | |||||
for oc, ks in zip(out_channels, kernel_sizes)]) | for oc, ks in zip(out_channels, kernel_sizes)]) | ||||
else: | else: | ||||
@@ -70,9 +62,7 @@ class ConvMaxpool(nn.Module): | |||||
else: | else: | ||||
raise Exception( | raise Exception( | ||||
"Undefined activation function: choose from: relu, tanh, sigmoid") | "Undefined activation function: choose from: relu, tanh, sigmoid") | ||||
initial_parameter(self, initial_method) | |||||
def forward(self, x, mask=None): | def forward(self, x, mask=None): | ||||
""" | """ | ||||
@@ -86,7 +76,7 @@ class ConvMaxpool(nn.Module): | |||||
xs = [self.activation(conv(x)) for conv in self.convs] # [[N,C,L], ...] | xs = [self.activation(conv(x)) for conv in self.convs] # [[N,C,L], ...] | ||||
if mask is not None: | if mask is not None: | ||||
mask = mask.unsqueeze(1) # B x 1 x L | mask = mask.unsqueeze(1) # B x 1 x L | ||||
xs = [x.masked_fill_(mask, float('-inf')) for x in xs] | |||||
xs = [x.masked_fill_(mask.eq(0), float('-inf')) for x in xs] | |||||
# max-pooling | # max-pooling | ||||
xs = [F.max_pool1d(input=i, kernel_size=i.size(2)).squeeze(2) | xs = [F.max_pool1d(input=i, kernel_size=i.size(2)).squeeze(2) | ||||
for i in xs] # [[N, C], ...] | for i in xs] # [[N, C], ...] | ||||
@@ -3,48 +3,821 @@ __all__ = [ | |||||
] | ] | ||||
import torch.nn as nn | import torch.nn as nn | ||||
from ..utils import get_embeddings | from ..utils import get_embeddings | ||||
from .lstm import LSTM | |||||
from ... import Vocabulary | |||||
from abc import abstractmethod | |||||
import torch | |||||
from ...io import EmbedLoader | |||||
import torch.nn.functional as F | |||||
import os | |||||
from ._elmo import _ElmoModel | |||||
from ...io.file_utils import cached_path, _get_base_url | |||||
from ._bert import _WordBertModel | |||||
from typing import List | |||||
from ... import DataSet, Batch, SequentialSampler | |||||
from ...core.utils import _move_model_to_device, _get_model_device | |||||
class Embedding(nn.Embedding): | |||||
class Embedding(nn.Module): | |||||
""" | """ | ||||
别名::class:`fastNLP.modules.Embedding` :class:`fastNLP.modules.encoder.embedding.Embedding` | 别名::class:`fastNLP.modules.Embedding` :class:`fastNLP.modules.encoder.embedding.Embedding` | ||||
Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" | Embedding组件. 可以通过self.num_embeddings获取词表大小; self.embedding_dim获取embedding的维度""" | ||||
def __init__(self, init_embed, padding_idx=None, dropout=0.0, sparse=False, max_norm=None, norm_type=2, | |||||
scale_grad_by_freq=False): | |||||
def __init__(self, init_embed, dropout=0.0): | |||||
""" | """ | ||||
:param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), | :param tuple(int,int),torch.FloatTensor,nn.Embedding,numpy.ndarray init_embed: Embedding的大小(传入tuple(int, int), | ||||
第一个int为vocab_zie, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding | |||||
:param None,int padding_idx: 该index的Embedding将一直为0. | |||||
第一个int为vocab_size, 第二个int为embed_dim); 如果为Tensor, Embedding, ndarray等则直接使用该值初始化Embedding; | |||||
也可以传入TokenEmbedding对象 | |||||
:param float dropout: 对Embedding的输出的dropout。 | :param float dropout: 对Embedding的输出的dropout。 | ||||
:param bool sparse: 如果为True,则对Embedding的梯度将是sparse的,参考Pytorch Embedding获取更多信息。 | |||||
:param None,float max_norm: 每个vector最大的norm能为多大 | |||||
:param int norm_type: norm的类型 | |||||
:param bool scale_grad_by_freq: 如果为True,将会把梯度除以这个词出现的次数. | |||||
""" | """ | ||||
embed = get_embeddings(init_embed) | |||||
num_embeddings, embedding_dim = embed.weight.size() | |||||
super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, | |||||
max_norm=max_norm, norm_type=norm_type, scale_grad_by_freq=scale_grad_by_freq, | |||||
sparse=sparse, _weight=embed.weight.data) | |||||
del embed | |||||
super(Embedding, self).__init__() | |||||
self.embed = get_embeddings(init_embed) | |||||
self.dropout = nn.Dropout(dropout) | self.dropout = nn.Dropout(dropout) | ||||
if not isinstance(self.embed, TokenEmbedding): | |||||
self._embed_size = self.embed.weight.size(1) | |||||
else: | |||||
self._embed_size = self.embed.embed_size | |||||
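# Illustrative usage sketch (hypothetical sizes): init_embed may be a (vocab_size, embed_dim)
# tuple, a pre-built tensor/ndarray, or a TokenEmbedding instance.
#   embed = Embedding((10000, 100), dropout=0.1)
#   out = embed(torch.LongTensor([[2, 5, 7]]))  # shape (1, 3, 100)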
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.LongTensor x: [batch, seq_len] | :param torch.LongTensor x: [batch, seq_len] | ||||
:return: torch.Tensor : [batch, seq_len, embed_dim] | :return: torch.Tensor : [batch, seq_len, embed_dim] | ||||
""" | """ | ||||
x = super().forward(x) | |||||
x = self.embed(x) | |||||
return self.dropout(x) | return self.dropout(x) | ||||
@property | |||||
def embed_size(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def embedding_dim(self) -> int: | |||||
return self._embed_size | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
if not isinstance(self.embed, TokenEmbedding): | |||||
return self.embed.weight.requires_grad | |||||
else: | |||||
return self.embed.requires_grad | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
if not isinstance(self.embed, TokenEmbedding): | |||||
self.embed.weight.requires_grad = value | |||||
else: | |||||
self.embed.requires_grad = value | |||||
@property | |||||
def size(self): | |||||
if isinstance(self.embed, TokenEmbedding): | |||||
return torch.Size([len(self.embed._word_vocab), self.embed.embed_size]) | |||||
else: | |||||
return self.embed.weight.size() | |||||
class TokenEmbedding(nn.Module): | |||||
def __init__(self, vocab): | |||||
super(TokenEmbedding, self).__init__() | |||||
assert vocab.padding_idx is not None, "Your vocabulary must have padding." | |||||
self._word_vocab = vocab | |||||
self._word_pad_index = vocab.padding_idx | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for param in self.parameters()]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for param in self.parameters(): | |||||
param.requires_grad = value | |||||
@abstractmethod | |||||
def get_original_vocab(self): | |||||
pass | |||||
@property | |||||
def embed_size(self) -> int: | |||||
return self._embed_size | |||||
def get_word_vocab(self): | |||||
""" | |||||
返回embedding的词典。 | |||||
:return: Vocabulary | |||||
""" | |||||
return self._word_vocab | |||||
@property | |||||
def size(self): | def size(self): | ||||
return torch.Size([len(self._word_vocab), self._embed_size]) | |||||
class StaticEmbedding(TokenEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.modules.StaticEmbedding` :class:`fastNLP.modules.encoder.embedding.StaticEmbedding` | |||||
StaticEmbedding组件. 给定embedding的名称,根据vocab从embedding中抽取相应的数据。该Embedding之后就可以按照正常的embedding使用了。 | |||||
Example:: | |||||
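>>> # Minimal sketch with hypothetical data; assumes the `en-glove-6b-50` alias listed below is available.
>>> vocab = Vocabulary()
>>> vocab.add_word_lst("the quick brown fox".split())
>>> embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50')
>>> words = torch.LongTensor([[vocab.to_index(w) for w in ["the", "fox"]]])
>>> embed(words).size()  # expected torch.Size([1, 2, 50])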
:param vocab: Vocabulary. 根据该词表从预训练embedding中抽取相应的词向量,词表中必须包含padding。 | |||||
:param model_dir_or_name: 可以有两种方式调用预训练好的static embedding:第一种是传入embedding的文件名,第二种是传入embedding | |||||
的名称。目前支持的embedding包括{`en` 或者 `en-glove-840b-300` : glove.840B.300d, `en-glove-6b-50` : glove.6B.50d, | |||||
`en-word2vec-300` : GoogleNews-vectors-negative300}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载。 | |||||
:param requires_grad: 是否需要gradient | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', requires_grad: bool=False): | |||||
super(StaticEmbedding, self).__init__(vocab) | |||||
# 优先定义需要下载的static embedding有哪些。这里估计需要自己搞一个server, | |||||
PRETRAIN_URL = _get_base_url('static') | |||||
PRETRAIN_STATIC_FILES = { | |||||
'en': 'glove.840B.300d-cc1ad5e1.tar.gz', | |||||
'en-glove-840b-300': 'glove.840B.300d-cc1ad5e1.tar.gz', | |||||
'en-glove-6b-50': "glove.6B.50d-a6028c70.tar.gz", | |||||
'en-word2vec-300': "GoogleNews-vectors-negative300-be166d9d.tar.gz", | |||||
'en-fasttext': "cc.en.300.vec-d53187b2.gz", | |||||
'cn': "tencent_cn-dab24577.tar.gz", | |||||
'cn-fasttext': "cc.zh.300.vec-d68a9bcf.gz", | |||||
} | |||||
# 得到cache_path | |||||
if model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: | |||||
model_name = PRETRAIN_STATIC_FILES[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_path = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isfile(model_dir_or_name): | |||||
model_path = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
# 读取embedding | |||||
embedding = EmbedLoader.load_with_vocab(model_path, vocab=vocab) | |||||
embedding = torch.tensor(embedding) | |||||
self.embedding = nn.Embedding(num_embeddings=embedding.shape[0], embedding_dim=embedding.shape[1], | |||||
padding_idx=vocab.padding_idx, | |||||
max_norm=None, norm_type=2, scale_grad_by_freq=False, | |||||
sparse=False, _weight=embedding) | |||||
self._embed_size = self.embedding.weight.size(1) | |||||
self.requires_grad = requires_grad | |||||
def forward(self, words): | |||||
""" | |||||
传入words的index | |||||
:param words: torch.LongTensor, [batch_size, max_len] | |||||
:return: torch.FloatTensor, [batch_size, max_len, embed_size] | |||||
""" | |||||
return self.embedding(words) | |||||
class ContextualEmbedding(TokenEmbedding): | |||||
def __init__(self, vocab: Vocabulary): | |||||
super(ContextualEmbedding, self).__init__(vocab) | |||||
def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): | |||||
""" | |||||
由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 | |||||
Example:: | |||||
>>> | |||||
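>>> # Hypothetical sketch: pre-compute sentence representations for the DataSets that will be
>>> # used later, then drop the heavy encoder weights since no finetuning is needed.
>>> embed.add_sentence_cache(train_data, dev_data, batch_size=32, device=0, delete_weights=True)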
:param datasets: DataSet对象 | |||||
:param batch_size: int, 生成cache的sentence表示时使用的batch的大小 | |||||
:param device: 参考 :class:`fastNLP.Trainer` 的device | |||||
:param delete_weights: 是否在生成了cache之后删除权重。在不需要finetune动态模型的情况下,删除权重会大量减少内存占用。 | |||||
:return: | |||||
""" | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." | |||||
assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." | |||||
except Exception as e: | |||||
print(f"Exception happens at {index} dataset.") | |||||
raise e | |||||
sent_embeds = {} | |||||
_move_model_to_device(self, device=device) | |||||
device = _get_model_device(self) | |||||
pad_index = self._word_vocab.padding_idx | |||||
print("Start to calculate sentence representations.") | |||||
with torch.no_grad(): | |||||
for index, dataset in enumerate(datasets): | |||||
try: | |||||
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), prefetch=False) | |||||
for batch_x, batch_y in batch: | |||||
words = batch_x['words'].to(device) | |||||
words_list = words.tolist() | |||||
seq_len = words.ne(pad_index).sum(dim=-1) | |||||
max_len = words.size(1) | |||||
# 因为有些情况可能包含CLS, SEP, 从后面往前计算比较安全。 | |||||
seq_len_from_behind =(max_len - seq_len).tolist() | |||||
word_embeds = self(words).detach().cpu().numpy() | |||||
for b in range(words.size(0)): | |||||
length = seq_len_from_behind[b] | |||||
if length==0: | |||||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] | |||||
else: | |||||
sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] | |||||
except Exception as e: | |||||
print(f"Exception happens at {index} dataset.") | |||||
raise e | |||||
print("Finish calculating sentence representations.") | |||||
self.sent_embeds = sent_embeds | |||||
if delete_weights: | |||||
self._delete_model_weights() | |||||
def _get_sent_reprs(self, words): | |||||
""" | |||||
获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None | |||||
:param words: torch.LongTensor | |||||
:return: | |||||
""" | |||||
if hasattr(self, 'sent_embeds'): | |||||
words_list = words.tolist() | |||||
seq_len = words.ne(self._word_pad_index).sum(dim=-1) | |||||
_embeds = [] | |||||
for b in range(len(words)): | |||||
words_i = tuple(words_list[b][:seq_len[b]]) | |||||
embed = self.sent_embeds[words_i] | |||||
_embeds.append(embed) | |||||
max_sent_len = max(map(len, _embeds)) | |||||
embeds = words.new_zeros(len(_embeds), max_sent_len, self.embed_size, dtype=torch.float, | |||||
device=words.device) | |||||
for i, embed in enumerate(_embeds): | |||||
embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) | |||||
return embeds | |||||
return None | |||||
@abstractmethod | |||||
def _delete_model_weights(self): | |||||
"""删除计算表示的模型以节省资源""" | |||||
raise NotImplementedError | |||||
def remove_sentence_cache(self): | |||||
""" | |||||
删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 | |||||
:return: | |||||
""" | |||||
del self.sent_embeds | |||||
class ElmoEmbedding(ContextualEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.modules.ElmoEmbedding` :class:`fastNLP.modules.encoder.embedding.ElmoEmbedding` | |||||
使用ELMo的embedding。初始化之后,只需要传入words就可以得到对应的embedding。 | |||||
我们提供的ELMo预训练模型来自 https://github.com/HIT-SCIR/ELMoForManyLangs | |||||
Example:: | |||||
>>> | |||||
>>> | |||||
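>>> # Hypothetical sketch: `vocab` is a fastNLP Vocabulary and `words` a LongTensor of word indices.
>>> elmo = ElmoEmbedding(vocab, model_dir_or_name='en', layers='1,2', requires_grad=False)
>>> outputs = elmo(words)  # batch_size x max_len x (len(layers) * 2 * projection_dim)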
:param vocab: 词表 | |||||
:param model_dir_or_name: 可以有两种方式调用预训练好的ELMo embedding:第一种是传入ELMo权重的文件名,第二种是传入ELMo版本的名称, | |||||
目前支持的ELMo包括{`en` : 英文版本的ELMo, `cn` : 中文版本的ELMo,}。第二种情况将自动查看缓存中是否存在该模型,没有的话将自动下载 | |||||
:param layers: str, 指定返回的层数, 以,隔开不同的层。如果要返回第二层的结果'2', 返回后两层的结果'1,2'。不同的层的结果 | |||||
按照这个顺序concat起来。默认为'2'。 | |||||
:param requires_grad: bool, 该层是否需要gradient. 默认为False | |||||
:param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, | |||||
并删除character encoder,之后将直接使用cache的embedding。默认为False。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', | |||||
layers: str='2', requires_grad: bool=False, cache_word_reprs: bool=False): | |||||
super(ElmoEmbedding, self).__init__(vocab) | |||||
layers = list(map(int, layers.split(','))) | |||||
assert len(layers) > 0, "Must choose one output" | |||||
for layer in layers: | |||||
assert 0 <= layer <= 2, "Layer index should be in range [0, 2]." | |||||
self.layers = layers | |||||
# 根据model_dir_or_name检查是否存在并下载 | |||||
PRETRAIN_URL = _get_base_url('elmo') | |||||
# TODO 把baidu云上的加上去 | |||||
PRETRAINED_ELMO_MODEL_DIR = {'en': 'elmo_en-d39843fe.tar.gz', | |||||
'cn': 'elmo_cn-5e9b34e2.tar.gz'} | |||||
if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: | |||||
model_name = PRETRAINED_ELMO_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# 检查是否存在 | |||||
elif os.path.isdir(model_dir_or_name): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) | |||||
self.requires_grad = requires_grad | |||||
self._embed_size = len(self.layers) * self.model.config['encoder']['projection_dim'] * 2 | |||||
def forward(self, words: torch.LongTensor): | |||||
""" | |||||
计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 | |||||
embedding被重复了一次,使得实际上layer=0的结果是[token_embedding;token_embedding], 而layer=1的结果是[forward_hiddens; | |||||
backward_hiddens]. | |||||
:param words: batch_size x max_len | |||||
:return: torch.FloatTensor. batch_size x max_len x (512*len(self.layers)) | |||||
""" | |||||
outputs = self._get_sent_reprs(words) | |||||
if outputs is not None: | |||||
return outputs | |||||
outputs = self.model(words) | |||||
if len(self.layers) == 1: | |||||
outputs = outputs[self.layers[0]] | |||||
else: | |||||
outputs = torch.cat([*outputs[self.layers]], dim=-1) | |||||
return outputs | |||||
def _delete_model_weights(self): | |||||
del self.layers, self.model | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Embedding的参数是否允许优化。True: 所有参数允许优化; False: 所有参数不允许优化; None: 部分允许优化、部分不允许 | |||||
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||||
if 'words_to_chars_embedding' not in name]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name: # 这个不能加入到requires_grad中 | |||||
continue | |||||
param.requires_grad = value | |||||
class BertEmbedding(ContextualEmbedding): | |||||
""" | |||||
别名::class:`fastNLP.modules.BertEmbedding` :class:`fastNLP.modules.encoder.embedding.BertEmbedding` | |||||
使用BERT对words进行encode的Embedding。 | |||||
Example:: | |||||
>>> | |||||
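>>> # Hypothetical sketch: `vocab` is a fastNLP Vocabulary and `words` a LongTensor of word indices.
>>> bert = BertEmbedding(vocab, model_dir_or_name='en-base-uncased', layers='-1,-2', pool_method='first')
>>> outputs = bert(words)  # batch_size x max_len x (768 * 2) for the two selected layers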
:param fastNLP.Vocabulary vocab: 词表 | |||||
:param str model_dir_or_name: 模型所在目录或者模型的名称。默认值为``en-base-uncased`` | |||||
:param str layers:最终结果中的表示。以','隔开层数,可以以负数去索引倒数几层 | |||||
:param str pool_method: 因为在bert中,每个word会被表示为多个word pieces, 当获取一个word的表示的时候,怎样从它的word pieces | |||||
中计算得到他对应的表示。支持``last``, ``first``, ``avg``, ``max``. | |||||
:param bool include_cls_sep: bool,在bert计算句子的表示的时候,需要在前面加上[CLS]和[SEP], 是否在结果中保留这两个内容。 这样 | |||||
会使得word embedding的结果比输入的结果长两个token。在使用 :class:`StackEmbedding` 时可能会遇到问题。 | |||||
:param bool requires_grad: 是否需要gradient。 | |||||
""" | |||||
def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', | |||||
pool_method: str='first', include_cls_sep: bool=False, requires_grad: bool=False): | |||||
super(BertEmbedding, self).__init__(vocab) | |||||
# 根据model_dir_or_name检查是否存在并下载 | |||||
PRETRAIN_URL = _get_base_url('bert') | |||||
PRETRAINED_BERT_MODEL_DIR = {'en': 'bert-base-cased-f89bfe08.zip', | |||||
'en-base-uncased': 'bert-base-uncased-3413b23c.zip', | |||||
'en-base-cased': 'bert-base-cased-f89bfe08.zip', | |||||
'en-large-uncased': 'bert-large-uncased-20939f45.zip', | |||||
'en-large-cased': 'bert-large-cased-e0cf90fc.zip', | |||||
'cn': 'bert-base-chinese-29d0a84a.zip', | |||||
'cn-base': 'bert-base-chinese-29d0a84a.zip', | |||||
'multilingual': 'bert-base-multilingual-cased-1bd364ee.zip', | |||||
'multilingual-base-uncased': 'bert-base-multilingual-uncased-f8730fe4.zip', | |||||
'multilingual-base-cased': 'bert-base-multilingual-cased-1bd364ee.zip', | |||||
} | |||||
if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: | |||||
model_name = PRETRAINED_BERT_MODEL_DIR[model_dir_or_name] | |||||
model_url = PRETRAIN_URL + model_name | |||||
model_dir = cached_path(model_url) | |||||
# check whether the directory exists
elif os.path.isdir(model_dir_or_name): | |||||
model_dir = model_dir_or_name | |||||
else: | |||||
raise ValueError(f"Cannot recognize {model_dir_or_name}.") | |||||
self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, | |||||
pool_method=pool_method, include_cls_sep=include_cls_sep) | |||||
self.requires_grad = requires_grad | |||||
self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size | |||||
def _delete_model_weights(self): | |||||
del self.model | |||||
def forward(self, words): | |||||
""" | |||||
Compute the BERT embedding of ``words``. Before computation, [CLS] is added at the start and [SEP] at the end of each
sentence; ``include_cls_sep`` decides whether the representations of these two tokens are kept or removed.
:param words: batch_size x max_len | |||||
:return: torch.FloatTensor. batch_size x max_len x (768*len(self.layers)) | |||||
""" | |||||
outputs = self._get_sent_reprs(words) | |||||
if outputs is not None: | |||||
return outputs | |||||
outputs = self.model(words) | |||||
outputs = torch.cat([*outputs], dim=-1) | |||||
return outputs | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Whether the parameters of this Embedding may be optimized. True: all parameters may be optimized; False: no parameter may be optimized; None: some may be optimized and some may not.
:return: | |||||
""" | |||||
requires_grads = set([param.requires_grad for name, param in self.named_parameters() | |||||
if 'word_pieces_lengths' not in name]) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'word_pieces_lengths' in name:  # this cached length tensor must never be optimized
    continue
param.requires_grad = value
def _construct_char_vocab_from_vocab(vocab:Vocabulary, min_freq:int=1): | |||||
""" | |||||
Build a character vocabulary from a word vocabulary.
:param vocab: the word :class:`Vocabulary` to extract characters from
:param min_freq: characters occurring fewer than min_freq times are mapped to the unknown character
:return: the character vocabulary
""" | |||||
char_vocab = Vocabulary(min_freq=min_freq) | |||||
for word, index in vocab: | |||||
char_vocab.add_word_lst(list(word)) | |||||
return char_vocab | |||||
class CNNCharEmbedding(TokenEmbedding): | |||||
""" | |||||
Alias: :class:`fastNLP.modules.CNNCharEmbedding` :class:`fastNLP.modules.encoder.embedding.CNNCharEmbedding`
Generates character-level word embeddings with a CNN. The computation is CNN(x) -> activation(x) -> pool -> fc; the
outputs of filters with different kernel sizes are concatenated.
Example:: | |||||
>>> | |||||
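>>> # A possible usage sketch (not part of the original docs); `vocab` is an assumed
>>> # word-level Vocabulary.
>>> import torch
>>> embed = CNNCharEmbedding(vocab, embed_size=50, char_emb_size=50,
...                          filter_nums=[40, 30, 20], kernel_sizes=[5, 3, 1])
>>> words = torch.LongTensor([[2, 3, 4]])   # batch_size x max_len word indices
>>> reprs = embed(words)                    # batch_size x max_len x embed_size (50)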
:param vocab: the word vocabulary
:param embed_size: size of the produced word embedding. Default: 50.
:param char_emb_size: size of the character embedding; characters are derived from vocab. Default: 50.
:param filter_nums: number of filters per kernel size. Must have the same length as kernel_sizes. Default: [40, 30, 20].
:param kernel_sizes: sizes of the convolution kernels. Default: [5, 3, 1].
:param pool_method: pooling method used to merge the character representations into one word representation; supports 'avg', 'max'.
:param activation: activation applied after the CNN; supports 'relu', 'sigmoid', 'tanh', or a custom callable.
:param min_char_freq: minimum number of occurrences for a character. Default: 2.
""" | |||||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, | |||||
filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), pool_method: str='max', | |||||
activation='relu', min_char_freq: int=2): | |||||
super(CNNCharEmbedding, self).__init__(vocab) | |||||
for kernel in kernel_sizes: | |||||
assert kernel % 2 == 1, "Only odd kernel is allowed." | |||||
assert pool_method in ('max', 'avg') | |||||
self.pool_method = pool_method | |||||
# activation function | |||||
if isinstance(activation, str): | |||||
if activation.lower() == 'relu': | |||||
self.activation = F.relu | |||||
elif activation.lower() == 'sigmoid': | |||||
self.activation = F.sigmoid | |||||
elif activation.lower() == 'tanh': | |||||
self.activation = F.tanh | |||||
elif activation is None: | |||||
self.activation = lambda x: x | |||||
elif callable(activation): | |||||
self.activation = activation | |||||
else: | |||||
raise Exception( | |||||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||||
print("Start constructing character vocabulary.") | |||||
# build the character vocabulary
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||||
self.char_pad_index = self.char_vocab.padding_idx | |||||
print(f"In total, there are {len(self.char_vocab)} distinct characters.") | |||||
# index every word in vocab at the character level
self.max_word_len = max(map(lambda x: len(x[0]), vocab)) | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), | |||||
fill_value=self.char_pad_index, dtype=torch.long), | |||||
requires_grad=False) | |||||
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) | |||||
for word, index in vocab: | |||||
# if index != vocab.padding_idx:  # padding used to be skipped (left as pad_value); now pad is not special-cased, so every <pad> shares the same embedding
self.words_to_chars_embedding[index, :len(word)] = \ | |||||
torch.LongTensor([self.char_vocab.to_index(c) for c in word]) | |||||
self.word_lengths[index] = len(word) | |||||
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) | |||||
self.convs = nn.ModuleList([nn.Conv1d( | |||||
char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) | |||||
for i in range(len(kernel_sizes))]) | |||||
self._embed_size = embed_size | |||||
self.fc = nn.Linear(sum(filter_nums), embed_size) | |||||
def forward(self, words): | |||||
""" | """ | ||||
Embedding的大小 | |||||
:return: torch.Size() | |||||
输入words的index后,生成对应的words的表示。 | |||||
:param words: [batch_size, max_len] | |||||
:return: [batch_size, max_len, embed_size] | |||||
""" | """ | ||||
return self.weight.size() | |||||
batch_size, max_len = words.size() | |||||
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len | |||||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||||
max_word_len = word_lengths.max() | |||||
chars = chars[:, :, :max_word_len] | |||||
# positions where the mask is 1 correspond to padding characters
chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len; True at padding positions
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size | |||||
reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) | |||||
reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M | |||||
conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) | |||||
for conv in self.convs] | |||||
conv_chars = torch.cat(conv_chars, dim=-1).contiguous() # B x max_len x max_word_len x sum(filters) | |||||
conv_chars = self.activation(conv_chars) | |||||
if self.pool_method == 'max': | |||||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||||
chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) | |||||
else: | |||||
conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||||
chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||||
chars = self.fc(chars) | |||||
return chars | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Whether the parameters of this Embedding may be optimized. True: all parameters may be optimized; False: no parameter may be optimized; None: some may be optimized and some may not.
:return: | |||||
""" | |||||
params = [] | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: | |||||
params.append(param.requires_grad) | |||||
requires_grads = set(params) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # these cached index tensors must never be optimized
    continue
param.requires_grad = value
class LSTMCharEmbedding(TokenEmbedding): | |||||
""" | |||||
Alias: :class:`fastNLP.modules.LSTMCharEmbedding` :class:`fastNLP.modules.encoder.embedding.LSTMCharEmbedding`
Encodes characters with an LSTM.
Example:: | |||||
>>> | |||||
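>>> # A possible usage sketch (not part of the original docs); `vocab` is an assumed
>>> # word-level Vocabulary.
>>> import torch
>>> embed = LSTMCharEmbedding(vocab, embed_size=50, char_emb_size=50, hidden_size=50,
...                           pool_method='max', bidirectional=True)
>>> words = torch.LongTensor([[2, 3, 4]])   # batch_size x max_len word indices
>>> reprs = embed(words)                    # batch_size x max_len x embed_size (50)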
:param vocab: the word vocabulary
:param embed_size: size of the produced word embedding. Default: 50.
:param char_emb_size: size of the character embedding. Default: 50.
:param hidden_size: size of the LSTM hidden state; if bidirectional, each direction uses half of it. Default: 50.
:param pool_method: supports 'max', 'avg'
:param activation: activation function; supports 'relu', 'sigmoid', 'tanh', or a custom callable.
:param min_char_freq: minimum number of occurrences for a character. Default: 2.
:param bidirectional: whether to use a bidirectional LSTM. Default: True.
""" | |||||
def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, hidden_size=50, | |||||
pool_method: str='max', activation='relu', min_char_freq: int=2, bidirectional=True): | |||||
super(LSTMCharEmbedding, self).__init__(vocab) | |||||
assert hidden_size % 2 == 0, "hidden_size must be even."
assert pool_method in ('max', 'avg') | |||||
self.pool_method = pool_method | |||||
# activation function | |||||
if isinstance(activation, str): | |||||
if activation.lower() == 'relu': | |||||
self.activation = F.relu | |||||
elif activation.lower() == 'sigmoid': | |||||
self.activation = F.sigmoid | |||||
elif activation.lower() == 'tanh': | |||||
self.activation = F.tanh | |||||
elif activation is None: | |||||
self.activation = lambda x: x | |||||
elif callable(activation): | |||||
self.activation = activation | |||||
else: | |||||
raise Exception( | |||||
"Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") | |||||
print("Start constructing character vocabulary.") | |||||
# build the character vocabulary
self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) | |||||
self.char_pad_index = self.char_vocab.padding_idx | |||||
print(f"In total, there are {len(self.char_vocab)} distinct characters.") | |||||
# index every word in vocab at the character level
self.max_word_len = max(map(lambda x: len(x[0]), vocab)) | |||||
self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), | |||||
fill_value=self.char_pad_index, dtype=torch.long), | |||||
requires_grad=False) | |||||
self.word_lengths = nn.Parameter(torch.zeros(len(vocab)).long(), requires_grad=False) | |||||
for word, index in vocab: | |||||
# if index != vocab.padding_idx:  # padding used to be skipped (left as pad_value); now pad is not special-cased
self.words_to_chars_embedding[index, :len(word)] = \ | |||||
torch.LongTensor([self.char_vocab.to_index(c) for c in word]) | |||||
self.word_lengths[index] = len(word) | |||||
self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) | |||||
self.fc = nn.Linear(hidden_size, embed_size) | |||||
hidden_size = hidden_size // 2 if bidirectional else hidden_size | |||||
self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) | |||||
self._embed_size = embed_size | |||||
self.bidirectional = bidirectional | |||||
def forward(self, words): | |||||
""" | |||||
Given word indices, produce the corresponding word representations.
:param words: [batch_size, max_len] | |||||
:return: [batch_size, max_len, embed_size] | |||||
""" | |||||
batch_size, max_len = words.size() | |||||
chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len | |||||
word_lengths = self.word_lengths[words] # batch_size x max_len | |||||
max_word_len = word_lengths.max() | |||||
chars = chars[:, :, :max_word_len] | |||||
# positions where the mask is 1 correspond to padding characters
chars_masks = chars.eq(self.char_pad_index)  # batch_size x max_len x max_word_len; True at padding positions
chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size | |||||
reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) | |||||
char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) | |||||
lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) | |||||
# B x M x M x H | |||||
lstm_chars = self.activation(lstm_chars) | |||||
if self.pool_method == 'max': | |||||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) | |||||
chars, _ = torch.max(lstm_chars, dim=-2) # batch_size x max_len x H | |||||
else: | |||||
lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) | |||||
chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() | |||||
chars = self.fc(chars) | |||||
return chars | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Whether the parameters of this Embedding may be optimized. True: all parameters may be optimized; False: no parameter may be optimized; None: some may be optimized and some may not.
:return: | |||||
""" | |||||
params = [] | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' not in name and 'word_lengths' not in name: | |||||
params.append(param.requires_grad)
requires_grads = set(params) | |||||
if len(requires_grads) == 1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for name, param in self.named_parameters(): | |||||
if 'words_to_chars_embedding' in name or 'word_lengths' in name:  # these cached index tensors must never be optimized
    continue
param.requires_grad = value
class StackEmbedding(TokenEmbedding): | |||||
""" | |||||
Alias: :class:`fastNLP.modules.StackEmbedding` :class:`fastNLP.modules.encoder.embedding.StackEmbedding`
Combines several embeddings into a single embedding.
Example:: | |||||
>>> | |||||
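>>> # A possible usage sketch (not part of the original docs): stack two of the
>>> # character-level embeddings defined in this module over the same assumed `vocab`.
>>> import torch
>>> cnn_char = CNNCharEmbedding(vocab, embed_size=50)
>>> lstm_char = LSTMCharEmbedding(vocab, embed_size=30)
>>> embed = StackEmbedding([cnn_char, lstm_char])
>>> words = torch.LongTensor([[2, 3, 4]])
>>> reprs = embed(words)                    # batch_size x max_len x (50 + 30)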
:param embeds: a list of TokenEmbedding objects; every TokenEmbedding must use the same word vocabulary.
""" | |||||
def __init__(self, embeds: List[TokenEmbedding]): | |||||
vocabs = [] | |||||
for embed in embeds: | |||||
vocabs.append(embed.get_word_vocab()) | |||||
_vocab = vocabs[0] | |||||
for vocab in vocabs[1:]: | |||||
assert vocab == _vocab, "All embeddings should use the same word vocabulary." | |||||
super(StackEmbedding, self).__init__(_vocab) | |||||
assert isinstance(embeds, list) | |||||
for embed in embeds: | |||||
assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." | |||||
self.embeds = nn.ModuleList(embeds) | |||||
self._embed_size = sum([embed.embed_size for embed in self.embeds]) | |||||
def append(self, embed: TokenEmbedding): | |||||
""" | |||||
Append an embedding at the end.
:param embed: | |||||
:return: | |||||
""" | |||||
assert isinstance(embed, TokenEmbedding) | |||||
self.embeds.append(embed) | |||||
def pop(self): | |||||
""" | |||||
Pop and return the last embedding.
:return: | |||||
""" | |||||
return self.embeds.pop() | |||||
@property | |||||
def embed_size(self): | |||||
return self._embed_size | |||||
@property | |||||
def requires_grad(self): | |||||
""" | |||||
Whether the parameters of this Embedding may be optimized. True: all parameters may be optimized; False: no parameter may be optimized; None: some may be optimized and some may not.
:return: | |||||
""" | |||||
requires_grads = set([embed.requires_grad for embed in self.embeds])  # self.embeds is a ModuleList, not callable
if len(requires_grads)==1: | |||||
return requires_grads.pop() | |||||
else: | |||||
return None | |||||
@requires_grad.setter | |||||
def requires_grad(self, value): | |||||
for embed in self.embeds:
embed.requires_grad = value | |||||
def forward(self, words): | |||||
""" | |||||
Run every stacked embedding and concatenate their outputs in order.
:param words: batch_size x max_len | |||||
:return: the output shape depends on the embeddings that make up this stack.
""" | |||||
outputs = [] | |||||
for embed in self.embeds: | |||||
outputs.append(embed(words)) | |||||
return torch.cat(outputs, dim=-1) | |||||
@@ -20,7 +20,7 @@ class LSTM(nn.Module): | |||||
LSTM module, a lightweight wrapper around PyTorch's LSTM
:param input_size: feature dimension of the input `x`
:param hidden_size: feature dimension of the hidden state `h`
:param hidden_size: feature dimension of the hidden state `h`.
:param num_layers: number of RNN layers. Default: 1
:param dropout: dropout probability between layers. Default: 0
:param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False``
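# Usage sketch (illustrative, mirroring how LSTMCharEmbedding constructs and calls this wrapper):
#     lstm = LSTM(input_size=50, hidden_size=25, bidirectional=True, batch_first=True)
#     outputs = lstm(x, seq_len)[0]   # x: batch x max_len x input_size, seq_len: lengths per sequence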
@@ -82,7 +82,7 @@ def get_embeddings(init_embed): | |||||
if isinstance(init_embed, tuple): | if isinstance(init_embed, tuple): | ||||
res = nn.Embedding( | res = nn.Embedding( | ||||
num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | num_embeddings=init_embed[0], embedding_dim=init_embed[1]) | ||||
elif isinstance(init_embed, nn.Embedding): | |||||
elif isinstance(init_embed, nn.Module): | |||||
res = init_embed | res = init_embed | ||||
elif isinstance(init_embed, torch.Tensor): | elif isinstance(init_embed, torch.Tensor): | ||||
res = nn.Embedding.from_pretrained(init_embed, freeze=False) | res = nn.Embedding.from_pretrained(init_embed, freeze=False) | ||||
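# Usage sketch (illustrative; `vocab` and `pretrained_tensor` are hypothetical):
#     embed = get_embeddings((len(vocab), 50))         # tuple -> a freshly initialized nn.Embedding
#     embed = get_embeddings(pretrained_tensor)        # Tensor -> nn.Embedding.from_pretrained(..., freeze=False)
#     embed = get_embeddings(CNNCharEmbedding(vocab))  # any nn.Module is returned unchanged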
@@ -130,3 +130,17 @@ def summary(model: nn.Module): | |||||
strings = [bar] + strings + [bar] | strings = [bar] + strings + [bar] | ||||
print('\n'.join(strings)) | print('\n'.join(strings)) | ||||
return total, total_train, total_nontrain | return total, total_train, total_nontrain | ||||
def get_dropout_mask(drop_p: float, tensor: torch.Tensor): | |||||
""" | |||||
Generate a dropout mask with the same shape as ``tensor``.
:param drop_p: float, probability of setting an element to 0.
:param tensor: torch.Tensor
:return: torch.FloatTensor with the same shape as ``tensor``.
""" | |||||
mask_x = torch.ones_like(tensor) | |||||
nn.functional.dropout(mask_x, p=drop_p,
                      training=True, inplace=True)  # training must be True; with training=False dropout is a no-op and the mask stays all ones
return mask_x |
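# Usage sketch (illustrative): build the mask once and reuse it across time steps, e.g. for
# variational dropout inside an RNN.
#     h = torch.randn(4, 100)            # hypothetical hidden state
#     mask = get_dropout_mask(0.5, h)    # zeros with prob 0.5; kept entries scaled by 1/(1-0.5)
#     h = h * mask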
@@ -1,11 +1,10 @@ | |||||
from torch import nn | from torch import nn | ||||
import torch | import torch | ||||
from fastNLP.modules import Embedding | |||||
import numpy as np | import numpy as np | ||||
class SemiCRFShiftRelay(nn.Module): | class SemiCRFShiftRelay(nn.Module): | ||||
""" | """ | ||||
This module is a decoder, but it currently does not support decoding with tags.
""" | """ | ||||
def __init__(self, L): | def __init__(self, L): | ||||
@@ -32,11 +32,11 @@ lr = 0.02 | |||||
#########hyper | #########hyper | ||||
device = 0 | device = 0 | ||||
# !!!! Never put absolute paths here, because that would expose your username on the server, which is risky. Always use relative paths; preferably put the data under
# your reproduction directory and add it to .gitignore
file_dir = '/path/to/pku' | |||||
char_embed_path = '/path/to/1grams_t3_m50_corpus.txt' | |||||
bigram_embed_path = 'path/to/2grams_t3_m50_corpus.txt' | |||||
file_dir = '/path/to/' | |||||
char_embed_path = '/pretrain/vectors/1grams_t3_m50_corpus.txt' | |||||
bigram_embed_path = '/pretrain/vectors/2grams_t3_m50_corpus.txt' | |||||
bigram_vocab_opt = VocabularyOption(min_freq=3) | bigram_vocab_opt = VocabularyOption(min_freq=3) | ||||
char_embed_opt = EmbeddingOption(embed_filepath=char_embed_path) | char_embed_opt = EmbeddingOption(embed_filepath=char_embed_path) | ||||
bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path) | bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path) | ||||
@@ -44,7 +44,7 @@ bigram_embed_opt = EmbeddingOption(embed_filepath=bigram_embed_path) | |||||
data_name = os.path.basename(file_dir) | data_name = os.path.basename(file_dir) | ||||
cache_fp = 'caches/{}.pkl'.format(data_name) | cache_fp = 'caches/{}.pkl'.format(data_name) | ||||
data = prepare_data(_cache_fp=cache_fp, _refresh=False) | |||||
data = prepare_data(_cache_fp=cache_fp, _refresh=True) | |||||
model = ShiftRelayCWSModel(char_embed=data.embeddings['chars'], bigram_embed=data.embeddings['bigrams'], | model = ShiftRelayCWSModel(char_embed=data.embeddings['chars'], bigram_embed=data.embeddings['bigrams'], | ||||
hidden_size=hidden_size, num_layers=num_layers, | hidden_size=hidden_size, num_layers=num_layers, | ||||
@@ -1,4 +1,5 @@ | |||||
numpy | numpy | ||||
torch>=0.4.0 | torch>=0.4.0 | ||||
tqdm | tqdm | ||||
nltk | |||||
nltk | |||||
requests |
@@ -100,13 +100,14 @@ class TestIndexing(unittest.TestCase): | |||||
self.assertEqual(text, [vocab.to_word(idx) for idx in [vocab[w] for w in text]]) | self.assertEqual(text, [vocab.to_word(idx) for idx in [vocab[w] for w in text]]) | ||||
def test_iteration(self): | def test_iteration(self): | ||||
vocab = Vocabulary() | |||||
vocab = Vocabulary(padding=None, unknown=None) | |||||
text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well", "in", | text = ["FastNLP", "works", "well", "in", "most", "cases", "and", "scales", "well", "in", | ||||
"works", "well", "in", "most", "cases", "scales", "well"] | "works", "well", "in", "most", "cases", "scales", "well"] | ||||
vocab.update(text) | vocab.update(text) | ||||
text = set(text) | text = set(text) | ||||
for word in vocab: | |||||
for word, idx in vocab: | |||||
self.assertTrue(word in text) | self.assertTrue(word in text) | ||||
self.assertTrue(idx < len(vocab)) | |||||
class TestOther(unittest.TestCase): | class TestOther(unittest.TestCase): | ||||
@@ -12,7 +12,6 @@ class TestCNNText(unittest.TestCase): | |||||
model = CNNText(init_emb, | model = CNNText(init_emb, | ||||
NUM_CLS, | NUM_CLS, | ||||
kernel_nums=(1, 3, 5), | kernel_nums=(1, 3, 5), | ||||
kernel_sizes=(2, 2, 2), | |||||
padding=0, | |||||
kernel_sizes=(1, 3, 5), | |||||
dropout=0.5) | dropout=0.5) | ||||
RUNNER.run_model_with_task(TEXT_CLS, model) | RUNNER.run_model_with_task(TEXT_CLS, model) |
@@ -70,7 +70,7 @@ class TestTutorial(unittest.TestCase): | |||||
break | break | ||||
from fastNLP.models import CNNText | from fastNLP.models import CNNText | ||||
model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1) | |||||
model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1) | |||||
from fastNLP import Trainer | from fastNLP import Trainer | ||||
from copy import deepcopy | from copy import deepcopy | ||||
@@ -143,7 +143,7 @@ class TestTutorial(unittest.TestCase): | |||||
is_input=True) | is_input=True) | ||||
from fastNLP.models import CNNText | from fastNLP.models import CNNText | ||||
model = CNNText((len(vocab), 50), num_classes=5, padding=2, dropout=0.1) | |||||
model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1) | |||||
from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam | from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric, Adam | ||||