@@ -4,48 +4,48 @@ fastNLP.io package
 Submodules
 ----------

-fastNLP.io.base\_loader module
-------------------------------
+fastNLP.io.base_loader module
+-----------------------------

 .. automodule:: fastNLP.io.base_loader
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.io.config\_io module
-----------------------------
+fastNLP.io.config_io module
+---------------------------

 .. automodule:: fastNLP.io.config_io
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.io.dataset\_loader module
---------------------------------ize
+fastNLP.io.dataset_loader module
+--------------------------------

 .. automodule:: fastNLP.io.dataset_loader
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.io.embed\_loader module
-------------------------------­
+fastNLP.io.embed_loader module
+------------------------------

 .. automodule:: fastNLP.io.embed_loader
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.io.file\_reader module
------------------------------­
+fastNLP.io.file_reader module
+-----------------------------

 .. automodule:: fastNLP.io.file_reader
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.io.model\_io module
----------------------------
+fastNLP.io.model_io module
+--------------------------

 .. automodule:: fastNLP.io.model_io
     :members:
@@ -4,8 +4,8 @@ fastNLP.models package
 Submodules
 ----------

-fastNLP.models.base\_model module
----------------------------------
+fastNLP.models.base_model module
+--------------------------------

 .. automodule:: fastNLP.models.base_model
     :members:
@@ -20,64 +20,64 @@ fastNLP.models.bert module
     :undoc-members:
     :show-inheritance:

-fastNLP.models.biaffine\_parser module
---------------------------------------
+fastNLP.models.biaffine_parser module
+-------------------------------------

 .. automodule:: fastNLP.models.biaffine_parser
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.char\_language\_model module
--------------------------------------------
+fastNLP.models.char_language_model module
+-----------------------------------------

 .. automodule:: fastNLP.models.char_language_model
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.cnn\_text\_classification module
------------------------------------------------ 
+fastNLP.models.cnn_text_classification module
+---------------------------------------------

 .. automodule:: fastNLP.models.cnn_text_classification
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.enas\_controller module
--------------------------------------- 
+fastNLP.models.enas_controller module
+-------------------------------------

 .. automodule:: fastNLP.models.enas_controller
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.enas\_model module
---------------------------------- 
+fastNLP.models.enas_model module
+--------------------------------

 .. automodule:: fastNLP.models.enas_model
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.enas\_trainer module
----------------------------------- -
+fastNLP.models.enas_trainer module
+----------------------------------

 .. automodule:: fastNLP.models.enas_trainer
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.enas\_utils module
---------------------------------- 
+fastNLP.models.enas_utils module
+--------------------------------

 .. automodule:: fastNLP.models.enas_utils
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.models.sequence\_modeling module
---------------------------------------- -
+fastNLP.models.sequence_modeling module
+---------------------------------------

 .. automodule:: fastNLP.models.sequence_modeling
     :members:
@@ -92,8 +92,8 @@ fastNLP.models.snli module
     :undoc-members:
     :show-inheritance:

-fastNLP.models.star\_transformer module
---------------------------------------- 
+fastNLP.models.star_transformer module
+--------------------------------------

 .. automodule:: fastNLP.models.star_transformer
     :members:
@@ -12,32 +12,32 @@ fastNLP.modules.aggregator.attention module
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.aggregator.avg\_pool module
------------------------------------------- -
+fastNLP.modules.aggregator.avg_pool module
+------------------------------------------

 .. automodule:: fastNLP.modules.aggregator.avg_pool
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.aggregator.kmax\_pool module
-------------------------------------------- -
+fastNLP.modules.aggregator.kmax_pool module
+-------------------------------------------

 .. automodule:: fastNLP.modules.aggregator.kmax_pool
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.aggregator.max\_pool module
------------------------------------------- -
+fastNLP.modules.aggregator.max_pool module
+------------------------------------------

 .. automodule:: fastNLP.modules.aggregator.max_pool
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.aggregator.self\_attention module
------------------------------------------------- -
+fastNLP.modules.aggregator.self_attention module
+------------------------------------------------

 .. automodule:: fastNLP.modules.aggregator.self_attention
     :members:
@@ -4,8 +4,8 @@ fastNLP.modules.encoder package
 Submodules
 ----------

-fastNLP.modules.encoder.char\_embedding module
---------------------------------------------- -
+fastNLP.modules.encoder.char_embedding module
+---------------------------------------------

 .. automodule:: fastNLP.modules.encoder.char_embedding
     :members:
@@ -20,8 +20,8 @@ fastNLP.modules.encoder.conv module
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.encoder.conv\_maxpool module
------------------------------------------- --
+fastNLP.modules.encoder.conv_maxpool module
+-------------------------------------------

 .. automodule:: fastNLP.modules.encoder.conv_maxpool
     :members:
@@ -52,16 +52,16 @@ fastNLP.modules.encoder.lstm module
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.encoder.masked\_rnn module
------------------------------------------ -
+fastNLP.modules.encoder.masked_rnn module
+-----------------------------------------

 .. automodule:: fastNLP.modules.encoder.masked_rnn
     :members:
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.encoder.star\_transformer module
----------------------------------------------- --
+fastNLP.modules.encoder.star_transformer module
+-----------------------------------------------

 .. automodule:: fastNLP.modules.encoder.star_transformer
     :members:
@@ -76,8 +76,8 @@ fastNLP.modules.encoder.transformer module
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.encoder.variational\_rnn module
----------------------------------------------- -
+fastNLP.modules.encoder.variational_rnn module
+----------------------------------------------

 .. automodule:: fastNLP.modules.encoder.variational_rnn
     :members:
@@ -21,8 +21,8 @@ fastNLP.modules.dropout module
     :undoc-members:
     :show-inheritance:

-fastNLP.modules.other\_modules module
-------------------------------------ated
+fastNLP.modules.other_modules module
+------------------------------------

 .. automodule:: fastNLP.modules.other_modules
     :members:
@@ -1,14 +1,114 @@
 """
-Introduction to fastNLP.core.DataSet
-
-DataSet is the container fastNLP uses to hold data. A DataSet can be seen as a table in which every row is a sample (called an Instance in fastNLP) and every column is a feature (called a field in fastNLP).
-
-csv-table::
-    :header: "Field1", "Field2", "Field3"
-    :widths:20, 10, 10
-
-"This is the first instance", ['This', 'is', 'the', 'first', 'instance'], 5
-"Second instance", ['Second', 'instance'], 2
+DataSet is the container fastNLP uses to hold data. A DataSet can be seen as a table in which every row is an instance (or sample) and every column is a feature.
+
+.. _DataSet:
+
+.. csv-table:: Following is a demo layout of DataSet
+   :header: "sentence", "words", "seq_len"
+
+   "This is the first instance .", "[This, is, the, first, instance, .]", 6
+   "Second instance .", "[Second, instance, .]", 3
+   "Third instance .", "[Third, instance, .]", 3
+   "...", "[...]", "..."
+
+Internally, every row is an Instance_ object and every column is a FieldArray_ object.
+
+1. Creating a DataSet
+
+    There are three main ways to create a DataSet:
+
+    1. From a dict
+
+        Example::
+
+            from fastNLP import DataSet
+            data = {'sentence': ["This is the first instance .", "Second instance .", "Third instance ."],
+                    'words': [['this', 'is', 'the', 'first', 'instance', '.'], ['Second', 'instance', '.'], ['Third', 'instance', '.']],
+                    'seq_len': [6, 3, 3]}
+            dataset = DataSet(data)
+            # every value in the dict must be a list, and all of the lists must have the same length
+
+    2. By building Instance objects
+
+        Example::
+
+            from fastNLP import DataSet
+            from fastNLP import Instance
+            dataset = DataSet()
+            instance = Instance(sentence="This is the first instance",
+                                words=['this', 'is', 'the', 'first', 'instance', '.'],
+                                seq_len=6)
+            dataset.append(instance)
+            # more instances may be appended, but every appended instance must have exactly the same fields as the first one
+
+    3. From a list of Instance objects
+
+        Example::
+
+            from fastNLP import DataSet
+            from fastNLP import Instance
+            instances = []
+            instances.append(Instance(sentence="This is the first instance",
+                                      words=['this', 'is', 'the', 'first', 'instance', '.'],
+                                      seq_len=6))
+            instances.append(Instance(sentence="Second instance .",
+                                      words=['Second', 'instance', '.'],
+                                      seq_len=3))
+            dataset = DataSet(instances)
+
+2. Basic usage of a DataSet
+
+    1. Reading content from a text file  # TODO reference DataLoader here
+
+        Example::
+
+            from fastNLP import DataSet
+            from fastNLP import Instance
+            dataset = DataSet()
+            filepath = 'some/text/file'
+            # suppose every line of the file holds a tab-separated (sentence, label) pair:
+            # This is a fantastic day	positive
+            # The bad weather	negative
+            # .....
+            with open(filepath, 'r') as f:
+                for line in f:
+                    sent, label = line.strip().split('\t')
+                    dataset.append(Instance(sentence=sent, label=label))
+
+    2. Indexing; the result is a shallow copy of the DataSet's content
+
+        Example::
+
+            import numpy as np
+            from fastNLP import DataSet
+            dataset = DataSet({'a': np.arange(10), 'b': [[_] for _ in range(10)]})
+            dataset[0]  # a single index returns one instance
+            >>{'a': 0 type=int, 'b': [0] type=list}
+            dataset[1:3]  # a slice returns a new DataSet
+            >>DataSet({'a': 1 type=int, 'b': [1] type=list}, {'a': 2 type=int, 'b': [2] type=list})
+
+    3. Processing the contents of a DataSet
+
+        Example::
+
+            from fastNLP import DataSet
+            data = {'sentence': ["This is the first instance .", "Second instance .", "Third instance ."]}
+            dataset = DataSet(data)
+            # split each sentence into words; see DataSet.apply() for details
+            dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words')
+            # or use DataSet.apply_field()
+            dataset.apply_field(lambda sent: sent.split(), field_name='sentence', new_field_name='words')
+
+    4. Removing content from a DataSet
+
+        Example::
+
+            from fastNLP import DataSet
+            dataset = DataSet({'a': list(range(-5, 5))})
+            # with inplace=False, return a new DataSet from which the matching instances have been dropped
+            dropped_dataset = dataset.drop(lambda ins: ins['a'] < 0, inplace=False)
+            # delete the matching instances from dataset itself
+            dataset.drop(lambda ins: ins['a'] < 0)  # the number of instances in dataset decreases
 """
@@ -22,7 +122,6 @@ from fastNLP.core.fieldarray import FieldArray
 from fastNLP.core.instance import Instance
 from fastNLP.core.utils import get_func_signature

-
 class DataSet(object):
     """DataSet is the collection of examples.
     DataSet provides instance-level interface. You can append and access an instance of the DataSet.
@@ -87,10 +186,7 @@ class DataSet(object):
         return inner_iter_func()

     def __getitem__(self, idx):
-        """Fetch Instance(s) at the `idx` position(s) in the dataset.
-        Notice: This method returns a copy of the actual instance(s). Any change to the returned value would not modify
-        the origin instance(s) of the DataSet.
-        If you want to make in-place changes to all Instances, use `apply` method.
+        """Given an int index, return an Instance; given a slice, return a new DataSet holding the sliced content.

         :param idx: can be int or slice.
         :return: If `idx` is int, return an Instance object.
@@ -145,33 +241,48 @@ class DataSet(object):
     def __repr__(self):
         return "DataSet(" + self.__inner_repr__() + ")"

-    def append(self, ins):
+    def append(self, instance):
         """Append an Instance object to the end of the DataSet.
         If the DataSet is not empty, the instance must have the same field names as the rest instances in the DataSet.

-        :param ins: an Instance object
+        :param instance: an Instance object
         """
         if len(self.field_arrays) == 0:
             # DataSet has no field yet
-            for name, field in ins.fields.items():
+            for name, field in instance.fields.items():
                 field = field.tolist() if isinstance(field, np.ndarray) else field
                 self.field_arrays[name] = FieldArray(name, [field])  # the first sample must be wrapped in a list
         else:
-            if len(self.field_arrays) != len(ins.fields):
+            if len(self.field_arrays) != len(instance.fields):
                 raise ValueError(
                     "DataSet object has {} fields, but attempt to append an Instance object with {} fields."
-                    .format(len(self.field_arrays), len(ins.fields)))
-            for name, field in ins.fields.items():
+                    .format(len(self.field_arrays), len(instance.fields)))
+            for name, field in instance.fields.items():
                 assert name in self.field_arrays
                 self.field_arrays[name].append(field)

+    def add_fieldarray(self, field_name, fieldarray):
+        """Add a FieldArray to the DataSet.
+
+        :param str field_name: name of the field to add
+        :param FieldArray fieldarray: content of the field to add to the DataSet
+        :return:
+        """
+        if not isinstance(fieldarray, FieldArray):
+            raise TypeError("Only fastNLP.FieldArray supported.")
+        if len(self) != len(fieldarray):
+            raise RuntimeError(f"The field to add must have the same size as dataset. "
+                               f"Dataset size {len(self)} != field size {len(fieldarray)}")
+        self.field_arrays[field_name] = fieldarray
+
     def add_field(self, field_name, fields, padder=AutoPadder(), is_input=False, is_target=False, ignore_type=False):
         """Add a new field.

         :param str field_name: name of the new field
         :param list fields: content of the new field
-        :param None, Padder padder: if None, no padding is applied.
+        :param None,Padder padder: if None, no padding is applied.
         :param bool is_input: whether the new field is an input
         :param bool is_target: whether the new field is a target
         :param bool ignore_type: whether to skip type checking on the new field
@@ -179,18 +290,28 @@ class DataSet(object):
         if len(self.field_arrays) != 0:
             if len(self) != len(fields):
-                raise RuntimeError(f"The field to append must have the same size as dataset. "
+                raise RuntimeError(f"The field to add must have the same size as dataset. "
                                    f"Dataset size {len(self)} != field size {len(fields)}")
         self.field_arrays[field_name] = FieldArray(field_name, fields, is_target=is_target, is_input=is_input,
                                                    padder=padder, ignore_type=ignore_type)

     def delete_field(self, field_name):
-        """Delete a field.
+        """Delete the field named field_name.

         :param str field_name: name of the field to delete.
         """
         self.field_arrays.pop(field_name)

+    def has_field(self, field_name):
+        """Check whether the DataSet contains a field named field_name.
+
+        :param str field_name: name of the field
+        :return: bool
+        """
+        if isinstance(field_name, str):
+            return field_name in self.field_arrays
+        return False
+
     def get_field(self, field_name):
         """Return the field named field_name.
@@ -318,25 +439,21 @@ class DataSet(object):
     def apply_field(self, func, field_name, new_field_name=None, **kwargs):
         """Pass the `field_name` field of every instance in the DataSet to func and collect the return values.

-        :param callable func: its input is the instance's `field_name` field.
-        :param str field_name: the field that is passed to func.
-        :param str, None new_field_name: where to put the content func returns
-            1. str, put func's return values into a new field named `new_field_name`; if the name clashes with an
-               existing field, the old field is overwritten
-            2. None, do not create a new field
-        :param kwargs: the following three arguments are accepted
+        :param callable func: its input is the content of the instance's `field_name` field.
+        :param str field_name: the field that is passed to func.
+        :param None,str new_field_name: put func's return values into the field new_field_name; if the name clashes
+            with an existing field, the old field is overwritten. If None, no new field is created.
+        :param optional kwargs: is_input, is_target and ignore_type are supported
             1. is_input: bool, if True, mark the `new_field_name` field as input
             2. is_target: bool, if True, mark the `new_field_name` field as target
             3. ignore_type: bool, if True, set ignore_type on the `new_field_name` field so its type is not checked

         :return: list(Any), the elements are func's return values, so the list length equals the length of the DataSet
         """
-        assert len(self)!=0, "Null DataSet cannot use apply()."
+        assert len(self)!=0, "Null DataSet cannot use apply_field()."
         if field_name not in self:
             raise KeyError("DataSet has no field named `{}`.".format(field_name))
         results = []
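A short usage sketch of apply_field as documented above; the data is illustrative::

    from fastNLP import DataSet

    dataset = DataSet({'sentence': ["This is the first instance .", "Second instance ."]})
    # func sees only the content of `field_name`; the list of return values is also returned
    words = dataset.apply_field(lambda sent: sent.split(), field_name='sentence',
                                new_field_name='words', is_input=True)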
@@ -388,23 +505,19 @@ class DataSet(object):
                            ignore_type=extra_param.get("ignore_type", False))

     def apply(self, func, new_field_name=None, **kwargs):
-        """Pass every instance in the DataSet to func and collect the return values.
-
-        :param callable func: its argument is an instance from the DataSet
-        :param str, None new_field_name: where to put the content func returns
-            1. str, put func's return values into a new field named `new_field_name`; if the name clashes with an
-               existing field, the old field is overwritten
-            2. None, do not create a new field
-        :param kwargs: the following three arguments are accepted
+        """Pass every instance in the DataSet to func and collect the return values.
+
+        :param callable func: its argument is an Instance from the DataSet
+        :param None,str new_field_name: put func's return values into the field new_field_name; if the name clashes
+            with an existing field, the old field is overwritten. If None, no new field is created.
+        :param optional kwargs: is_input, is_target and ignore_type are supported
             1. is_input: bool, if True, mark the `new_field_name` field as input
             2. is_target: bool, if True, mark the `new_field_name` field as target
             3. ignore_type: bool, if True, set ignore_type on the `new_field_name` field so its type is not checked
-        :return: List[], the elements are func's return values, so the list length equals the length of the DataSet
+        :return: list(Any), the elements are func's return values, so the list length equals the length of the DataSet
         """
         assert len(self)!=0, "Null DataSet cannot use apply()."
         idx = -1
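By contrast, apply hands func the whole Instance, so it can read several fields at once; a sketch with illustrative data::

    from fastNLP import DataSet

    dataset = DataSet({'words': [['a', 'b', 'c'], ['d', 'e']]})
    # func receives the Instance; store each length in a new seq_len field
    seq_lens = dataset.apply(lambda ins: len(ins['words']), new_field_name='seq_len')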
@@ -426,10 +539,10 @@ class DataSet(object):
         return results

     def drop(self, func, inplace=True):
-        """func takes an instance and returns a bool; when the return value is True, that instance is deleted.
+        """func takes an instance and returns a bool; when the return value is True, that instance is removed in place or left out of the returned DataSet.

         :param callable func: takes an instance as its argument and returns a bool. True means the instance is dropped
-        :param bool inplace: whether to delete the instances directly in the current DataSet. If False, a new DataSet with the matching instances removed is returned
+        :param bool inplace: whether to delete the instances directly in the current DataSet. If False, a new DataSet from which the matching instances have been dropped is returned

         :return: DataSet
         """
@@ -440,10 +553,13 @@ class DataSet(object):
             return self
         else:
             results = [ins for ins in self if not func(ins)]
-            dataset = DataSet(results)
-            for field_name, field in self.field_arrays.items():
-                dataset.field_arrays[field_name].to(field)
-            return dataset
+            if len(results)!=0:
+                dataset = DataSet(results)
+                for field_name, field in self.field_arrays.items():
+                    dataset.field_arrays[field_name].to(field)
+                return dataset
+            else:
+                return DataSet()

     def split(self, ratio):
         """Split the DataSet according to ratio, returning two DataSets.
@@ -1,4 +1,9 @@
+"""
+FieldArray is the storage for a single column of a DataSet_.
+
+.. _FieldArray:
+
+"""
 import numpy as np
@@ -1,3 +1,14 @@
+"""
+Instance documentation
+
+.. _Instance:
+
+test
+"""
+
+
 class Instance(object):
     """An Instance is an example of data.
     Example::
@@ -24,47 +24,50 @@ def _prepare_cache_filepath(filepath):
     if not os.path.exists(cache_dir):
         os.makedirs(cache_dir)

 # TODO: save the parameters used when the cache was written, and warn at load time if they do not match.
-def cache_results(cache_filepath, refresh=False, verbose=1):
+def cache_results(_cache_fp, _refresh=False, _verbose=1):
     def wrapper_(func):
         signature = inspect.signature(func)
         for key, _ in signature.parameters.items():
-            if key in ('cache_filepath', 'refresh', 'verbose'):
+            if key in ('_cache_fp', '_refresh', '_verbose'):
                 raise RuntimeError("The function decorated by cache_results cannot have keyword `{}`.".format(key))

         def wrapper(*args, **kwargs):
-            if 'cache_filepath' in kwargs:
-                _cache_filepath = kwargs.pop('cache_filepath')
-                assert isinstance(_cache_filepath, str), "cache_filepath can only be str."
+            if '_cache_fp' in kwargs:
+                cache_filepath = kwargs.pop('_cache_fp')
+                assert isinstance(cache_filepath, str), "_cache_fp can only be str."
             else:
-                _cache_filepath = cache_filepath
-            if 'refresh' in kwargs:
-                _refresh = kwargs.pop('refresh')
-                assert isinstance(_refresh, bool), "refresh can only be bool."
+                cache_filepath = _cache_fp
+            if '_refresh' in kwargs:
+                refresh = kwargs.pop('_refresh')
+                assert isinstance(refresh, bool), "_refresh can only be bool."
             else:
-                _refresh = refresh
-            if 'verbose' in kwargs:
-                _verbose = kwargs.pop('verbose')
-                assert isinstance(_verbose, int), "verbose can only be integer."
+                refresh = _refresh
+            if '_verbose' in kwargs:
+                verbose = kwargs.pop('_verbose')
+                assert isinstance(verbose, int), "_verbose can only be integer."
             else:
-                _verbose = verbose
+                verbose = _verbose

             refresh_flag = True
-            if _cache_filepath is not None and _refresh is False:
+            if cache_filepath is not None and refresh is False:
                 # load data
-                if os.path.exists(_cache_filepath):
-                    with open(_cache_filepath, 'rb') as f:
+                if os.path.exists(cache_filepath):
+                    with open(cache_filepath, 'rb') as f:
                         results = _pickle.load(f)
                         if verbose==1:
-                            print("Read cache from {}.".format(_cache_filepath))
+                            print("Read cache from {}.".format(cache_filepath))
                         refresh_flag = False

             if refresh_flag:
                 results = func(*args, **kwargs)
-                if _cache_filepath is not None:
+                if cache_filepath is not None:
                     if results is None:
                         raise RuntimeError("The return value is None. Delete the decorator.")
-                    _prepare_cache_filepath(_cache_filepath)
-                    with open(_cache_filepath, 'wb') as f:
+                    _prepare_cache_filepath(cache_filepath)
+                    with open(cache_filepath, 'wb') as f:
                         _pickle.dump(results, f)
-                    print("Save cache to {}.".format(_cache_filepath))
+                    print("Save cache to {}.".format(cache_filepath))

             return results

         return wrapper
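The leading underscores exist so the control keywords cannot collide with the decorated function's own parameters, and each one can be overridden per call. A usage sketch; the import path is assumed from the surrounding context of this diff, and the cache paths are illustrative::

    from fastNLP.core.utils import cache_results  # import path assumed

    @cache_results('caches/data.pkl')
    def build_data():
        return list(range(1000))  # stands in for expensive preprocessing

    d = build_data()               # first call computes and pickles to caches/data.pkl
    d = build_data()               # later calls load the pickle instead of recomputing
    d = build_data(_refresh=True)  # per-call override: recompute and overwrite the cache
    d = build_data(_cache_fp='caches/other.pkl', _verbose=0)  # redirect the cache, silence the print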
@@ -1,7 +1,6 @@
 import os

 import numpy as np
-import torch

 from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.io.base_loader import BaseLoader
@@ -14,120 +13,6 @@ class EmbedLoader(BaseLoader):
     def __init__(self):
         super(EmbedLoader, self).__init__()

-    @staticmethod
-    def _load_glove(emb_file):
-        """Read file as a glove embedding
-
-        file format:
-            embeddings are split by line,
-            for one embedding, word and numbers split by space
-        Example::
-
-            word_1 float_1 float_2 ... float_emb_dim
-            word_2 float_1 float_2 ... float_emb_dim
-            ...
-        """
-        emb = {}
-        with open(emb_file, 'r', encoding='utf-8') as f:
-            for line in f:
-                line = list(filter(lambda w: len(w) > 0, line.strip().split(' ')))
-                if len(line) > 2:
-                    emb[line[0]] = torch.Tensor(list(map(float, line[1:])))
-        return emb
-
-    @staticmethod
-    def _load_pretrain(emb_file, emb_type):
-        """Read txt data from embedding file and convert to np.array as pre-trained embedding
-
-        :param str emb_file: the pre-trained embedding file path
-        :param str emb_type: the pre-trained embedding data format
-        :return: a dict of ``{str: np.array}``
-        """
-        if emb_type == 'glove':
-            return EmbedLoader._load_glove(emb_file)
-        else:
-            raise Exception("embedding type {} not support yet".format(emb_type))
-
-    @staticmethod
-    def load_embedding(emb_dim, emb_file, emb_type, vocab):
-        """Load the pre-trained embedding and combine with the given dictionary.
-
-        :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding.
-        :param str emb_file: the pre-trained embedding file path.
-        :param str emb_type: the pre-trained embedding format, support glove now
-        :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding
-        :return (embedding_tensor, vocab):
-            embedding_tensor - Tensor of shape (len(word_dict), emb_dim);
-            vocab - input vocab or vocab built by pre-train
-        """
-        pretrain = EmbedLoader._load_pretrain(emb_file, emb_type)
-        if vocab is None:
-            # build vocabulary from pre-trained embedding
-            vocab = Vocabulary()
-            for w in pretrain.keys():
-                vocab.add(w)
-        embedding_tensor = torch.randn(len(vocab), emb_dim)
-        for w, v in pretrain.items():
-            if len(v.shape) > 1 or emb_dim != v.shape[0]:
-                raise ValueError(
-                    "Pretrained embedding dim is {}. Dimension dismatched. Required {}".format(v.shape, (emb_dim,)))
-            if vocab.has_word(w):
-                embedding_tensor[vocab[w]] = v
-        return embedding_tensor, vocab
-
-    @staticmethod
-    def parse_glove_line(line):
-        line = line.split()
-        if len(line) <= 2:
-            raise RuntimeError("something goes wrong in parsing glove embedding")
-        return line[0], line[1:]
-
-    @staticmethod
-    def str_list_2_vec(line):
-        try:
-            return torch.Tensor(list(map(float, line)))
-        except Exception:
-            raise RuntimeError("something goes wrong in parsing glove embedding")
-
-    @staticmethod
-    def fast_load_embedding(emb_dim, emb_file, vocab):
-        """Fast load the pre-trained embedding and combine with the given dictionary.
-        This loading method uses line-by-line operation.
-
-        :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding.
-        :param str emb_file: the pre-trained embedding file path.
-        :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding
-        :return embedding_matrix: numpy.ndarray
-        """
-        if vocab is None:
-            raise RuntimeError("You must provide a vocabulary.")
-        embedding_matrix = np.zeros(shape=(len(vocab), emb_dim), dtype=np.float32)
-        hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
-        with open(emb_file, "r", encoding="utf-8") as f:
-            startline = f.readline()
-            if len(startline.split()) > 2:
-                f.seek(0)
-            for line in f:
-                word, vector = EmbedLoader.parse_glove_line(line)
-                if word in vocab:
-                    vector = EmbedLoader.str_list_2_vec(vector)
-                    if len(vector.shape) > 1 or emb_dim != vector.shape[0]:
-                        raise ValueError("Pre-trained embedding dim is {}. Expect {}.".format(vector.shape, (emb_dim,)))
-                    embedding_matrix[vocab[word]] = vector
-                    hit_flags[vocab[word]] = 1
-
-        if np.sum(hit_flags) < len(vocab):
-            # some words from vocab are missing in pre-trained embedding
-            # we normally sample each dimension
-            vocab_embed = embedding_matrix[np.where(hit_flags)]
-            sampled_vectors = np.random.normal(vocab_embed.mean(axis=0), vocab_embed.std(axis=0),
-                                               size=(len(vocab) - np.sum(hit_flags), emb_dim))
-            embedding_matrix[np.where(1 - hit_flags)] = sampled_vectors
-        return embedding_matrix
-
     @staticmethod
     def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'):
         """
@@ -36,8 +36,7 @@ def viterbi_decode(feats, transitions, mask=None, unpad=False):
     vpath = feats.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)

     vscore = feats[0]
-    vscore += transitions[n_tags, :n_tags]
-    trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data
+    trans_score = transitions.view(1, n_tags, n_tags).data
     for i in range(1, seq_len):
         prev_score = vscore.view(batch_size, n_tags, 1)
         cur_score = feats[i].view(batch_size, 1, n_tags)
@@ -155,7 +155,7 @@ print('test len {}'.format(len(test_data)))
 def train(path):
     # test saving pipeline
     save_pipe(path)
-    embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
+    embed = EmbedLoader.load_with_vocab(emb_file_name, word_v)
     embed = torch.tensor(embed, dtype=torch.float32)
     # embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)
@@ -1,4 +1,5 @@
 numpy>=1.14.2
 torch>=0.4.0
 tensorboardX
-tqdm>=4.28.1
\ No newline at end of file
+tqdm>=4.28.1
+nltk>=3.4.1
@@ -89,17 +89,17 @@ class TestCache(unittest.TestCase):
     def test_duplicate_keyword(self):
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_verbose(a, verbose):
+            def func_verbose(a, _verbose):
                 pass

             func_verbose(0, 1)
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_cache(a, cache_filepath):
+            def func_cache(a, _cache_fp):
                 pass

             func_cache(1, 2)
         with self.assertRaises(RuntimeError):
             @cache_results(None)
-            def func_refresh(a, refresh):
+            def func_refresh(a, _refresh):
                 pass

             func_refresh(1, 2)
@@ -6,12 +6,6 @@ from fastNLP.io.embed_loader import EmbedLoader

 class TestEmbedLoader(unittest.TestCase):
-    def test_case(self):
-        vocab = Vocabulary()
-        vocab.update(["the", "in", "I", "to", "of", "hahaha"])
-        embedding = EmbedLoader().fast_load_embedding(50, "test/data_for_tests/glove.6B.50d_test.txt", vocab)
-        self.assertEqual(tuple(embedding.shape), (len(vocab), 50))
-
     def test_load_with_vocab(self):
         vocab = Vocabulary()
         glove = "test/data_for_tests/glove.6B.50d_test.txt"