@@ -1,7 +0,0 @@
-fastNLP.models.base\_model
-==========================
-
-.. automodule:: fastNLP.models.base_model
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.models.bert
-===================
-
-.. automodule:: fastNLP.models.bert
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.models.enas\_controller
-===============================
-
-.. automodule:: fastNLP.models.enas_controller
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.models.enas\_model
-==========================
-
-.. automodule:: fastNLP.models.enas_model
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.models.enas\_trainer
-============================
-
-.. automodule:: fastNLP.models.enas_trainer
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -1,7 +0,0 @@
-fastNLP.models.enas\_utils
-==========================
-
-.. automodule:: fastNLP.models.enas_utils
-    :members:
-    :undoc-members:
-    :show-inheritance:
@@ -12,14 +12,8 @@ fastNLP.models
 .. toctree::
     :titlesonly:
 
-    fastNLP.models.base_model
-    fastNLP.models.bert
     fastNLP.models.biaffine_parser
     fastNLP.models.cnn_text_classification
-    fastNLP.models.enas_controller
-    fastNLP.models.enas_model
-    fastNLP.models.enas_trainer
-    fastNLP.models.enas_utils
     fastNLP.models.sequence_labeling
     fastNLP.models.snli
     fastNLP.models.star_transformer
@@ -3,12 +3,12 @@ batch 模块实现了 fastNLP 所需的 Batch 类。
 """
 import atexit
+from queue import Empty, Full
 
 import numpy as np
 import torch
 import torch.multiprocessing as mp
-from queue import Empty, Full
 
 from .sampler import RandomSampler
 
 __all__ = [
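The import reshuffles in this and the core/* hunks below all apply one convention: standard-library imports first (including the `from X import Y` forms), third-party packages second, intra-package imports last, separated by blank lines. A minimal sketch of the target layout, using only module names that appear in these hunks:

    # standard library
    import atexit
    from queue import Empty, Full

    # third-party
    import numpy as np
    import torch

    # intra-package
    from .sampler import RandomSampler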
@@ -50,6 +50,7 @@ callback模块实现了 fastNLP 中的许多 callback 类,用于增强 :class:
 """
 import os
+
 import torch
 
 try:
@@ -273,9 +273,10 @@
 """
 import _pickle as pickle
-import numpy as np
 import warnings
+
+import numpy as np
 
 from .field import AutoPadder
 from .field import FieldArray
 from .instance import Instance
@@ -3,10 +3,10 @@ field模块实现了 FieldArray 和若干 Padder。 FieldArray 是 :class:`~fas
 原理部分请参考 :doc:`fastNLP.core.dataset`
 """
-import numpy as np
 from copy import deepcopy
+
+import numpy as np
 
 __all__ = [
     "FieldArray",
     "Padder",
@@ -3,11 +3,11 @@ losses 模块定义了 fastNLP 中所需的各种损失函数,一般做为 :cl
 """
 import inspect
+from collections import defaultdict
 
 import torch
 import torch.nn.functional as F
-from collections import defaultdict
 
 from .utils import _CheckError
 from .utils import _CheckRes
 from .utils import _build_args
@@ -3,11 +3,11 @@ metrics 模块实现了 fastNLP 所需的各种常用衡量指标,一般做为
 """
 import inspect
+from collections import defaultdict
 
 import numpy as np
 import torch
-from collections import defaultdict
 
 from .utils import _CheckError
 from .utils import _CheckRes
 from .utils import _build_args
@@ -2,10 +2,10 @@
 ..todo::
     检查这个类是否需要
 """
-import torch
 from collections import defaultdict
+
+import torch
 
 from . import Batch
 from . import DataSet
 from . import SequentialSampler
@@ -1,10 +1,10 @@
 """
 sampler 子类实现了 fastNLP 所需的各种采样器。
 """
-import numpy as np
 from itertools import chain
+
+import numpy as np
 
 __all__ = [
     "Sampler",
     "BucketSampler",
@@ -33,6 +33,7 @@ Tester在验证进行之前会调用model.eval()提示当前进入了evaluation
 """
 import warnings
+
 import torch
 import torch.nn as nn
@@ -297,13 +297,13 @@ Example2.3
 """
 import os
-import numpy as np
 import time
+from datetime import datetime, timedelta
+
+import numpy as np
 import torch
 import torch.nn as nn
-from datetime import datetime, timedelta
 
 try:
     from tqdm.auto import tqdm
 except:
@@ -3,14 +3,13 @@ utils模块实现了 fastNLP 内部和外部所需的很多工具。其中用户
 """
 import _pickle
 import inspect
-import numpy as np
 import os
-import torch
-import torch.nn as nn
 import warnings
-from collections import Counter, namedtuple
+from collections import Counter
+from collections import namedtuple
+
+import numpy as np
+import torch
+import torch.nn as nn
 
 __all__ = [
     "cache_results",
@@ -9,6 +9,11 @@
 这些类的使用方法如下:
 """
+from .embed_loader import EmbedLoader
+from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \
+    PeopleDailyCorpusLoader, Conll2003Loader
+from .model_io import ModelLoader, ModelSaver
+
 __all__ = [
     'EmbedLoader',
@@ -24,7 +29,3 @@ __all__ = [
     'ModelLoader',
     'ModelSaver',
 ]
-from .embed_loader import EmbedLoader
-from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, SNLILoader, SSTLoader, \
-    PeopleDailyCorpusLoader, Conll2003Loader
-from .model_io import ModelLoader as ModelLoader, ModelSaver as ModelSaver
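A note on the `__init__` layout used here: concrete imports come first, then an explicit `__all__` declaring the public re-exports, so `from fastNLP.io import EmbedLoader` works without exposing internal helpers; the `ModelLoader as ModelLoader` self-aliases are redundant for that purpose once `__all__` lists the names. A condensed sketch of the pattern, using only names from this hunk:

    from .embed_loader import EmbedLoader
    from .model_io import ModelLoader, ModelSaver

    __all__ = ['EmbedLoader', 'ModelLoader', 'ModelSaver']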
@@ -1,15 +1,20 @@
 import _pickle as pickle
 import os
 
+__all__ = [
+    "BaseLoader"
+]
+
 
 class BaseLoader(object):
     """
    各个 Loader 的基类,提供了 API 的参考。
     """
-
+    
     def __init__(self):
         super(BaseLoader, self).__init__()
-
+    
     @staticmethod
     def load_lines(data_path):
         """
@@ -20,7 +25,7 @@ class BaseLoader(object):
         with open(data_path, "r", encoding="utf=8") as f:
             text = f.readlines()
         return [line.strip() for line in text]
-
+    
     @classmethod
     def load(cls, data_path):
         """
@@ -31,7 +36,7 @@ class BaseLoader(object):
         with open(data_path, "r", encoding="utf-8") as f:
             text = f.readlines()
         return [[word for word in sent.strip()] for sent in text]
-
+    
     @classmethod
     def load_with_cache(cls, data_path, cache_path):
         """缓存版的load
@@ -48,16 +53,18 @@ class BaseLoader(object):
 
 class DataLoaderRegister:
     _readers = {}
-
+    
     @classmethod
     def set_reader(cls, reader_cls, read_fn_name):
         # def wrapper(reader_cls):
         if read_fn_name in cls._readers:
-            raise KeyError('duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls, read_fn_name))
+            raise KeyError(
+                'duplicate reader: {} and {} for read_func: {}'.format(cls._readers[read_fn_name], reader_cls,
+                                                                       read_fn_name))
         if hasattr(reader_cls, 'load'):
             cls._readers[read_fn_name] = reader_cls().load
         return reader_cls
-
+    
     @classmethod
     def get_reader(cls, read_fn_name):
         if read_fn_name in cls._readers:
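DataLoaderRegister is a small name-to-reader registry: `set_reader` stores a loader instance's bound `load` method under `read_fn_name` (raising `KeyError` on duplicates), and `get_reader` presumably returns the stored callable for that name. A sketch of the round trip, with a hypothetical loader class:

    class MyTxtLoader(BaseLoader):
        pass  # inherits load()/load_lines() from BaseLoader

    DataLoaderRegister.set_reader(MyTxtLoader, 'read_txt')
    read_txt = DataLoaderRegister.get_reader('read_txt')
    sents = read_txt('data.txt')  # per BaseLoader.load: one list of characters per line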
@@ -1,14 +1,20 @@
 """
 用于读入和处理和保存 config 文件
+
+.. todo::
+    这个模块中的类可能被抛弃?
 """
-__all__ = ["ConfigLoader","ConfigSection","ConfigSaver"]
 import configparser
 import json
 import os
 
 from .base_loader import BaseLoader
 
+__all__ = [
+    "ConfigLoader",
+    "ConfigSection",
+    "ConfigSaver"
+]
+
 
 class ConfigLoader(BaseLoader):
     """
@@ -19,15 +25,16 @@ class ConfigLoader(BaseLoader):
 
     :param str data_path: 配置文件的路径
     """
+    
     def __init__(self, data_path=None):
         super(ConfigLoader, self).__init__()
         if data_path is not None:
             self.config = self.parse(super(ConfigLoader, self).load(data_path))
-
+    
     @staticmethod
     def parse(string):
         raise NotImplementedError
-
+    
     @staticmethod
     def load_config(file_path, sections):
         """
@@ -81,10 +88,10 @@ class ConfigSection(object):
     ConfigSection是一个存储了一个section中所有键值对的数据结构,推荐使用此类的实例来配合 :meth:`ConfigLoader.load_config` 使用
     """
-
+    
     def __init__(self):
         super(ConfigSection, self).__init__()
-
+    
     def __getitem__(self, key):
         """
@@ -97,7 +104,7 @@ class ConfigSection(object):
         if key in self.__dict__.keys():
             return getattr(self, key)
         raise AttributeError("do NOT have attribute %s" % key)
-
+    
     def __setitem__(self, key, value):
         """
         :param key: str, the name of the attribute
@@ -112,14 +119,14 @@ class ConfigSection(object):
             raise AttributeError("attr %s except %s but got %s" %
                                  (key, str(type(getattr(self, key))), str(type(value))))
         setattr(self, key, value)
-
+    
     def __contains__(self, item):
         """
         :param item: The key of item.
         :return: True if the key in self.__dict__.keys() else False.
         """
         return item in self.__dict__.keys()
-
+    
     def __eq__(self, other):
         """Overwrite the == operator
@@ -131,15 +138,15 @@ class ConfigSection(object):
                 return False
             if getattr(self, k) != getattr(self, k):
                 return False
-
+        
         for k in other.__dict__.keys():
             if k not in self.__dict__.keys():
                 return False
             if getattr(self, k) != getattr(self, k):
                 return False
-
+        
         return True
-
+    
     def __ne__(self, other):
         """Overwrite the != operator
@@ -147,7 +154,7 @@ class ConfigSection(object):
         :return:
         """
         return not self.__eq__(other)
-
+    
     @property
     def data(self):
         return self.__dict__
@@ -162,11 +169,12 @@ class ConfigSaver(object):
 
     :param str file_path: 配置文件的路径
     """
+    
     def __init__(self, file_path):
         self.file_path = file_path
         if not os.path.exists(self.file_path):
             raise FileNotFoundError("file {} NOT found!".__format__(self.file_path))
-
+    
     def _get_section(self, sect_name):
         """
         This is the function to get the section with the section name.
@@ -177,7 +185,7 @@ class ConfigSaver(object):
         sect = ConfigSection()
         ConfigLoader().load_config(self.file_path, {sect_name: sect})
         return sect
-
+    
     def _read_section(self):
         """
         This is the function to read sections from the config file.
@@ -187,16 +195,16 @@ class ConfigSaver(object):
             sect_key_list: A list of names in sect_list.
         """
         sect_name = None
         sect_list = {}
         sect_key_list = []
-
+        
         single_section = {}
         single_section_key = []
-
+        
         with open(self.file_path, 'r') as f:
             lines = f.readlines()
-
+        
         for line in lines:
             if line.startswith('[') and line.endswith(']\n'):
                 if sect_name is None:
@@ -208,29 +216,29 @@ class ConfigSaver(object):
                     sect_key_list.append(sect_name)
                 sect_name = line[1: -2]
                 continue
-
+            
             if line.startswith('#'):
                 single_section[line] = '#'
                 single_section_key.append(line)
                 continue
-
+            
             if line.startswith('\n'):
                 single_section_key.append('\n')
                 continue
-
+            
             if '=' not in line:
                 raise RuntimeError("can NOT load config file {}".__format__(self.file_path))
-
+            
             key = line.split('=', maxsplit=1)[0].strip()
             value = line.split('=', maxsplit=1)[1].strip() + '\n'
             single_section[key] = value
             single_section_key.append(key)
-
+        
         if sect_name is not None:
             sect_list[sect_name] = single_section, single_section_key
             sect_key_list.append(sect_name)
         return sect_list, sect_key_list
-
+    
     def _write_section(self, sect_list, sect_key_list):
         """
         This is the function to write config file with section list and name list.
@@ -252,7 +260,7 @@ class ConfigSaver(object):
                 continue
             f.write(key + ' = ' + single_section[key])
         f.write('\n')
-
+    
     def save_config_file(self, section_name, section):
         """
         这个方法可以用来修改并保存配置文件中单独的一个 section
@@ -284,11 +292,11 @@ class ConfigSaver(object):
                     break
         if not change_file:
             return
-
+        
         sect_list, sect_key_list = self._read_section()
         if section_name not in sect_key_list:
             raise AttributeError()
-
+        
         sect, sect_key = sect_list[section_name]
         for k in section.__dict__.keys():
             if k not in sect_key:
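Taken together, the three classes compose as below (a minimal sketch; the config path and the [train] section with its keys are hypothetical). One thing worth flagging while this file is being touched: `__eq__` compares `getattr(self, k) != getattr(self, k)` in both loops, which looks like the right-hand side was meant to be `getattr(other, k)`.

    from fastNLP.io.config_io import ConfigLoader, ConfigSection, ConfigSaver

    section = ConfigSection()
    ConfigLoader().load_config("default.cfg", {"train": section})  # fills `section` from [train]

    print(section["epochs"])  # __getitem__ raises AttributeError if the key is missing
    section["epochs"] = 20    # __setitem__ refuses to change an existing value's type

    ConfigSaver("default.cfg").save_config_file("train", section)  # rewrites only [train]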
@@ -10,6 +10,12 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的
     # ... do stuff
 """
+from nltk.tree import Tree
+
+from ..core.dataset import DataSet
+from ..core.instance import Instance
+from .file_reader import _read_csv, _read_json, _read_conll
+
 __all__ = [
     'DataSetLoader',
     'CSVLoader',
@@ -20,11 +26,6 @@ __all__ = [
     'PeopleDailyCorpusLoader',
     'Conll2003Loader',
 ]
-
-from nltk.tree import Tree
-from ..core.dataset import DataSet
-from ..core.instance import Instance
-from .file_reader import _read_csv, _read_json, _read_conll
 
 
 def _download_from_url(url, path):
@@ -1,11 +1,15 @@
 import os
+import warnings
 
 import numpy as np
 
 from ..core.vocabulary import Vocabulary
 from .base_loader import BaseLoader
-import warnings
+
+__all__ = [
+    "EmbedLoader"
+]
 
 
 class EmbedLoader(BaseLoader):
     """
@@ -13,10 +17,10 @@ class EmbedLoader(BaseLoader):
 
     用于读取预训练的embedding, 读取结果可直接载入为模型参数。
     """
-
+    
     def __init__(self):
         super(EmbedLoader, self).__init__()
-
+    
     @staticmethod
     def load_with_vocab(embed_filepath, vocab, dtype=np.float32, normalize=True, error='ignore'):
         """
@@ -40,11 +44,11 @@ class EmbedLoader(BaseLoader):
             line = f.readline().strip()
             parts = line.split()
             start_idx = 0
-            if len(parts)==2:
+            if len(parts) == 2:
                 dim = int(parts[1])
                 start_idx += 1
             else:
-                dim = len(parts)-1
+                dim = len(parts) - 1
                 f.seek(0)
             matrix = np.random.randn(len(vocab), dim).astype(dtype)
             for idx, line in enumerate(f, start_idx):
@@ -63,21 +67,21 @@ class EmbedLoader(BaseLoader):
             total_hits = sum(hit_flags)
             print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab)))
             found_vectors = matrix[hit_flags]
-            if len(found_vectors)!=0:
+            if len(found_vectors) != 0:
                 mean = np.mean(found_vectors, axis=0, keepdims=True)
                 std = np.std(found_vectors, axis=0, keepdims=True)
                 unfound_vec_num = len(vocab) - total_hits
-                r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype)*std + mean
-                matrix[hit_flags==False] = r_vecs
+                r_vecs = np.random.randn(unfound_vec_num, dim).astype(dtype) * std + mean
+                matrix[hit_flags == False] = r_vecs
 
             if normalize:
                 matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
 
             return matrix
-
+    
     @staticmethod
     def load_without_vocab(embed_filepath, dtype=np.float32, padding='<pad>', unknown='<unk>', normalize=True,
-                            error='ignore'):
+                           error='ignore'):
         """
         从embed_filepath中读取预训练的word vector。根据预训练的词表读取embedding并生成一个对应的Vocabulary。
@@ -96,35 +100,35 @@ class EmbedLoader(BaseLoader):
         vec_dict = {}
         found_unknown = False
         found_pad = False
-
+        
         with open(embed_filepath, 'r', encoding='utf-8') as f:
             line = f.readline()
             start = 1
             dim = -1
-            if len(line.strip().split())!=2:
+            if len(line.strip().split()) != 2:
                 f.seek(0)
                 start = 0
             for idx, line in enumerate(f, start=start):
                 try:
                     parts = line.strip().split()
                     word = parts[0]
-                    if dim==-1:
-                        dim = len(parts)-1
+                    if dim == -1:
+                        dim = len(parts) - 1
                     vec = np.fromstring(' '.join(parts[1:]), sep=' ', dtype=dtype, count=dim)
                     vec_dict[word] = vec
                     vocab.add_word(word)
-                    if unknown is not None and unknown==word:
+                    if unknown is not None and unknown == word:
                         found_unknown = True
-                    if found_pad is not None and padding==word:
+                    if found_pad is not None and padding == word:
                         found_pad = True
                 except Exception as e:
-                    if error=='ignore':
+                    if error == 'ignore':
                         warnings.warn("Error occurred at the {} line.".format(idx))
                         pass
                     else:
                         print("Error occurred at the {} line.".format(idx))
                         raise e
-            if dim==-1:
+            if dim == -1:
                 raise RuntimeError("{} is an empty file.".format(embed_filepath))
             matrix = np.random.randn(len(vocab), dim).astype(dtype)
             if (unknown is not None and not found_unknown) or (padding is not None and not found_pad):
@@ -133,19 +137,19 @@ class EmbedLoader(BaseLoader):
                     start_idx += 1
                 if unknown is not None:
                     start_idx += 1
-
+                
                 mean = np.mean(matrix[start_idx:], axis=0, keepdims=True)
                 std = np.std(matrix[start_idx:], axis=0, keepdims=True)
                 if (unknown is not None and not found_unknown):
-                    matrix[start_idx-1] = np.random.randn(1, dim).astype(dtype)*std + mean
+                    matrix[start_idx - 1] = np.random.randn(1, dim).astype(dtype) * std + mean
                 if (padding is not None and not found_pad):
-                    matrix[0] = np.random.randn(1, dim).astype(dtype)*std + mean
-
+                    matrix[0] = np.random.randn(1, dim).astype(dtype) * std + mean
+            
             for key, vec in vec_dict.items():
                 index = vocab.to_index(key)
                 matrix[index] = vec
-
+            
             if normalize:
                 matrix /= np.linalg.norm(matrix, axis=1, keepdims=True)
-
+            
             return matrix, vocab
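For orientation, a minimal usage sketch of the two entry points above (file path and vocabulary contents are made up; the API is the fastNLP 0.4-era one shown in this file):

    import numpy as np
    from fastNLP.core.vocabulary import Vocabulary
    from fastNLP.io.embed_loader import EmbedLoader

    # 1) align a pre-trained embedding with an existing vocab; words missing from
    #    the file get vectors sampled from the found vectors' mean/std
    vocab = Vocabulary()
    vocab.add_word("the")
    vocab.add_word("cat")
    matrix = EmbedLoader.load_with_vocab("glove.txt", vocab, dtype=np.float32)

    # 2) build the vocabulary from the embedding file itself
    matrix, vocab = EmbedLoader.load_without_vocab("glove.txt", padding='<pad>', unknown='<unk>')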
@@ -5,6 +5,11 @@ import torch
 
 from .base_loader import BaseLoader
 
+__all__ = [
+    "ModelLoader",
+    "ModelSaver"
+]
+
 
 class ModelLoader(BaseLoader):
     """
@@ -12,10 +17,10 @@ class ModelLoader(BaseLoader):
 
     用于读取模型
     """
-
+    
     def __init__(self):
         super(ModelLoader, self).__init__()
-
+    
     @staticmethod
     def load_pytorch(empty_model, model_path):
         """
@@ -25,7 +30,7 @@ class ModelLoader(BaseLoader):
         :param str model_path: 模型保存的路径
         """
         empty_model.load_state_dict(torch.load(model_path))
-
+    
     @staticmethod
     def load_pytorch_model(model_path):
         """
@@ -48,14 +53,14 @@ class ModelSaver(object):
         saver.save_pytorch(model)
     """
-
+    
     def __init__(self, save_path):
         """
         :param save_path: 模型保存的路径
         """
         self.save_path = save_path
-
+    
     def save_pytorch(self, model, param_only=True):
         """
         把 PyTorch 模型存入 ".pkl" 文件
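The save/load pair is symmetric; a minimal round-trip sketch (model and path are hypothetical):

    import torch.nn as nn
    from fastNLP.io.model_io import ModelLoader, ModelSaver

    model = nn.Linear(10, 2)
    ModelSaver("./model_ckpt_100.pkl").save_pytorch(model)  # param_only=True stores the state_dict

    empty_model = nn.Linear(10, 2)  # same architecture, fresh weights
    ModelLoader.load_pytorch(empty_model, "./model_ckpt_100.pkl")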
@@ -7,7 +7,6 @@ fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models
 """
-__all__ = ["CNNText", "SeqLabeling", "ESIM", "STSeqLabel", "AdvSeqLabel", "STNLICls", "STSeqCls"]
 from .base_model import BaseModel
 from .bert import BertForMultipleChoice, BertForQuestionAnswering, BertForSequenceClassification, \
     BertForTokenClassification
@@ -15,4 +14,21 @@ from .biaffine_parser import BiaffineParser, GraphParser
 from .cnn_text_classification import CNNText
 from .sequence_labeling import SeqLabeling, AdvSeqLabel
 from .snli import ESIM
-from .star_transformer import STSeqCls, STNLICls, STSeqLabel
+from .star_transformer import StarTransEnc, STSeqCls, STNLICls, STSeqLabel
+
+__all__ = [
+    "CNNText",
+    
+    "SeqLabeling",
+    "AdvSeqLabel",
+    
+    "ESIM",
+    
+    "StarTransEnc",
+    "STSeqLabel",
+    "STNLICls",
+    "STSeqCls",
+    
+    "BiaffineParser",
+    "GraphParser"
+]
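With the expanded `__all__`, the supported entry points of the package are explicit, and typical usage stays a one-liner:

    from fastNLP.models import CNNText, BiaffineParser, GraphParser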
@@ -6,13 +6,13 @@ from ..modules.decoder.MLP import MLP
 
 class BaseModel(torch.nn.Module):
     """Base PyTorch model for all models.
     """
-
+    
     def __init__(self):
         super(BaseModel, self).__init__()
-
+    
     def fit(self, train_data, dev_data=None, **train_args):
         pass
-
+    
     def predict(self, *args, **kwargs):
         raise NotImplementedError
@@ -21,9 +21,9 @@ class NaiveClassifier(BaseModel):
     def __init__(self, in_feature_dim, out_feature_dim):
         super(NaiveClassifier, self).__init__()
         self.mlp = MLP([in_feature_dim, in_feature_dim, out_feature_dim])
-
+    
     def forward(self, x):
         return {"predict": torch.sigmoid(self.mlp(x))}
-
+    
     def predict(self, x):
         return {"predict": torch.sigmoid(self.mlp(x)) > 0.5}
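Both methods returning a dict rather than a bare tensor matches the convention the rest of fastNLP builds on: `forward` exposes probabilities under a key, `predict` the thresholded decision. A toy run of `NaiveClassifier` as defined above (dimensions are arbitrary):

    import torch
    from fastNLP.models.base_model import NaiveClassifier

    model = NaiveClassifier(in_feature_dim=4, out_feature_dim=2)
    x = torch.randn(8, 4)                # batch of 8 feature vectors
    probs = model(x)["predict"]          # sigmoid outputs, shape [8, 2]
    preds = model.predict(x)["predict"]  # boolean tensor: probability > 0.5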
@@ -1,11 +1,12 @@
-"""Biaffine Dependency Parser 的 Pytorch 实现.
 """
-from collections import defaultdict
+Biaffine Dependency Parser 的 Pytorch 实现.
+"""
 import numpy as np
 import torch
-from torch import nn
-from torch.nn import functional as F
+import torch.nn as nn
+import torch.nn.functional as F
+from collections import defaultdict
 
 from ..core.const import Const as C
 from ..core.losses import LossFunc
@@ -18,6 +19,12 @@ from ..modules.utils import get_embeddings
 from .base_model import BaseModel
 from ..core.utils import seq_len_to_mask
 
+__all__ = [
+    "BiaffineParser",
+    "GraphParser"
+]
+
 
 def _mst(scores):
     """
     with some modification to support parser output for MST decoding
@@ -44,7 +51,7 @@ def _mst(scores):
                               scores[roots, new_heads] / root_scores)]
         heads[roots] = new_heads
         heads[new_root] = 0
-
+    
     edges = defaultdict(set)
     vertices = set((0,))
     for dep, head in enumerate(heads[tokens]):
@@ -73,7 +80,7 @@ def _mst(scores):
             heads[changed_cycle] = new_head
             edges[new_head].add(changed_cycle)
             edges[old_head].remove(changed_cycle)
-
+    
     return heads
@@ -88,7 +95,7 @@ def _find_cycle(vertices, edges):
     _lowlinks = {}
     _onstack = defaultdict(lambda: False)
     _SCCs = []
-
+    
     def _strongconnect(v):
         nonlocal _index
         _indices[v] = _index
@@ -96,28 +103,28 @@ def _find_cycle(vertices, edges):
         _index += 1
         _stack.append(v)
         _onstack[v] = True
-
+        
        for w in edges[v]:
            if w not in _indices:
                _strongconnect(w)
                _lowlinks[v] = min(_lowlinks[v], _lowlinks[w])
            elif _onstack[w]:
                _lowlinks[v] = min(_lowlinks[v], _indices[w])
-
+        
        if _lowlinks[v] == _indices[v]:
            SCC = set()
            while True:
                w = _stack.pop()
                _onstack[w] = False
                SCC.add(w)
-                if not(w != v):
+                if not (w != v):
                    break
            _SCCs.append(SCC)
-
+    
    for v in vertices:
        if v not in _indices:
            _strongconnect(v)
-
+    
    return [SCC for SCC in _SCCs if len(SCC) > 1]
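For orientation: `_find_cycle` is Tarjan's strongly-connected-components algorithm restricted to non-trivial components, which `_mst` uses to detect cycles during maximum-spanning-tree (Chu-Liu/Edmonds style) decoding. A toy call with a hypothetical graph:

    from collections import defaultdict

    edges = defaultdict(set)
    edges[1].add(2)
    edges[2].add(3)
    edges[3].add(1)  # 1 -> 2 -> 3 -> 1 forms a cycle
    edges[3].add(4)  # 4 hangs off the cycle

    print(_find_cycle({1, 2, 3, 4}, edges))  # -> [{1, 2, 3}]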
@@ -125,9 +132,10 @@ class GraphParser(BaseModel):
     """
     基于图的parser base class, 支持贪婪解码和最大生成树解码
     """
+    
     def __init__(self):
         super(GraphParser, self).__init__()
-
+    
     @staticmethod
     def greedy_decoder(arc_matrix, mask=None):
         """
@@ -146,7 +154,7 @@ class GraphParser(BaseModel):
         if mask is not None:
             heads *= mask.long()
         return heads
-
+    
     @staticmethod
     def mst_decoder(arc_matrix, mask=None):
         """
@@ -176,6 +184,7 @@ class ArcBiaffine(nn.Module):
     :param hidden_size: 输入的特征维度
     :param bias: 是否使用bias. Default: ``True``
     """
+    
     def __init__(self, hidden_size, bias=True):
         super(ArcBiaffine, self).__init__()
         self.U = nn.Parameter(torch.Tensor(hidden_size, hidden_size), requires_grad=True)
@@ -185,7 +194,7 @@ class ArcBiaffine(nn.Module):
         else:
             self.register_parameter("bias", None)
         initial_parameter(self)
-
+    
     def forward(self, head, dep):
         """
@@ -209,11 +218,12 @@ class LabelBilinear(nn.Module):
     :param num_label: 边类别的个数
     :param bias: 是否使用bias. Default: ``True``
     """
+    
     def __init__(self, in1_features, in2_features, num_label, bias=True):
         super(LabelBilinear, self).__init__()
         self.bilinear = nn.Bilinear(in1_features, in2_features, num_label, bias=bias)
         self.lin = nn.Linear(in1_features + in2_features, num_label, bias=False)
-
+    
     def forward(self, x1, x2):
         """
@@ -225,13 +235,13 @@ class LabelBilinear(nn.Module):
         output += self.lin(torch.cat([x1, x2], dim=2))
         return output
 
+
 class BiaffineParser(GraphParser):
     """
     别名::class:`fastNLP.models.BiaffineParser` :class:`fastNLP.models.baffine_parser.BiaffineParser`
 
     Biaffine Dependency Parser 实现.
-    论文参考 ` Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016)
-    <https://arxiv.org/abs/1611.01734>`_ .
+    论文参考 `Deep Biaffine Attention for Neural Dependency Parsing (Dozat and Manning, 2016) <https://arxiv.org/abs/1611.01734>`_ .
 
     :param init_embed: 单词词典, 可以是 tuple, 包括(num_embedings, embedding_dim), 即
         embedding的大小和每个词的维度. 也可以传入 nn.Embedding 对象,
@@ -248,18 +258,19 @@ class BiaffineParser(GraphParser):
     :param use_greedy_infer: 是否在inference时使用贪心算法.
         若 ``False`` , 使用更加精确但相对缓慢的MST算法. Default: ``False``
     """
+    
     def __init__(self,
-                init_embed,
-                pos_vocab_size,
-                pos_emb_dim,
-                num_label,
-                rnn_layers=1,
-                rnn_hidden_size=200,
-                arc_mlp_size=100,
-                label_mlp_size=100,
-                dropout=0.3,
-                encoder='lstm',
-                use_greedy_infer=False):
+                 init_embed,
+                 pos_vocab_size,
+                 pos_emb_dim,
+                 num_label,
+                 rnn_layers=1,
+                 rnn_hidden_size=200,
+                 arc_mlp_size=100,
+                 label_mlp_size=100,
+                 dropout=0.3,
+                 encoder='lstm',
+                 use_greedy_infer=False):
         super(BiaffineParser, self).__init__()
         rnn_out_size = 2 * rnn_hidden_size
         word_hid_dim = pos_hid_dim = rnn_hidden_size
@@ -295,20 +306,20 @@ class BiaffineParser(GraphParser):
             if (d_k * n_head) != rnn_out_size:
                 raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size))
             self.position_emb = nn.Embedding(num_embeddings=self.max_len,
-                                             embedding_dim=rnn_out_size,)
+                                             embedding_dim=rnn_out_size, )
             self.encoder = TransformerEncoder(num_layers=rnn_layers,
                                               model_size=rnn_out_size,
                                               inner_size=1024,
                                               key_size=d_k,
                                               value_size=d_v,
                                               num_head=n_head,
-                                              dropout=dropout,)
+                                              dropout=dropout, )
         else:
             raise ValueError('unsupported encoder type: {}'.format(encoder))
-
+        
         self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2),
-                                  nn.ELU(),
-                                  TimestepDropout(p=dropout),)
+                                 nn.ELU(),
+                                 TimestepDropout(p=dropout), )
         self.arc_mlp_size = arc_mlp_size
         self.label_mlp_size = label_mlp_size
         self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
@@ -316,7 +327,7 @@ class BiaffineParser(GraphParser):
         self.use_greedy_infer = use_greedy_infer
         self.reset_parameters()
         self.dropout = dropout
-
+    
     def reset_parameters(self):
         for m in self.modules():
             if isinstance(m, nn.Embedding):
@@ -327,7 +338,7 @@ class BiaffineParser(GraphParser):
             else:
                 for p in m.parameters():
                     nn.init.normal_(p, 0, 0.1)
-
+    
     def forward(self, words1, words2, seq_len, target1=None):
         """模型forward阶段
 
@@ -337,50 +348,52 @@ class BiaffineParser(GraphParser):
         :param target1: [batch_size, seq_len] 输入真实标注的heads, 仅在训练阶段有效,
             用于训练label分类器. 若为 ``None`` , 使用预测的heads输入到label分类器
             Default: ``None``
-        :return dict: parsing结果::
+        :return dict: parsing
+                结果::
 
-            pred1: [batch_size, seq_len, seq_len] 边预测logits
-            pred2: [batch_size, seq_len, num_label] label预测logits
-            pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测
+                pred1: [batch_size, seq_len, seq_len] 边预测logits
+                pred2: [batch_size, seq_len, num_label] label预测logits
+                pred3: [batch_size, seq_len] heads的预测结果, 在 ``target1=None`` 时预测
         """
         # prepare embeddings
         batch_size, length = words1.shape
         # print('forward {} {}'.format(batch_size, seq_len))
-
+        
         # get sequence mask
         mask = seq_len_to_mask(seq_len).long()
-
-        word = self.word_embedding(words1) # [N,L] -> [N,L,C_0]
-        pos = self.pos_embedding(words2)   # [N,L] -> [N,L,C_1]
-
+        
+        word = self.word_embedding(words1)  # [N,L] -> [N,L,C_0]
+        pos = self.pos_embedding(words2)  # [N,L] -> [N,L,C_1]
+        
         word, pos = self.word_fc(word), self.pos_fc(pos)
         word, pos = self.word_norm(word), self.pos_norm(pos)
-        x = torch.cat([word, pos], dim=2) # -> [N,L,C]
-
+        x = torch.cat([word, pos], dim=2)  # -> [N,L,C]
+        
         # encoder, extract features
         if self.encoder_name.endswith('lstm'):
             sort_lens, sort_idx = torch.sort(seq_len, dim=0, descending=True)
             x = x[sort_idx]
             x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
-            feat, _ = self.encoder(x) # -> [N,L,C]
+            feat, _ = self.encoder(x)  # -> [N,L,C]
             feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True)
             _, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
             feat = feat[unsort_idx]
         else:
-            seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None,:]
+            seq_range = torch.arange(length, dtype=torch.long, device=x.device)[None, :]
             x = x + self.position_emb(seq_range)
             feat = self.encoder(x, mask.float())
-
+        
         # for arc biaffine
         # mlp, reduce dim
         feat = self.mlp(feat)
         arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size
-        arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz]
-        label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:]
-
+        arc_dep, arc_head = feat[:, :, :arc_sz], feat[:, :, arc_sz:2 * arc_sz]
+        label_dep, label_head = feat[:, :, 2 * arc_sz:2 * arc_sz + label_sz], feat[:, :, 2 * arc_sz + label_sz:]
+        
         # biaffine arc classifier
-        arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L]
-
+        arc_pred = self.arc_predictor(arc_head, arc_dep)  # [N, L, L]
+        
         # use gold or predicted arc to predict label
         if target1 is None or not self.training:
             # use greedy decoding in training
@@ -390,22 +403,22 @@ class BiaffineParser(GraphParser):
                 heads = self.mst_decoder(arc_pred, mask)
                 head_pred = heads
         else:
-            assert self.training # must be training mode
+            assert self.training  # must be training mode
             if target1 is None:
                 heads = self.greedy_decoder(arc_pred, mask)
                 head_pred = heads
             else:
                 head_pred = None
                 heads = target1
-
+        
         batch_range = torch.arange(start=0, end=batch_size, dtype=torch.long, device=words1.device).unsqueeze(1)
         label_head = label_head[batch_range, heads].contiguous()
-        label_pred = self.label_predictor(label_head, label_dep) # [N, L, num_label]
+        label_pred = self.label_predictor(label_head, label_dep)  # [N, L, num_label]
         res_dict = {C.OUTPUTS(0): arc_pred, C.OUTPUTS(1): label_pred}
         if head_pred is not None:
             res_dict[C.OUTPUTS(2)] = head_pred
         return res_dict
-
+    
     @staticmethod
     def loss(pred1, pred2, target1, target2, seq_len):
         """
@@ -418,7 +431,7 @@ class BiaffineParser(GraphParser):
         :param seq_len: [batch_size, seq_len] 真实目标的长度
         :return loss: scalar
         """
-
+        
         batch_size, length, _ = pred1.shape
         mask = seq_len_to_mask(seq_len)
         flip_mask = (mask == 0)
@@ -430,24 +443,26 @@ class BiaffineParser(GraphParser):
         child_index = torch.arange(length, device=arc_logits.device, dtype=torch.long).unsqueeze(0)
         arc_loss = arc_logits[batch_index, child_index, target1]
         label_loss = label_logits[batch_index, child_index, target2]
-
+        
         byte_mask = flip_mask.byte()
         arc_loss.masked_fill_(byte_mask, 0)
         label_loss.masked_fill_(byte_mask, 0)
         arc_nll = -arc_loss.mean()
         label_nll = -label_loss.mean()
         return arc_nll + label_nll
-
+    
     def predict(self, words1, words2, seq_len):
         """模型预测API
 
         :param words1: [batch_size, seq_len] 输入word序列
         :param words2: [batch_size, seq_len] 输入pos序列
         :param seq_len: [batch_size, seq_len] 输入序列长度
-        :return dict: parsing结果::
+        :return dict: parsing
+                结果::
 
-            pred1: [batch_size, seq_len] heads的预测结果
-            pred2: [batch_size, seq_len, num_label] label预测logits
+                pred1: [batch_size, seq_len] heads的预测结果
+                pred2: [batch_size, seq_len, num_label] label预测logits
         """
         res = self(words1, words2, seq_len)
         output = {}
@@ -470,6 +485,7 @@ class ParserLoss(LossFunc):
     :param seq_len: [batch_size, seq_len] 真实目标的长度
     :return loss: scalar
     """
+    
     def __init__(self, pred1=None, pred2=None,
                  target1=None, target2=None,
                  seq_len=None):
@@ -497,9 +513,10 @@ class ParserMetric(MetricBase):
         UAS: 不带label时, 边预测的准确率
         LAS: 同时预测边和label的准确率
     """
+    
     def __init__(self, pred1=None, pred2=None,
                  target1=None, target2=None, seq_len=None):
         super().__init__()
         self._init_param_map(pred1=pred1, pred2=pred2,
                              target1=target1, target2=target2,
@@ -507,13 +524,13 @@ class ParserMetric(MetricBase):
         self.num_arc = 0
         self.num_label = 0
         self.num_sample = 0
-
+    
     def get_metric(self, reset=True):
-        res = {'UAS': self.num_arc*1.0 / self.num_sample, 'LAS': self.num_label*1.0 / self.num_sample}
+        res = {'UAS': self.num_arc * 1.0 / self.num_sample, 'LAS': self.num_label * 1.0 / self.num_sample}
         if reset:
             self.num_sample = self.num_label = self.num_arc = 0
         return res
-
+    
     def evaluate(self, pred1, pred2, target1, target2, seq_len=None):
         """Evaluate the performance of prediction.
         """
@@ -522,7 +539,7 @@ class ParserMetric(MetricBase):
         else:
             seq_mask = seq_len_to_mask(seq_len.long()).long()
         # mask out <root> tag
-        seq_mask[:,0] = 0
+        seq_mask[:, 0] = 0
         head_pred_correct = (pred1 == target1).long() * seq_mask
         label_pred_correct = (pred2 == target2).long() * head_pred_correct
         self.num_arc += head_pred_correct.sum().item()
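`get_metric` reports plain micro-averages over the unmasked, non-root tokens: if 100 tokens are scored, 85 receive the correct head, and 80 of those 85 also the correct label, then UAS = 85/100 = 0.85 and LAS = 80/100 = 0.80; LAS is at most UAS by construction, because `label_pred_correct` is gated by `head_pred_correct`. For completeness, a construction sketch of the parser itself, with every size below arbitrary:

    from fastNLP.models import BiaffineParser

    parser = BiaffineParser(init_embed=(10000, 100),  # (num_embeddings, embedding_dim)
                            pos_vocab_size=50,
                            pos_emb_dim=50,
                            num_label=40,
                            rnn_layers=1,
                            rnn_hidden_size=200,
                            encoder='lstm',
                            use_greedy_infer=False)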
@@ -1,12 +1,13 @@
-# python: 3.6
-# encoding: utf-8
-
 import torch
 import torch.nn as nn
 
-from ..core.const import Const as C 
+from ..core.const import Const as C
 from ..modules import encoder
 
+__all__ = [
+    "CNNText"
+]
+
 
 class CNNText(torch.nn.Module):
     """
@@ -23,7 +24,7 @@ class CNNText(torch.nn.Module):
     :param int padding: 对句子前后的pad的大小, 用0填充。
     :param float dropout: Dropout的大小
     """
-
+    
     def __init__(self, init_embed,
                  num_classes,
                  kernel_nums=(3, 4, 5),
@@ -31,7 +32,7 @@ class CNNText(torch.nn.Module):
                  padding=0,
                  dropout=0.5):
         super(CNNText, self).__init__()
-
+        
         # no support for pre-trained embedding currently
         self.embed = encoder.Embedding(init_embed)
         self.conv_pool = encoder.ConvMaxpool(
@@ -41,7 +42,7 @@ class CNNText(torch.nn.Module):
             padding=padding)
         self.dropout = nn.Dropout(dropout)
         self.fc = nn.Linear(sum(kernel_nums), num_classes)
-
+    
     def forward(self, words, seq_len=None):
         """
@@ -54,7 +55,7 @@ class CNNText(torch.nn.Module):
         x = self.dropout(x)
         x = self.fc(x)  # [N,C] -> [N, N_class]
         return {C.OUTPUT: x}
-
+    
     def predict(self, words, seq_len=None):
         """
         :param torch.LongTensor words: [batch_size, seq_len],句子中word的index
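A minimal end-to-end sketch of the class above (vocabulary size, class count, and inputs are made up):

    import torch
    from fastNLP.core.const import Const as C
    from fastNLP.models import CNNText

    model = CNNText(init_embed=(5000, 128),  # (vocab_size, embedding_dim) tuple form
                    num_classes=3,
                    dropout=0.5)

    words = torch.randint(0, 5000, (16, 30))  # [batch_size, seq_len] word indices
    logits = model(words)[C.OUTPUT]           # [16, 3] class scores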
@@ -5,6 +5,7 @@ import os
 
 import torch
 import torch.nn.functional as F
+
 from . import enas_utils as utils
 from .enas_utils import Node
@@ -1,17 +1,19 @@ | |||||
# Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||||
"""Module containing the shared RNN model.""" | |||||
import numpy as np | |||||
""" | |||||
Module containing the shared RNN model. | |||||
Code Modified from https://github.com/carpedm20/ENAS-pytorch | |||||
""" | |||||
import collections | import collections | ||||
import numpy as np | |||||
import torch | import torch | ||||
from torch import nn | |||||
import torch.nn as nn | |||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from torch.autograd import Variable | from torch.autograd import Variable | ||||
from . import enas_utils as utils | from . import enas_utils as utils | ||||
from .base_model import BaseModel | from .base_model import BaseModel | ||||
def _get_dropped_weights(w_raw, dropout_p, is_training): | def _get_dropped_weights(w_raw, dropout_p, is_training): | ||||
"""Drops out weights to implement DropConnect. | """Drops out weights to implement DropConnect. | ||||
@@ -35,12 +37,13 @@ def _get_dropped_weights(w_raw, dropout_p, is_training): | |||||
The above TODO is the reason for the hacky check for `torch.nn.Parameter`. | The above TODO is the reason for the hacky check for `torch.nn.Parameter`. | ||||
""" | """ | ||||
dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) | dropped_w = F.dropout(w_raw, p=dropout_p, training=is_training) | ||||
if isinstance(dropped_w, torch.nn.Parameter): | if isinstance(dropped_w, torch.nn.Parameter): | ||||
dropped_w = dropped_w.clone() | dropped_w = dropped_w.clone() | ||||
return dropped_w | return dropped_w | ||||

class EmbeddingDropout(torch.nn.Embedding):
    """Class for dropping out embeddings by zeroing out parameters in the
    embedding matrix.
@@ -53,6 +56,7 @@ class EmbeddingDropout(torch.nn.Embedding):
    See 'A Theoretically Grounded Application of Dropout in Recurrent Neural
    Networks', (Gal and Ghahramani, 2016).
    """

    def __init__(self,
                 num_embeddings,
                 embedding_dim,
@@ -83,14 +87,14 @@ class EmbeddingDropout(torch.nn.Embedding):
        assert (dropout >= 0.0) and (dropout < 1.0), ('Dropout must be >= 0.0 '
                                                      'and < 1.0')
        self.scale = scale

    def forward(self, inputs):  # pylint:disable=arguments-differ
        """Embeds `inputs` with the dropped out embedding weight matrix."""
        if self.training:
            dropout = self.dropout
        else:
            dropout = 0

        if dropout:
            mask = self.weight.data.new(self.weight.size(0), 1)
            mask.bernoulli_(1 - dropout)
@@ -101,7 +105,7 @@ class EmbeddingDropout(torch.nn.Embedding):
            masked_weight = self.weight
        if self.scale and self.scale != 1:
            masked_weight = masked_weight * self.scale

        return F.embedding(inputs,
                           masked_weight,
                           max_norm=self.max_norm,
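
A usage sketch for EmbeddingDropout, assuming the remaining constructor arguments keep the `torch.nn.Embedding` defaults: in training mode whole rows of the weight matrix are zeroed, so every occurrence of a dropped word type embeds to the zero vector:

    import torch

    embed = EmbeddingDropout(num_embeddings=1000, embedding_dim=64, dropout=0.1)
    embed.train()
    tokens = torch.randint(0, 1000, (4, 12))   # [batch, seq_len]
    vecs = embed(tokens)                       # roughly 10% of word types map to zeros
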
@@ -114,7 +118,7 @@ class LockedDropout(nn.Module):
    # code from https://github.com/salesforce/awd-lstm-lm/blob/master/locked_dropout.py
    def __init__(self):
        super().__init__()

    def forward(self, x, dropout=0.5):
        if not self.training or not dropout:
            return x
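
The hunk is cut before the mask construction; the idea, from the AWD-LSTM code referenced above, is variational dropout: one Bernoulli mask is sampled per sequence and reused at every time step. A self-contained sketch of that pattern, assuming `x` is [seq_len, batch, hidden]:

    import torch

    def locked_dropout(x, p=0.5):
        # Sample one mask over (batch, hidden) and broadcast it across time.
        mask = x.new_empty(1, x.size(1), x.size(2)).bernoulli_(1 - p) / (1 - p)
        return mask.expand_as(x) * x
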
@@ -126,11 +130,12 @@ class LockedDropout(nn.Module):

class ENASModel(BaseModel):
    """Shared RNN model."""

    def __init__(self, embed_num, num_classes, num_blocks=4, cuda=False, shared_hid=1000, shared_embed=1000):
        super(ENASModel, self).__init__()

        self.use_cuda = cuda

        self.shared_hid = shared_hid
        self.num_blocks = num_blocks
        self.decoder = nn.Linear(self.shared_hid, num_classes)
@@ -139,16 +144,16 @@ class ENASModel(BaseModel):
                                        dropout=0.1)
        self.lockdrop = LockedDropout()
        self.dag = None

        # Tie weights
        # self.decoder.weight = self.encoder.weight

        # Since W^{x, c} and W^{h, c} are always summed, there
        # is no point duplicating their bias offset parameter. Likewise for
        # W^{x, h} and W^{h, h}.
        self.w_xc = nn.Linear(shared_embed, self.shared_hid)
        self.w_xh = nn.Linear(shared_embed, self.shared_hid)

        # The raw weights are stored here because the hidden-to-hidden weights
        # are weight dropped on the forward pass.
        self.w_hc_raw = torch.nn.Parameter(
@@ -157,10 +162,10 @@ class ENASModel(BaseModel):
            torch.Tensor(self.shared_hid, self.shared_hid))
        self.w_hc = None
        self.w_hh = None

        self.w_h = collections.defaultdict(dict)
        self.w_c = collections.defaultdict(dict)

        for idx in range(self.num_blocks):
            for jdx in range(idx + 1, self.num_blocks):
                self.w_h[idx][jdx] = nn.Linear(self.shared_hid,
@@ -169,48 +174,47 @@ class ENASModel(BaseModel):
                self.w_c[idx][jdx] = nn.Linear(self.shared_hid,
                                               self.shared_hid,
                                               bias=False)

        self._w_h = nn.ModuleList([self.w_h[idx][jdx]
                                   for idx in self.w_h
                                   for jdx in self.w_h[idx]])
        self._w_c = nn.ModuleList([self.w_c[idx][jdx]
                                   for idx in self.w_c
                                   for jdx in self.w_c[idx]])

        self.batch_norm = None
        # if args.mode == 'train':
        #     self.batch_norm = nn.BatchNorm1d(self.shared_hid)
        # else:
        #     self.batch_norm = None

        self.reset_parameters()
        self.static_init_hidden = utils.keydefaultdict(self.init_hidden)

    def setDAG(self, dag):
        if self.dag is None:
            self.dag = dag

    def forward(self, word_seq, hidden=None):
        inputs = torch.transpose(word_seq, 0, 1)

        time_steps = inputs.size(0)
        batch_size = inputs.size(1)

        self.w_hh = _get_dropped_weights(self.w_hh_raw,
                                         0.5,
                                         self.training)
        self.w_hc = _get_dropped_weights(self.w_hc_raw,
                                         0.5,
                                         self.training)

        # hidden = self.static_init_hidden[batch_size] if hidden is None else hidden
        hidden = self.static_init_hidden[batch_size]

        embed = self.encoder(inputs)

        embed = self.lockdrop(embed, 0.65 if self.training else 0)

        # The norms of the hidden states are clipped here because
        # otherwise ENAS is especially prone to exploding activations on the
        # forward pass. This could probably be fixed in a more elegant way, but
@@ -226,7 +230,7 @@ class ENASModel(BaseModel):
        for step in range(time_steps):
            x_t = embed[step]
            logit, hidden = self.cell(x_t, hidden, self.dag)

            hidden_norms = hidden.norm(dim=-1)
            max_norm = 25.0
            if hidden_norms.data.max() > max_norm:
@@ -237,60 +241,60 @@ class ENASModel(BaseModel):
                # because the PyTorch slicing and slice assignment is too
                # flaky.
                hidden_norms = hidden_norms.data.cpu().numpy()

                clipped_num += 1
                if hidden_norms.max() > max_clipped_norm:
                    max_clipped_norm = hidden_norms.max()

                clip_select = hidden_norms > max_norm
                clip_norms = hidden_norms[clip_select]

                mask = np.ones(hidden.size())
                normalizer = max_norm / clip_norms
                normalizer = normalizer[:, np.newaxis]

                mask[clip_select] = normalizer

                if self.use_cuda:
                    hidden *= torch.autograd.Variable(
                        torch.FloatTensor(mask).cuda(), requires_grad=False)
                else:
                    hidden *= torch.autograd.Variable(
                        torch.FloatTensor(mask), requires_grad=False)
            logits.append(logit)
            h1tohT.append(hidden)

        h1tohT = torch.stack(h1tohT)
        output = torch.stack(logits)
        raw_output = output

        output = self.lockdrop(output, 0.4 if self.training else 0)

        # Pooling
        output = torch.mean(output, 0)

        decoded = self.decoder(output)

        extra_out = {'dropped': decoded,
                     'hiddens': h1tohT,
                     'raw': raw_output}

        return {'pred': decoded, 'hidden': hidden, 'extra_out': extra_out}

    def cell(self, x, h_prev, dag):
        """Computes a single pass through the discovered RNN cell."""
        c = {}
        h = {}
        f = {}

        f[0] = self.get_f(dag[-1][0].name)
        c[0] = torch.sigmoid(self.w_xc(x) + F.linear(h_prev, self.w_hc, None))
        h[0] = (c[0] * f[0](self.w_xh(x) + F.linear(h_prev, self.w_hh, None)) +
                (1 - c[0]) * h_prev)

        leaf_node_ids = []
        q = collections.deque()
        q.append(0)

        # Computes connections from the parent nodes `node_id`
        # to their child nodes `next_id` recursively, skipping leaf nodes. A
        # leaf node is a node whose id == `self.num_blocks`.
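
The clipping in the forward pass above rescales only the rows whose hidden norm exceeds the cap, leaving the rest untouched. A hedged standalone check of the same rescaling arithmetic:

    import numpy as np
    import torch

    hidden = torch.randn(4, 8) * 30        # some rows will exceed the cap
    max_norm = 25.0
    norms = hidden.norm(dim=-1).numpy()

    mask = np.ones(hidden.size())
    clip_select = norms > max_norm
    mask[clip_select] = (max_norm / norms[clip_select])[:, np.newaxis]
    hidden = hidden * torch.as_tensor(mask, dtype=hidden.dtype)
    assert hidden.norm(dim=-1).max().item() <= max_norm + 1e-4
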
@@ -306,10 +310,10 @@ class ENASModel(BaseModel):
        while True:
            if len(q) == 0:
                break

            node_id = q.popleft()
            nodes = dag[node_id]

            for next_node in nodes:
                next_id = next_node.id
                if next_id == self.num_blocks:
@@ -317,38 +321,38 @@ class ENASModel(BaseModel):
                    assert len(nodes) == 1, ('parent of leaf node should have '
                                             'only one child')
                    continue

                w_h = self.w_h[node_id][next_id]
                w_c = self.w_c[node_id][next_id]

                f[next_id] = self.get_f(next_node.name)
                c[next_id] = torch.sigmoid(w_c(h[node_id]))
                h[next_id] = (c[next_id] * f[next_id](w_h(h[node_id])) +
                              (1 - c[next_id]) * h[node_id])

                q.append(next_id)

        # Instead of averaging loose ends, perhaps there should
        # be a set of separate unshared weights for each "loose" connection
        # between each node in a cell and the output.
        #
        # As it stands, all weights W^h_{ij} are doing double duty by
        # connecting both from i to j, as well as from i to the output.

        # average all the loose ends
        leaf_nodes = [h[node_id] for node_id in leaf_node_ids]
        output = torch.mean(torch.stack(leaf_nodes, 2), -1)

        # stabilizing the updates of omega
        if self.batch_norm is not None:
            output = self.batch_norm(output)

        return output, h[self.num_blocks - 1]
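
For orientation, the `dag` consumed by `cell` maps each parent node id to its child `Node`s, with `-1` pointing at the input node and id == num_blocks marking the leaf sentinel. A hedged sketch of a tiny two-block architecture in that format (the namedtuple mirrors `enas_utils.Node`):

    from collections import namedtuple

    Node = namedtuple('Node', ['id', 'name'])   # mirrors enas_utils.Node

    dag = {
        -1: [Node(0, 'tanh')],   # input feeds block 0
        0: [Node(1, 'relu')],    # block 0 feeds block 1
        1: [Node(2, 'avg')],     # block 1's only child is the leaf (id == num_blocks)
    }
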

    def init_hidden(self, batch_size):
        zeros = torch.zeros(batch_size, self.shared_hid)
        return utils.get_variable(zeros, self.use_cuda, requires_grad=False)

    def get_f(self, name):
        name = name.lower()
        if name == 'relu':
@@ -360,22 +364,21 @@ class ENASModel(BaseModel):
        elif name == 'sigmoid':
            f = torch.sigmoid
        return f

    @property
    def num_parameters(self):
        def size(p):
            return np.prod(p.size())

        return sum([size(param) for param in self.parameters()])

    def reset_parameters(self):
        init_range = 0.025
        # init_range = 0.025 if self.args.mode == 'train' else 0.04
        for param in self.parameters():
            param.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.fill_(0)

    def predict(self, word_seq):
        """
@@ -1,12 +1,12 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch
import numpy as np
import time
import torch
import math
from datetime import datetime, timedelta
from torch.optim import Adam

try:
    from tqdm.auto import tqdm
@@ -21,8 +21,6 @@ from ..core.utils import _move_dict_value_to_device
from . import enas_utils as utils
from ..core.utils import _build_args


def _get_no_grad_ctx_mgr():
    """Returns the `torch.no_grad` context manager for PyTorch version >=
@@ -33,6 +31,7 @@ def _get_no_grad_ctx_mgr():

class ENASTrainer(Trainer):
    """A class to wrap training code."""

    def __init__(self, train_data, model, controller, **kwargs):
        """Constructor for training algorithm.
        :param DataSet train_data: the training data
@@ -45,19 +44,19 @@ class ENASTrainer(Trainer):
        self.controller_step = 0
        self.shared_step = 0
        self.max_length = 35

        self.shared = model
        self.controller = controller

        self.shared_optim = Adam(
            self.shared.parameters(),
            lr=20.0,
            weight_decay=1e-7)

        self.controller_optim = Adam(
            self.controller.parameters(),
            lr=3.5e-4)

    def train(self, load_best_model=True):
        """
        :param bool load_best_model: only effective when dev_data was provided at initialization; if True, the trainer reloads the parameters with the best dev performance before returning
@@ -82,21 +81,22 @@ class ENASTrainer(Trainer):
            self.model = self.model.cuda()
        self._model_device = self.model.parameters().__next__().device
        self._mode(self.model, is_test=False)

        self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
        start_time = time.time()
        print("training epochs started " + self.start_time, flush=True)

        try:
            self.callback_manager.on_train_begin()
            self._train()
            self.callback_manager.on_train_end()
        except (CallbackException, KeyboardInterrupt) as e:
            self.callback_manager.on_exception(e)

        if self.dev_data is not None:
            print(
                "\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
                self.tester._format_eval_results(self.best_dev_perf), )
            results['best_eval'] = self.best_dev_perf
            results['best_epoch'] = self.best_dev_epoch
            results['best_step'] = self.best_dev_step
@@ -110,9 +110,9 @@ class ENASTrainer(Trainer):
        finally:
            pass
        results['seconds'] = round(time.time() - start_time, 2)

        return results

    def _train(self):
        if not self.use_tqdm:
            from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
@@ -126,21 +126,21 @@ class ENASTrainer(Trainer):
            avg_loss = 0
            data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
                                  prefetch=self.prefetch)
            for epoch in range(1, self.n_epochs + 1):
                pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
                last_stage = (epoch > self.n_epochs + 1 - self.final_epochs)
                if epoch == self.n_epochs + 1 - self.final_epochs:
                    print('Entering the final stage. (Only train the selected structure)')
                # early stopping
                self.callback_manager.on_epoch_begin()

                # 1. Training the shared parameters omega of the child models
                self.train_shared(pbar)

                # 2. Training the controller parameters theta
                if not last_stage:
                    self.train_controller()

                if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
                    (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
                        and self.dev_data is not None:
@@ -149,16 +149,15 @@ class ENASTrainer(Trainer):
                    eval_res = self._do_validation(epoch=epoch, step=self.step)
                    eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
                                                                                total_steps) + \
                               self.tester._format_eval_results(eval_res)
                    pbar.write(eval_str)

                # lr decay; early stopping
                self.callback_manager.on_epoch_end()
            # =============== epochs end =================== #
            pbar.close()
        # ============ tqdm end ============== #

    def get_loss(self, inputs, targets, hidden, dags):
        """Computes the loss for the same batch for M models.
@@ -167,7 +166,7 @@ class ENASTrainer(Trainer):
        """
        if not isinstance(dags, list):
            dags = [dags]

        loss = 0
        for dag in dags:
            self.shared.setDAG(dag)
@@ -175,14 +174,14 @@ class ENASTrainer(Trainer):
            inputs['hidden'] = hidden
            result = self.shared(**inputs)
            output, hidden, extra_out = result['pred'], result['hidden'], result['extra_out']

            self.callback_manager.on_loss_begin(targets, result)
            sample_loss = self._compute_loss(result, targets)
            loss += sample_loss

        assert len(dags) == 1, 'there are multiple `hidden` for multiple `dags`'
        return loss, hidden, extra_out

    def train_shared(self, pbar=None, max_step=None, dag=None):
        """Train the language model for 400 steps of minibatches of 64
        examples.
@@ -200,9 +199,9 @@ class ENASTrainer(Trainer):
        model = self.shared
        model.train()
        self.controller.eval()

        hidden = self.shared.init_hidden(self.batch_size)

        abs_max_grad = 0
        abs_max_hidden_norm = 0
        step = 0
@@ -211,15 +210,15 @@ class ENASTrainer(Trainer):
        train_idx = 0
        avg_loss = 0
        data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
                              prefetch=self.prefetch)

        for batch_x, batch_y in data_iterator:
            _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
            indices = data_iterator.get_batch_indices()
            # negative sampling; replace unknown; re-weight batch_y
            self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
            # prediction = self._data_forward(self.model, batch_x)

            dags = self.controller.sample(1)
            inputs, targets = batch_x, batch_y
            # self.callback_manager.on_loss_begin(batch_y, prediction)
@@ -228,18 +227,18 @@ class ENASTrainer(Trainer):
                                                      hidden,
                                                      dags)
            hidden.detach_()

            avg_loss += loss.item()

            # Is loss NaN or inf? requires_grad = False
            self.callback_manager.on_backward_begin(loss)
            self._grad_backward(loss)
            self.callback_manager.on_backward_end()

            self._update()
            self.callback_manager.on_step_end()

            if (self.step + 1) % self.print_every == 0:
                if self.use_tqdm:
                    print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
                    pbar.update(self.print_every)
@@ -255,30 +254,29 @@ class ENASTrainer(Trainer):
            self.shared_step += 1
            self.callback_manager.on_batch_end()
        # ================= mini-batch end ==================== #

    def get_reward(self, dag, entropies, hidden, valid_idx=0):
        """Computes the perplexity of a single sampled model on a minibatch of
        validation data.
        """
        if not isinstance(entropies, np.ndarray):
            entropies = entropies.data.cpu().numpy()

        data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
                              prefetch=self.prefetch)

        for inputs, targets in data_iterator:
            valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag)
            valid_loss = utils.to_item(valid_loss.data)

        valid_ppl = math.exp(valid_loss)

        R = 80 / valid_ppl

        rewards = R + 1e-4 * entropies

        return rewards, hidden
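
So the reward is the inverse validation perplexity scaled into roughly [0, 80], plus a tiny entropy bonus that keeps the controller exploring. A worked example with made-up numbers:

    import math
    import numpy as np

    valid_loss = 1.5                    # per-token cross-entropy on the dev batch
    valid_ppl = math.exp(valid_loss)    # ~4.48
    R = 80 / valid_ppl                  # ~17.85

    entropies = np.array([1.2, 0.9])    # controller decision entropies
    rewards = R + 1e-4 * entropies      # both ~17.85, nudged by the entropy bonus
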

    def train_controller(self):
        """Fixes the shared parameters and updates the controller parameters.
@@ -296,13 +294,13 @@ class ENASTrainer(Trainer):
        # Why can't we call shared.eval() here? Leads to loss
        # being uniformly zero for the controller.
        # self.shared.eval()

        avg_reward_base = None
        baseline = None
        adv_history = []
        entropy_history = []
        reward_history = []

        hidden = self.shared.init_hidden(self.batch_size)
        total_loss = 0
        valid_idx = 0
@@ -310,7 +308,7 @@ class ENASTrainer(Trainer):
            # sample models
            dags, log_probs, entropies = self.controller.sample(
                with_details=True)

            # calculate reward
            np_entropies = entropies.data.cpu().numpy()
            # No gradients should be backpropagated to the
@@ -320,40 +318,39 @@ class ENASTrainer(Trainer):
                                              np_entropies,
                                              hidden,
                                              valid_idx)

            reward_history.extend(rewards)
            entropy_history.extend(np_entropies)

            # moving average baseline
            if baseline is None:
                baseline = rewards
            else:
                decay = 0.95
                baseline = decay * baseline + (1 - decay) * rewards

            adv = rewards - baseline
            adv_history.extend(adv)

            # policy loss
            loss = -log_probs * utils.get_variable(adv,
                                                   'cuda' in self.device,
                                                   requires_grad=False)

            loss = loss.sum()  # or loss.mean()

            # update
            self.controller_optim.zero_grad()
            loss.backward()
            self.controller_optim.step()

            total_loss += utils.to_item(loss.data)

            if ((step % 50) == 0) and (step > 0):
                reward_history, adv_history, entropy_history = [], [], []
                total_loss = 0

            self.controller_step += 1
            # prev_valid_idx = valid_idx
            # valid_idx = ((valid_idx + self.max_length) %
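
The update above is plain REINFORCE with an exponential-moving-average baseline; subtracting the baseline reduces gradient variance without biasing the estimator. A hedged standalone sketch:

    import torch

    log_probs = torch.tensor([-1.2, -0.7], requires_grad=True)  # log-probs of sampled decisions
    rewards = torch.tensor([17.9, 18.2])

    baseline = 17.8                   # EMA of past rewards: b = 0.95 * b + 0.05 * R
    adv = rewards - baseline          # centered advantage
    loss = (-log_probs * adv).sum()   # REINFORCE surrogate loss
    loss.backward()                   # gradient wrt log_probs is -adv
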
@@ -362,16 +359,16 @@ class ENASTrainer(Trainer):
            # # validation data, we reset the hidden states.
            # if prev_valid_idx > valid_idx:
            #     hidden = self.shared.init_hidden(self.batch_size)

    def derive(self, sample_num=10, valid_idx=0):
        """We are always deriving based on the very first batch
        of validation data? This seems wrong...
        """
        hidden = self.shared.init_hidden(self.batch_size)

        dags, _, entropies = self.controller.sample(sample_num,
                                                    with_details=True)

        max_R = 0
        best_dag = None
        for dag in dags:
@@ -379,5 +376,5 @@ class ENASTrainer(Trainer):
            if R.max() > max_R:
                max_R = R.max()
                best_dag = dag

        self.model.setDAG(best_dag)
@@ -1,12 +1,10 @@
# Code Modified from https://github.com/carpedm20/ENAS-pytorch
from __future__ import print_function

import collections
import numpy as np
import torch
from torch.autograd import Variable
@@ -1,11 +1,19 @@
"""
This module implements two sequence labeling models.
"""
import torch
import torch.nn as nn

from .base_model import BaseModel
from ..modules import decoder, encoder
from ..modules.decoder.CRF import allowed_transitions
from ..core.utils import seq_len_to_mask
from ..core.const import Const as C

__all__ = [
    "SeqLabeling",
    "AdvSeqLabel"
]


class SeqLabeling(BaseModel):
@@ -8,6 +8,9 @@ from ..modules import encoder as Encoder
from ..modules import aggregator as Aggregator
from ..core.utils import seq_len_to_mask

__all__ = [
    "ESIM"
]

my_inf = 10e12
@@ -26,7 +29,7 @@ class ESIM(BaseModel):
    :param int num_classes: number of labels; default 3
    :param numpy.array init_embedding: initial word-embedding matrix of shape (vocab_size, embed_dim); defaults to None, i.e. the embedding matrix is randomly initialized
    """

    def __init__(self, vocab_size, embed_dim, hidden_size, dropout=0.0, num_classes=3, init_embedding=None):
        super(ESIM, self).__init__()
@@ -35,35 +38,36 @@ class ESIM(BaseModel):
        self.hidden_size = hidden_size
        self.dropout = dropout
        self.n_labels = num_classes

        self.drop = nn.Dropout(self.dropout)

        self.embedding = Encoder.Embedding(
            (self.vocab_size, self.embed_dim), dropout=self.dropout,
        )

        self.embedding_layer = nn.Linear(self.embed_dim, self.hidden_size)

        self.encoder = Encoder.LSTM(
            input_size=self.embed_dim, hidden_size=self.hidden_size, num_layers=1, bias=True,
            batch_first=True, bidirectional=True
        )

        self.bi_attention = Aggregator.BiAttention()
        self.mean_pooling = Aggregator.AvgPoolWithMask()
        self.max_pooling = Aggregator.MaxPoolWithMask()

        self.inference_layer = nn.Linear(self.hidden_size * 4, self.hidden_size)

        self.decoder = Encoder.LSTM(
            input_size=self.hidden_size, hidden_size=self.hidden_size, num_layers=1, bias=True,
            batch_first=True, bidirectional=True
        )

        self.output = Decoder.MLP([4 * self.hidden_size, self.hidden_size, self.n_labels], 'tanh', dropout=self.dropout)

    def forward(self, words1, words2, seq_len1=None, seq_len2=None, target=None):
        """ Forward function

        :param torch.Tensor words1: [batch size(B), premise seq len(PL)] token representation of the premise
        :param torch.Tensor words2: [B, hypothesis seq len(HL)] token representation of the hypothesis
        :param torch.LongTensor seq_len1: [B] lengths of the premises
@@ -71,10 +75,10 @@ class ESIM(BaseModel):
        :param torch.LongTensor target: [B] ground-truth target labels
        :return: dict prediction: [B, n_labels(N)] prediction results
        """
        premise0 = self.embedding_layer(self.embedding(words1))
        hypothesis0 = self.embedding_layer(self.embedding(words2))

        if seq_len1 is not None:
            seq_len1 = seq_len_to_mask(seq_len1)
        else:
@@ -85,55 +89,55 @@ class ESIM(BaseModel):
        else:
            seq_len2 = torch.ones(hypothesis0.size(0), hypothesis0.size(1))
            seq_len2 = (seq_len2.long()).to(device=hypothesis0.device)

        _BP, _PSL, _HP = premise0.size()
        _BH, _HSL, _HH = hypothesis0.size()
        _BPL, _PLL = seq_len1.size()
        _HPL, _HLL = seq_len2.size()

        assert _BP == _BH and _BPL == _HPL and _BP == _BPL
        assert _HP == _HH
        assert _PSL == _PLL and _HSL == _HLL

        B, PL, H = premise0.size()
        B, HL, H = hypothesis0.size()

        a0 = self.encoder(self.drop(premise0))  # a0: [B, PL, H * 2]
        b0 = self.encoder(self.drop(hypothesis0))  # b0: [B, HL, H * 2]

        a = torch.mean(a0.view(B, PL, -1, H), dim=2)  # a: [B, PL, H]
        b = torch.mean(b0.view(B, HL, -1, H), dim=2)  # b: [B, HL, H]

        ai, bi = self.bi_attention(a, b, seq_len1, seq_len2)

        ma = torch.cat((a, ai, a - ai, a * ai), dim=2)  # ma: [B, PL, 4 * H]
        mb = torch.cat((b, bi, b - bi, b * bi), dim=2)  # mb: [B, HL, 4 * H]

        f_ma = self.inference_layer(ma)
        f_mb = self.inference_layer(mb)

        vat = self.decoder(self.drop(f_ma))
        vbt = self.decoder(self.drop(f_mb))

        va = torch.mean(vat.view(B, PL, -1, H), dim=2)  # va: [B, PL, H]
        vb = torch.mean(vbt.view(B, HL, -1, H), dim=2)  # vb: [B, HL, H]

        va_ave = self.mean_pooling(va, seq_len1, dim=1)  # va_ave: [B, H]
        va_max, va_arg_max = self.max_pooling(va, seq_len1, dim=1)  # va_max: [B, H]
        vb_ave = self.mean_pooling(vb, seq_len2, dim=1)  # vb_ave: [B, H]
        vb_max, vb_arg_max = self.max_pooling(vb, seq_len2, dim=1)  # vb_max: [B, H]

        v = torch.cat((va_ave, va_max, vb_ave, vb_max), dim=1)  # v: [B, 4 * H]
        prediction = torch.tanh(self.output(v))  # prediction: [B, N]

        if target is not None:
            func = nn.CrossEntropyLoss()
            loss = func(prediction, target)
            return {Const.OUTPUT: prediction, Const.LOSS: loss}

        return {Const.OUTPUT: prediction}
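
The concatenation above is the "enhancement" step at the heart of ESIM: each token is paired with its attention-aligned counterpart plus their difference and elementwise product. A toy shape check:

    import torch

    B, L, H = 2, 5, 8
    a = torch.randn(B, L, H)    # encoded premise
    ai = torch.randn(B, L, H)   # stand-in for the aligned hypothesis

    ma = torch.cat((a, ai, a - ai, a * ai), dim=2)  # similarity/contradiction cues
    print(ma.shape)  # torch.Size([2, 5, 32]) == [B, L, 4 * H]
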

    def predict(self, words1, words2, seq_len1=None, seq_len2=None, target=None):
        """ Predict function
@@ -146,4 +150,3 @@ class ESIM(BaseModel):
        """
        prediction = self.forward(words1, words2, seq_len1, seq_len2)[Const.OUTPUT]
        return {Const.OUTPUT: torch.argmax(prediction, dim=-1)}
@@ -1,17 +1,25 @@
"""
A PyTorch implementation of Star-Transformer.
"""
import torch
from torch import nn

from ..modules.encoder.star_transformer import StarTransformer
from ..core.utils import seq_len_to_mask
from ..modules.utils import get_embeddings
from ..core.const import Const

__all__ = [
    "StarTransEnc",
    "STNLICls",
    "STSeqCls",
    "STSeqLabel",
]


class StarTransEnc(nn.Module):
    """
    Alias: :class:`fastNLP.models.StarTransEnc` :class:`fastNLP.models.star_transformer.StarTransEnc`

    Star-Transformer encoder with word embedding.

@@ -28,6 +36,7 @@ class StarTransEnc(nn.Module):
    :param emb_dropout: dropout probability for the word embedding.
    :param dropout: dropout probability for the model, excluding the word embedding.
    """

    def __init__(self, init_embed,
                 hidden_size,
                 num_layers,
@@ -47,7 +56,7 @@ class StarTransEnc(nn.Module):
                                        head_dim=head_dim,
                                        dropout=dropout,
                                        max_len=max_len)

    def forward(self, x, mask):
        """
        :param FloatTensor x: [batch, length, hidden] the input sequence
@@ -72,7 +81,7 @@ class _Cls(nn.Module):
            nn.Dropout(dropout),
            nn.Linear(hid_dim, num_cls),
        )

    def forward(self, x):
        h = self.fc(x)
        return h
@@ -83,20 +92,21 @@ class _NLICls(nn.Module):
        super(_NLICls, self).__init__()
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(in_dim * 4, hid_dim),  # 4
            nn.LeakyReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim, num_cls),
        )

    def forward(self, x1, x2):
        x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1)
        h = self.fc(x)
        return h
""" | """ | ||||
别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.start_transformer.STSeqLabel` | |||||
别名::class:`fastNLP.models.STSeqLabel` :class:`fastNLP.models.star_transformer.STSeqLabel` | |||||
用于序列标注的Star-Transformer模型 | 用于序列标注的Star-Transformer模型 | ||||
@@ -112,6 +122,7 @@ class STSeqLabel(nn.Module): | |||||
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | ||||
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | ||||
""" | """ | ||||
def __init__(self, init_embed, num_cls, | def __init__(self, init_embed, num_cls, | ||||
hidden_size=300, | hidden_size=300, | ||||
num_layers=4, | num_layers=4, | ||||
@@ -120,7 +131,7 @@ class STSeqLabel(nn.Module): | |||||
max_len=512, | max_len=512, | ||||
cls_hidden_size=600, | cls_hidden_size=600, | ||||
emb_dropout=0.1, | emb_dropout=0.1, | ||||
dropout=0.1,): | |||||
dropout=0.1, ): | |||||
super(STSeqLabel, self).__init__() | super(STSeqLabel, self).__init__() | ||||
self.enc = StarTransEnc(init_embed=init_embed, | self.enc = StarTransEnc(init_embed=init_embed, | ||||
hidden_size=hidden_size, | hidden_size=hidden_size, | ||||
@@ -131,7 +142,7 @@ class STSeqLabel(nn.Module): | |||||
emb_dropout=emb_dropout, | emb_dropout=emb_dropout, | ||||
dropout=dropout) | dropout=dropout) | ||||
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | ||||
def forward(self, words, seq_len): | def forward(self, words, seq_len): | ||||
""" | """ | ||||
@@ -142,9 +153,9 @@ class STSeqLabel(nn.Module): | |||||
mask = seq_len_to_mask(seq_len) | mask = seq_len_to_mask(seq_len) | ||||
nodes, _ = self.enc(words, mask) | nodes, _ = self.enc(words, mask) | ||||
output = self.cls(nodes) | output = self.cls(nodes) | ||||
output = output.transpose(1,2) # make hidden to be dim 1 | |||||
return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | |||||
output = output.transpose(1, 2) # make hidden to be dim 1 | |||||
return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | |||||
def predict(self, words, seq_len): | def predict(self, words, seq_len): | ||||
""" | """ | ||||
@@ -159,7 +170,7 @@ class STSeqLabel(nn.Module):

class STSeqCls(nn.Module):
    """
    Alias: :class:`fastNLP.models.STSeqCls` :class:`fastNLP.models.star_transformer.STSeqCls`

    Star-Transformer for classification tasks.

@@ -175,7 +186,7 @@ class STSeqCls(nn.Module):
    :param emb_dropout: dropout probability for the word embedding. Default: 0.1
    :param dropout: dropout probability for the model, excluding the word embedding. Default: 0.1
    """

    def __init__(self, init_embed, num_cls,
                 hidden_size=300,
                 num_layers=4,
@@ -184,7 +195,7 @@ class STSeqCls(nn.Module):
                 max_len=512,
                 cls_hidden_size=600,
                 emb_dropout=0.1,
                 dropout=0.1, ):
        super(STSeqCls, self).__init__()
        self.enc = StarTransEnc(init_embed=init_embed,
                                hidden_size=hidden_size,
@@ -195,7 +206,7 @@ class STSeqCls(nn.Module):
                                emb_dropout=emb_dropout,
                                dropout=dropout)
        self.cls = _Cls(hidden_size, num_cls, cls_hidden_size)

    def forward(self, words, seq_len):
        """
@@ -206,9 +217,9 @@ class STSeqCls(nn.Module):
        mask = seq_len_to_mask(seq_len)
        nodes, relay = self.enc(words, mask)
        y = 0.5 * (relay + nodes.max(1)[0])
        output = self.cls(y)  # [bsz, n_cls]
        return {Const.OUTPUT: output}
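
The sentence vector mixes the global relay state with a max-pool over the satellite (token) nodes, so the classifier sees both global and salient local features. A shape-level illustration with invented sizes:

    import torch

    bsz, seq_len, hidden = 2, 7, 300
    nodes = torch.randn(bsz, seq_len, hidden)   # satellite (token) states
    relay = torch.randn(bsz, hidden)            # global relay state

    y = 0.5 * (relay + nodes.max(1)[0])         # [bsz, hidden]
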

    def predict(self, words, seq_len):
        """
@@ -223,7 +234,7 @@ class STSeqCls(nn.Module):

class STNLICls(nn.Module):
    """
    Alias: :class:`fastNLP.models.STNLICls` :class:`fastNLP.models.star_transformer.STNLICls`

    Star-Transformer for natural language inference (NLI).

@@ -239,7 +250,7 @@ class STNLICls(nn.Module):
    :param emb_dropout: dropout probability for the word embedding. Default: 0.1
    :param dropout: dropout probability for the model, excluding the word embedding. Default: 0.1
    """

    def __init__(self, init_embed, num_cls,
                 hidden_size=300,
                 num_layers=4,
@@ -248,7 +259,7 @@ class STNLICls(nn.Module):
                 max_len=512,
                 cls_hidden_size=600,
                 emb_dropout=0.1,
                 dropout=0.1, ):
        super(STNLICls, self).__init__()
        self.enc = StarTransEnc(init_embed=init_embed,
                                hidden_size=hidden_size,
@@ -259,7 +270,7 @@ class STNLICls(nn.Module):
                                emb_dropout=emb_dropout,
                                dropout=dropout)
        self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size)

    def forward(self, words1, words2, seq_len1, seq_len2):
        """
@@ -271,14 +282,16 @@ class STNLICls(nn.Module):
        """
        mask1 = seq_len_to_mask(seq_len1)
        mask2 = seq_len_to_mask(seq_len2)

        def enc(seq, mask):
            nodes, relay = self.enc(seq, mask)
            return 0.5 * (relay + nodes.max(1)[0])

        y1 = enc(words1, mask1)
        y2 = enc(words2, mask2)
        output = self.cls(y1, y2)  # [bsz, n_cls]
        return {Const.OUTPUT: output}
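
Both sentences share the encoder, and `_NLICls` consumes the standard matching features [y1; y2; |y1 - y2|; y1 * y2], which is why its first linear layer takes `in_dim * 4`. A quick shape check:

    import torch

    y1, y2 = torch.randn(2, 300), torch.randn(2, 300)
    x = torch.cat([y1, y2, torch.abs(y1 - y2), y1 * y2], 1)
    print(x.shape)  # torch.Size([2, 1200]) -> nn.Linear(in_dim * 4, hid_dim)
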

    def predict(self, words1, words2, seq_len1, seq_len2):
        """