@@ -2,6 +2,6 @@ fastNLP.core.batch | |||||
================== | ================== | ||||
.. automodule:: fastNLP.core.batch | .. automodule:: fastNLP.core.batch | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.callback | |||||
===================== | ===================== | ||||
.. automodule:: fastNLP.core.callback | .. automodule:: fastNLP.core.callback | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.const | |||||
================== | ================== | ||||
.. automodule:: fastNLP.core.const | .. automodule:: fastNLP.core.const | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.dataset | |||||
==================== | ==================== | ||||
.. automodule:: fastNLP.core.dataset | .. automodule:: fastNLP.core.dataset | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.field | |||||
================== | ================== | ||||
.. automodule:: fastNLP.core.field | .. automodule:: fastNLP.core.field | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.instance | |||||
===================== | ===================== | ||||
.. automodule:: fastNLP.core.instance | .. automodule:: fastNLP.core.instance | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.losses | |||||
=================== | =================== | ||||
.. automodule:: fastNLP.core.losses | .. automodule:: fastNLP.core.losses | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.metrics | |||||
==================== | ==================== | ||||
.. automodule:: fastNLP.core.metrics | .. automodule:: fastNLP.core.metrics | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.optimizer | |||||
====================== | ====================== | ||||
.. automodule:: fastNLP.core.optimizer | .. automodule:: fastNLP.core.optimizer | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,15 +2,15 @@ fastNLP.core | |||||
============ | ============ | ||||
.. automodule:: fastNLP.core | .. automodule:: fastNLP.core | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | 子模块 | ||||
---------- | ---------- | ||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | |||||
:maxdepth: 1 | |||||
fastNLP.core.batch | fastNLP.core.batch | ||||
fastNLP.core.callback | fastNLP.core.callback | ||||
@@ -26,4 +26,3 @@ fastNLP.core | |||||
fastNLP.core.trainer | fastNLP.core.trainer | ||||
fastNLP.core.utils | fastNLP.core.utils | ||||
fastNLP.core.vocabulary | fastNLP.core.vocabulary | ||||
@@ -2,6 +2,6 @@ fastNLP.core.sampler | |||||
==================== | ==================== | ||||
.. automodule:: fastNLP.core.sampler | .. automodule:: fastNLP.core.sampler | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.tester | |||||
=================== | =================== | ||||
.. automodule:: fastNLP.core.tester | .. automodule:: fastNLP.core.tester | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.trainer | |||||
==================== | ==================== | ||||
.. automodule:: fastNLP.core.trainer | .. automodule:: fastNLP.core.trainer | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.utils | |||||
================== | ================== | ||||
.. automodule:: fastNLP.core.utils | .. automodule:: fastNLP.core.utils | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.core.vocabulary | |||||
======================= | ======================= | ||||
.. automodule:: fastNLP.core.vocabulary | .. automodule:: fastNLP.core.vocabulary | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.bert\_embedding | |||||
================================== | |||||
.. automodule:: fastNLP.embeddings.bert_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.char\_embedding | |||||
================================== | |||||
.. automodule:: fastNLP.embeddings.char_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.contextual\_embedding | |||||
======================================== | |||||
.. automodule:: fastNLP.embeddings.contextual_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.elmo\_embedding | |||||
================================== | |||||
.. automodule:: fastNLP.embeddings.elmo_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.embedding | |||||
============================ | |||||
.. automodule:: fastNLP.embeddings.embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,22 @@ | |||||
fastNLP.embeddings | |||||
================== | |||||
.. automodule:: fastNLP.embeddings | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | |||||
---------- | |||||
.. toctree:: | |||||
:maxdepth: 1 | |||||
fastNLP.embeddings.bert_embedding | |||||
fastNLP.embeddings.char_embedding | |||||
fastNLP.embeddings.contextual_embedding | |||||
fastNLP.embeddings.elmo_embedding | |||||
fastNLP.embeddings.embedding | |||||
fastNLP.embeddings.stack_embedding | |||||
fastNLP.embeddings.static_embedding | |||||
fastNLP.embeddings.utils |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.stack\_embedding | |||||
=================================== | |||||
.. automodule:: fastNLP.embeddings.stack_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.static\_embedding | |||||
==================================== | |||||
.. automodule:: fastNLP.embeddings.static_embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -0,0 +1,7 @@ | |||||
fastNLP.embeddings.utils | |||||
======================== | |||||
.. automodule:: fastNLP.embeddings.utils | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.io.base\_loader | |||||
======================= | ======================= | ||||
.. automodule:: fastNLP.io.base_loader | .. automodule:: fastNLP.io.base_loader | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.io.data\_loader | |||||
========================== | ========================== | ||||
.. automodule:: fastNLP.io.data_loader | .. automodule:: fastNLP.io.data_loader | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.io.dataset\_loader | |||||
========================== | ========================== | ||||
.. automodule:: fastNLP.io.dataset_loader | .. automodule:: fastNLP.io.dataset_loader | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.io.embed\_loader | |||||
======================== | ======================== | ||||
.. automodule:: fastNLP.io.embed_loader | .. automodule:: fastNLP.io.embed_loader | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.io.model\_io | |||||
==================== | ==================== | ||||
.. automodule:: fastNLP.io.model_io | .. automodule:: fastNLP.io.model_io | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,19 +2,18 @@ fastNLP.io | |||||
========== | ========== | ||||
.. automodule:: fastNLP.io | .. automodule:: fastNLP.io | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | 子模块 | ||||
---------- | ---------- | ||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | |||||
:maxdepth: 1 | |||||
fastNLP.io.data_loader | |||||
fastNLP.io.base_loader | fastNLP.io.base_loader | ||||
fastNLP.io.dataset_loader | |||||
fastNLP.io.embed_loader | fastNLP.io.embed_loader | ||||
fastNLP.io.dataset_loader | |||||
fastNLP.io.data_loader | |||||
fastNLP.io.model_io | fastNLP.io.model_io | ||||
@@ -2,6 +2,6 @@ fastNLP.models.biaffine\_parser | |||||
=============================== | =============================== | ||||
.. automodule:: fastNLP.models.biaffine_parser | .. automodule:: fastNLP.models.biaffine_parser | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.models.cnn\_text\_classification | |||||
======================================== | ======================================== | ||||
.. automodule:: fastNLP.models.cnn_text_classification | .. automodule:: fastNLP.models.cnn_text_classification | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,19 +2,18 @@ fastNLP.models | |||||
============== | ============== | ||||
.. automodule:: fastNLP.models | .. automodule:: fastNLP.models | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | 子模块 | ||||
---------- | ---------- | ||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | |||||
:maxdepth: 1 | |||||
fastNLP.models.biaffine_parser | fastNLP.models.biaffine_parser | ||||
fastNLP.models.cnn_text_classification | fastNLP.models.cnn_text_classification | ||||
fastNLP.models.sequence_labeling | fastNLP.models.sequence_labeling | ||||
fastNLP.models.snli | fastNLP.models.snli | ||||
fastNLP.models.star_transformer | fastNLP.models.star_transformer | ||||
@@ -2,6 +2,6 @@ fastNLP.models.sequence\_labeling | |||||
================================= | ================================= | ||||
.. automodule:: fastNLP.models.sequence_labeling | .. automodule:: fastNLP.models.sequence_labeling | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.models.snli | |||||
=================== | =================== | ||||
.. automodule:: fastNLP.models.snli | .. automodule:: fastNLP.models.snli | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,6 +2,6 @@ fastNLP.models.star\_transformer | |||||
================================ | ================================ | ||||
.. automodule:: fastNLP.models.star_transformer | .. automodule:: fastNLP.models.star_transformer | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.decoder.CRF | |||||
=========================== | |||||
.. automodule:: fastNLP.modules.decoder.crf | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.decoder.MLP | |||||
=========================== | |||||
.. automodule:: fastNLP.modules.decoder.mlp | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,17 +2,7 @@ fastNLP.modules.decoder | |||||
======================= | ======================= | ||||
.. automodule:: fastNLP.modules.decoder | .. automodule:: fastNLP.modules.decoder | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | |||||
---------- | |||||
.. toctree:: | |||||
:titlesonly: | |||||
fastNLP.modules.decoder.crf | |||||
fastNLP.modules.decoder.mlp | |||||
fastNLP.modules.decoder.utils | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.decoder.utils | |||||
============================= | |||||
.. automodule:: fastNLP.modules.decoder.utils | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.bert | |||||
============================ | |||||
.. automodule:: fastNLP.modules.encoder.bert | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.char\_encoder | |||||
===================================== | |||||
.. automodule:: fastNLP.modules.encoder.char_encoder | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.conv\_maxpool | |||||
===================================== | |||||
.. automodule:: fastNLP.modules.encoder.conv_maxpool | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.embedding | |||||
================================= | |||||
.. automodule:: fastNLP.modules.encoder.embedding | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.lstm | |||||
============================ | |||||
.. automodule:: fastNLP.modules.encoder.lstm | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,22 +2,6 @@ fastNLP.modules.encoder | |||||
======================= | ======================= | ||||
.. automodule:: fastNLP.modules.encoder | .. automodule:: fastNLP.modules.encoder | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | |||||
---------- | |||||
.. toctree:: | |||||
:titlesonly: | |||||
fastNLP.modules.encoder.bert | |||||
fastNLP.modules.encoder.char_encoder | |||||
fastNLP.modules.encoder.conv_maxpool | |||||
fastNLP.modules.encoder.embedding | |||||
fastNLP.modules.encoder.lstm | |||||
fastNLP.modules.encoder.star_transformer | |||||
fastNLP.modules.encoder.transformer | |||||
fastNLP.modules.encoder.variational_rnn | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.star\_transformer | |||||
========================================= | |||||
.. automodule:: fastNLP.modules.encoder.star_transformer | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.transformer | |||||
=================================== | |||||
.. automodule:: fastNLP.modules.encoder.transformer | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -1,7 +0,0 @@ | |||||
fastNLP.modules.encoder.variational\_rnn | |||||
======================================== | |||||
.. automodule:: fastNLP.modules.encoder.variational_rnn | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: |
@@ -2,15 +2,16 @@ fastNLP.modules | |||||
=============== | =============== | ||||
.. automodule:: fastNLP.modules | .. automodule:: fastNLP.modules | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
子模块 | 子模块 | ||||
----------- | ----------- | ||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | |||||
:titlesonly: | |||||
:maxdepth: 1 | |||||
fastNLP.modules.decoder | |||||
fastNLP.modules.encoder | |||||
fastNLP.modules.decoder | |||||
fastNLP.modules.encoder |
@@ -2,19 +2,18 @@ API 文档 | |||||
=============== | =============== | ||||
.. automodule:: fastNLP | .. automodule:: fastNLP | ||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
:members: | |||||
:undoc-members: | |||||
:show-inheritance: | |||||
内部模块 | 内部模块 | ||||
----------- | ----------- | ||||
.. toctree:: | .. toctree:: | ||||
:titlesonly: | |||||
:maxdepth: 3 | |||||
fastNLP.core | |||||
fastNLP.io | |||||
fastNLP.modules | |||||
fastNLP.models | |||||
:maxdepth: 1 | |||||
fastNLP.core | |||||
fastNLP.embeddings | |||||
fastNLP.io | |||||
fastNLP.models | |||||
fastNLP.modules |
@@ -1,60 +1,28 @@ | |||||
fastNLP 中文文档 | fastNLP 中文文档 | ||||
===================== | ===================== | ||||
fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个命名实体识别(NER)、中文分词或文本分类任务; | |||||
也可以使用他构建许多复杂的网络模型,进行科研。它具有如下的特性: | |||||
`fastNLP <https://github.com/fastnlp/fastNLP/>`_ 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个序列标注 | |||||
(NER、POS-Tagging等)、中文分词、文本分类、Matching、指代消解、摘要等任务 | |||||
(详见 `reproduction <https://github.com/fastnlp/fastNLP/tree/master/reproduction>`_ ); | |||||
也可以使用它构建许多复杂的网络模型,进行科研。它具有如下的特性: | |||||
- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码。 | |||||
- 各种方便的NLP工具,例如预处理embedding加载; 中间数据cache等; | |||||
- 详尽的中文文档以供查阅; | |||||
- 提供诸多高级模块,例如Variational LSTM, Transformer, CRF等; | |||||
- 封装CNNText,Biaffine等模型可供直接使用; | |||||
- 便捷且具有扩展性的训练器; 提供多种内置callback函数,方便实验记录、异常捕获等。 | |||||
- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的 :mod:`~fastNLP.io.data_loader` ,省去预处理代码; | |||||
- 多种训练、测试组件,例如训练器 :class:`~fastNLP.Trainer` ;测试器 :class:`~fastNLP.Tester` ;以及各种评测 :mod:`~fastNLP.core.metrics` 等等; | |||||
- 各种方便的NLP工具,例如预处理 :mod:`embedding<fastNLP.embeddings>` 加载(包括ELMo和BERT); 中间数据存储 :func:`cache <fastNLP.cache_results>` 等; | |||||
- 提供诸多高级模块 :mod:`~fastNLP.modules`,例如 :class:`~fastNLP.modules.VarLSTM` , :class:`Transformer<fastNLP.modules.TransformerEncoder>` , :class:`CRF<fastNLP.modules.ConditionalRandomField>` 等; | |||||
- 在序列标注、中文分词、文本分类、Matching、指代消解、摘要等任务上封装了各种 :mod:`~fastNLP.models` 可供直接使用; | |||||
- 训练器便捷且具有扩展性,提供多种内置 :mod:`~fastNLP.core.callback` 函数,方便实验记录、异常捕获等。 | |||||
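The feature list above names the Trainer, Tester, metric and callback components. The sketch below (an editorial illustration, not part of the diffed files) shows one way these pieces could fit together in fastNLP 0.4.x; the toy sentences, labels, field names and hyper-parameters are assumptions made for this example.

.. code-block:: python

    # Minimal, hypothetical end-to-end sketch for fastNLP 0.4.x.
    # The toy data and hyper-parameters are invented for illustration.
    from fastNLP import DataSet, Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
    from fastNLP.models import CNNText

    # Tabular DataSet built directly from raw text and integer labels.
    data = DataSet({"raw_words": ["I like this movie", "boring and too long"],
                    "target": [1, 0]})
    data.apply(lambda ins: ins["raw_words"].split(), new_field_name="words")

    # Build a vocabulary, index the text and mark input/target fields.
    vocab = Vocabulary()
    vocab.from_dataset(data, field_name="words")
    vocab.index_dataset(data, field_name="words")
    data.set_input("words")
    data.set_target("target")

    # Train a built-in model; metrics drive the evaluation on dev_data.
    model = CNNText((len(vocab), 50), num_classes=2, dropout=0.1)
    trainer = Trainer(train_data=data, model=model,
                      loss=CrossEntropyLoss(), metrics=AccuracyMetric(),
                      dev_data=data, n_epochs=2)
    trainer.train()

Callbacks from :mod:`~fastNLP.core.callback` would be attached through the Trainer's ``callbacks`` argument.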
内置组件 | |||||
------------ | |||||
大部分用于的 NLP 任务神经网络都可以看做由编码(encoder)、聚合(aggregator)、解码(decoder)三种模块组成。 | |||||
.. image:: figures/text_classification.png | |||||
fastNLP 在 :mod:`~fastNLP.modules` 模块中内置了三种模块的诸多组件,可以帮助用户快速搭建自己所需的网络。 | |||||
三种模块的功能和常见组件如下: | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
| module type | functionality | example | | |||||
+=======================+=======================+=======================+ | |||||
| encoder | 将输入编码为具有具 | embedding, RNN, CNN, | | |||||
| | 有表示能力的向量 | transformer | | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
| aggregator | 从多个向量中聚合信息 | self-attention, | | |||||
| | | max-pooling | | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
| decoder | 将具有某种表示意义的 | MLP, CRF | | |||||
| | 向量解码为需要的输出 | | | |||||
| | 形式 | | | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
内置模型 | |||||
---------------- | |||||
fastNLP 在 :mod:`~fastNLP.models` 模块中内置了如 :class:`~fastNLP.models.CNNText` 、 | |||||
:class:`~fastNLP.models.SeqLabeling` 等完整的模型,以供用户直接使用。 | |||||
.. todo:: | |||||
这些模型的介绍如下表所示:(模型名称 + 介绍 + 任务上的结果) | |||||
用户手册 | 用户手册 | ||||
---------------- | ---------------- | ||||
.. toctree:: | .. toctree:: | ||||
:maxdepth: 1 | |||||
:maxdepth: 2 | |||||
安装指南 </user/installation> | 安装指南 </user/installation> | ||||
快速入门 </user/quickstart> | 快速入门 </user/quickstart> | ||||
详细指南 </user/tutorials> | |||||
详细教程 </user/tutorials> | |||||
API 文档 | API 文档 | ||||
------------- | ------------- | ||||
@@ -67,11 +35,11 @@ API 文档 | |||||
fastNLP | fastNLP | ||||
fitlog | |||||
------ | |||||
fitlog文档 | |||||
---------- | |||||
用户可以 `点此 <https://fitlog.readthedocs.io/zh/latest/>`_ 查看fitlog的文档。 | |||||
fitlog 是由我们团队开发,用于帮助用户记录日志并管理代码的工具 | |||||
您可以 `点此 <https://fitlog.readthedocs.io/zh/latest/>`_ 查看fitlog的文档。 | |||||
fitlog 是由我们团队开发的日志记录+代码管理的工具。 | |||||
索引与搜索 | 索引与搜索 | ||||
================== | ================== | ||||
@@ -60,7 +60,7 @@ | |||||
seq_len=3) | seq_len=3) | ||||
]) | ]) | ||||
在初步构建完数据集之后,我们可可以通过 `for` 循环遍历 :class:`~fastNLP.DataSet` 中的内容。 | |||||
在初步构建完数据集之后,我们可以通过 `for` 循环遍历 :class:`~fastNLP.DataSet` 中的内容。 | |||||
.. code-block:: python | .. code-block:: python | ||||
@@ -35,12 +35,12 @@ Part II: 数据集的使用方式 | |||||
- _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` | - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` | ||||
- load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` | - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` | ||||
- process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的 :class:`~fastNLP.io.DataInfo` | |||||
- process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的 :class:`~fastNLP.io.DataBundle` | |||||
**\*process函数中可以调用load函数或_load函数** | **\*process函数中可以调用load函数或_load函数** | ||||
DataSetLoader的_load或者load函数返回的 :class:`~fastNLP.DataSet` 当中,内容为数据集的文本信息,process函数返回的 | DataSetLoader的_load或者load函数返回的 :class:`~fastNLP.DataSet` 当中,内容为数据集的文本信息,process函数返回的 | ||||
:class:`~fastNLP.io.DataInfo` 当中, `datasets` 的内容为已经index好的、可以直接被 :class:`~fastNLP.Trainer` | |||||
:class:`~fastNLP.io.DataBundle` 当中, `datasets` 的内容为已经index好的、可以直接被 :class:`~fastNLP.Trainer` | |||||
接受的内容。 | 接受的内容。 | ||||
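As a hedged illustration of the ``_load`` / ``load`` / ``process`` contract described above, here is a hypothetical loader for a two-column ``sentence<TAB>label`` file; the file format, field names and the exact DataBundle constructor arguments are assumptions for this sketch, not something stated in the diff.

.. code-block:: python

    # Hypothetical DataSetLoader following the _load / load / process convention above.
    from fastNLP import DataSet, Instance, Vocabulary
    from fastNLP.io import DataSetLoader, DataBundle


    class TSVClassificationLoader(DataSetLoader):
        def _load(self, path):
            # Read one file into a DataSet that still holds raw text.
            ds = DataSet()
            with open(path, encoding="utf-8") as f:
                for line in f:
                    sent, label = line.rstrip("\n").split("\t")
                    ds.append(Instance(raw_words=sent, target=int(label)))
            return ds

        def process(self, paths):
            # Turn {"train": path, ...} into an indexed, Trainer-ready DataBundle.
            datasets = {name: self._load(p) for name, p in paths.items()}
            for ds in datasets.values():
                ds.apply(lambda ins: ins["raw_words"].split(), new_field_name="words")
            vocab = Vocabulary()
            vocab.from_dataset(*datasets.values(), field_name="words")
            for ds in datasets.values():
                vocab.index_dataset(ds, field_name="words")
                ds.set_input("words")
                ds.set_target("target")
            return DataBundle(vocabs={"words": vocab}, datasets=datasets)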
-------------------------------------------------------- | -------------------------------------------------------- | ||||
@@ -45,7 +45,7 @@ fastNLP可以方便地载入各种类型的数据。同时,针对常见的数 | |||||
数据处理 | 数据处理 | ||||
---------------------------- | ---------------------------- | ||||
我们进一步处理数据。将数据和词表封装在 :class:`~fastNLP.DataInfo` 类中。data是DataInfo的实例。 | |||||
我们进一步处理数据。将数据和词表封装在 :class:`~fastNLP.DataBundle` 类中。data是DataBundle的实例。 | |||||
我们输入模型的数据包括char embedding,以及word embedding。在数据处理部分,我们尝试完成词表的构建。 | 我们输入模型的数据包括char embedding,以及word embedding。在数据处理部分,我们尝试完成词表的构建。 | ||||
使用fastNLP中的Vocabulary类来构建词表。 | 使用fastNLP中的Vocabulary类来构建词表。 | ||||
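A small sketch of the vocabulary-building step mentioned above; the toy data and the ``words`` / ``chars`` field names are assumptions for illustration.

.. code-block:: python

    # Build word- and char-level vocabularies with fastNLP's Vocabulary class.
    from fastNLP import DataSet, Vocabulary

    data = DataSet({"words": [["fast", "NLP"], ["hello", "world"]]})
    # Derive a char field from the word field (one character list per word).
    data.apply(lambda ins: [list(w) for w in ins["words"]], new_field_name="chars")

    word_vocab = Vocabulary()
    word_vocab.from_dataset(data, field_name="words")
    word_vocab.index_dataset(data, field_name="words")    # words -> word ids in place

    char_vocab = Vocabulary()
    char_vocab.from_dataset(data, field_name="chars")
    char_vocab.index_dataset(data, field_name="chars")    # chars -> char ids in place

    print(len(word_vocab), word_vocab.to_index("fast"))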
@@ -181,7 +181,7 @@ FastNLP 完全支持使用 pyTorch 编写的模型,但与 pyTorch 中编写模 | |||||
) | ) | ||||
) | ) | ||||
FastNLP 中包含的各种模块如下表,您可以点击具体的名称查看详细的 API: | |||||
FastNLP 中包含的各种模块如下表,您可以点击具体的名称查看详细的 API,也可以通过 :doc:`/fastNLP.modules` 进行了解。 | |||||
.. csv-table:: | .. csv-table:: | ||||
:header: 名称, 介绍 | :header: 名称, 介绍 | ||||
@@ -189,7 +189,6 @@ FastNLP 中包含的各种模块如下表,您可以点击具体的名称查看 | |||||
:class:`~fastNLP.modules.ConvolutionCharEncoder` , char级别的卷积 encoder | :class:`~fastNLP.modules.ConvolutionCharEncoder` , char级别的卷积 encoder | ||||
:class:`~fastNLP.modules.LSTMCharEncoder` , char级别基于LSTM的 encoder | :class:`~fastNLP.modules.LSTMCharEncoder` , char级别基于LSTM的 encoder | ||||
:class:`~fastNLP.modules.ConvMaxpool` , 结合了Convolution和Max-Pooling于一体的模块 | :class:`~fastNLP.modules.ConvMaxpool` , 结合了Convolution和Max-Pooling于一体的模块 | ||||
:class:`~fastNLP.modules.Embedding` , 基础的Embedding模块 | |||||
:class:`~fastNLP.modules.LSTM` , LSTM模块, 轻量封装了PyTorch的LSTM | :class:`~fastNLP.modules.LSTM` , LSTM模块, 轻量封装了PyTorch的LSTM | ||||
:class:`~fastNLP.modules.StarTransformer` , Star-Transformer 的encoder部分 | :class:`~fastNLP.modules.StarTransformer` , Star-Transformer 的encoder部分 | ||||
:class:`~fastNLP.modules.TransformerEncoder` , Transformer的encoder模块,不包含embedding层 | :class:`~fastNLP.modules.TransformerEncoder` , Transformer的encoder模块,不包含embedding层 | ||||
@@ -198,8 +197,11 @@ FastNLP 中包含的各种模块如下表,您可以点击具体的名称查看 | |||||
:class:`~fastNLP.modules.VarGRU` , Variational Dropout GRU 模块 | :class:`~fastNLP.modules.VarGRU` , Variational Dropout GRU 模块 | ||||
:class:`~fastNLP.modules.MaxPool` , Max-pooling模块 | :class:`~fastNLP.modules.MaxPool` , Max-pooling模块 | ||||
:class:`~fastNLP.modules.MaxPoolWithMask` , 带mask矩阵的max pooling。在做 max-pooling的时候不会考虑mask值为0的位置。 | :class:`~fastNLP.modules.MaxPoolWithMask` , 带mask矩阵的max pooling。在做 max-pooling的时候不会考虑mask值为0的位置。 | ||||
:class:`~fastNLP.modules.AvgPool` , Average-pooling模块 | |||||
:class:`~fastNLP.modules.AvgPoolWithMask` , 带mask矩阵的average pooling。在做 average-pooling的时候不会考虑mask值为0的位置。 | |||||
:class:`~fastNLP.modules.MultiHeadAttention` , MultiHead Attention 模块 | :class:`~fastNLP.modules.MultiHeadAttention` , MultiHead Attention 模块 | ||||
:class:`~fastNLP.modules.MLP` , 简单的多层感知器模块 | :class:`~fastNLP.modules.MLP` , 简单的多层感知器模块 | ||||
:class:`~fastNLP.modules.ConditionalRandomField` , 条件随机场模块 | :class:`~fastNLP.modules.ConditionalRandomField` , 条件随机场模块 | ||||
:class:`~fastNLP.modules.viterbi_decode` , 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 (与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用) | :class:`~fastNLP.modules.viterbi_decode` , 给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 (与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用) | ||||
:class:`~fastNLP.modules.allowed_transitions` , 给定一个id到label的映射表,返回所有可以跳转的列表(与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用) | :class:`~fastNLP.modules.allowed_transitions` , 给定一个id到label的映射表,返回所有可以跳转的列表(与 :class:`~fastNLP.modules.ConditionalRandomField` 配合使用) | ||||
:class:`~fastNLP.modules.TimestepDropout` , 简单包装过的Dropout 组件 |
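To make the table above concrete, here is a tiny composition of two of the listed modules; the tensor sizes are arbitrary toy values and the snippet is only a sketch, not an official recipe.

.. code-block:: python

    # Compose modules from the table above: a BiLSTM encoder feeding an MLP decoder.
    import torch
    from fastNLP.modules import LSTM, MLP

    x = torch.randn(4, 7, 50)                  # [batch, seq_len, input_size], toy values
    lstm = LSTM(input_size=50, hidden_size=32, bidirectional=True, batch_first=True)
    mlp = MLP([64, 32, 5])                     # 64 = 2 * hidden_size of the BiLSTM

    out, _ = lstm(x)                           # [4, 7, 64]
    logits = mlp(out[:, -1, :])                # last timestep -> [4, 5] class scores
    print(logits.shape)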
@@ -44,10 +44,10 @@ Callback的构建和使用 | |||||
这里,:class:`~fastNLP.Callback` 中所有以 ``on_`` 开头的类方法会在 :class:`~fastNLP.Trainer` 的训练中在特定时间调用。 | 这里,:class:`~fastNLP.Callback` 中所有以 ``on_`` 开头的类方法会在 :class:`~fastNLP.Trainer` 的训练中在特定时间调用。 | ||||
如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 | 如 on_train_begin() 会在训练开始时被调用,on_epoch_end() 会在每个 epoch 结束时调用。 | ||||
具体有哪些类方法,参见文档。 | |||||
具体有哪些类方法,参见文档 :class:`~fastNLP.Callback` 。 | |||||
另外,为了使用方便,可以在 :class:`~fastNLP.Callback` 内部访问 :class:`~fastNLP.Trainer` 中的属性,如 optimizer, epoch, step,分别对应训练时的优化器,当前epoch数,和当前的总step数。 | 另外,为了使用方便,可以在 :class:`~fastNLP.Callback` 内部访问 :class:`~fastNLP.Trainer` 中的属性,如 optimizer, epoch, step,分别对应训练时的优化器,当前epoch数,和当前的总step数。 | ||||
具体可访问的属性,参见文档。 | |||||
具体可访问的属性,参见文档 :class:`~fastNLP.Callback` 。 | |||||
使用Callback | 使用Callback | ||||
在定义好 :class:`~fastNLP.Callback` 之后,就能将它传入Trainer的 ``callbacks`` 参数,在实际训练时使用。 | 在定义好 :class:`~fastNLP.Callback` 之后,就能将它传入Trainer的 ``callbacks`` 参数,在实际训练时使用。 | ||||
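A minimal sketch of such a Callback, assuming the ``epoch`` / ``step`` / ``n_epochs`` attributes described above; the logging behaviour itself is invented for illustration.

.. code-block:: python

    # Hypothetical Callback that logs progress via the Trainer attributes noted above.
    from fastNLP import Callback


    class EpochLogger(Callback):
        def on_train_begin(self):
            print("training starts, planned epochs:", self.n_epochs)

        def on_epoch_end(self):
            print("finished epoch {}, global step {}".format(self.epoch, self.step))

    # It would then be registered as: Trainer(..., callbacks=[EpochLogger()])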
@@ -1,18 +1,20 @@ | |||||
=================== | |||||
fastNLP详细使用教程 | |||||
=================== | |||||
======================== | |||||
fastNLP 详细使用教程 | |||||
======================== | |||||
这里是更详细的使用教程。对于大部分的用户,我们建议你从第一篇开始顺序阅读;如果你只想了解其中的一部分,也可以进行选读。 | |||||
.. toctree:: | .. toctree:: | ||||
:maxdepth: 1 | :maxdepth: 1 | ||||
1. 使用DataSet预处理文本 </tutorials/tutorial_1_data_preprocess> | |||||
2. 使用DataSetLoader加载数据集 </tutorials/tutorial_2_load_dataset> | |||||
3. 使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding> | |||||
4. 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 </tutorials/tutorial_4_loss_optimizer> | |||||
5. 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 </tutorials/tutorial_5_datasetiter> | |||||
6. 快速实现序列标注模型 </tutorials/tutorial_6_seq_labeling> | |||||
7. 使用Modules和Models快速搭建自定义模型 </tutorials/tutorial_7_modules_models> | |||||
8. 使用Metric快速评测你的模型 </tutorials/tutorial_8_metrics> | |||||
9. 使用Callback自定义你的训练过程 </tutorials/tutorial_9_callback> | |||||
10. 使用fitlog 辅助 fastNLP 进行科研 </tutorials/tutorial_10_fitlog> | |||||
使用DataSet预处理文本 </tutorials/tutorial_1_data_preprocess> | |||||
使用DataSetLoader加载数据集 </tutorials/tutorial_2_load_dataset> | |||||
使用Embedding模块将文本转成向量 </tutorials/tutorial_3_embedding> | |||||
动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 </tutorials/tutorial_4_loss_optimizer> | |||||
动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 </tutorials/tutorial_5_datasetiter> | |||||
快速实现序列标注模型 </tutorials/tutorial_6_seq_labeling> | |||||
使用Modules和Models快速搭建自定义模型 </tutorials/tutorial_7_modules_models> | |||||
使用Metric快速评测你的模型 </tutorials/tutorial_8_metrics> | |||||
使用Callback自定义你的训练过程 </tutorials/tutorial_9_callback> | |||||
使用fitlog 辅助 fastNLP 进行科研 </tutorials/tutorial_10_fitlog> | |||||
@@ -1,11 +1,12 @@ | |||||
""" | """ | ||||
fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.modules`、:mod:`~fastNLP.models` | |||||
等子模块组成,你可以点进去查看每个模块的文档。 | |||||
fastNLP 由 :mod:`~fastNLP.core` 、 :mod:`~fastNLP.io` 、:mod:`~fastNLP.embeddings` 、 :mod:`~fastNLP.modules`、 | |||||
:mod:`~fastNLP.models` 等子模块组成,你可以查看每个模块的文档。 | |||||
- :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :doc:`/fastNLP.core` | - :mod:`~fastNLP.core` 是fastNLP 的核心模块,包括 DataSet、 Trainer、 Tester 等组件。详见文档 :doc:`/fastNLP.core` | ||||
- :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :doc:`/fastNLP.io` | - :mod:`~fastNLP.io` 是实现输入输出的模块,包括了数据集的读取,模型的存取等功能。详见文档 :doc:`/fastNLP.io` | ||||
- :mod:`~fastNLP.embeddings` 提供用于构建复杂网络模型所需的各种embedding。详见文档 :doc:`/fastNLP.embeddings` | |||||
- :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :doc:`/fastNLP.modules` | - :mod:`~fastNLP.modules` 包含了用于搭建神经网络模型的诸多组件,可以帮助用户快速搭建自己所需的网络。详见文档 :doc:`/fastNLP.modules` | ||||
- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :doc:`/fastNLP.models` | |||||
- :mod:`~fastNLP.models` 包含了一些使用 fastNLP 实现的完整网络模型,包括 :class:`~fastNLP.models.CNNText` 、 :class:`~fastNLP.models.SeqLabeling` 等常见模型。详见文档 :doc:`fastNLP.models` | |||||
fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下: | fastNLP 中最常用的组件可以直接从 fastNLP 包中 import ,他们的文档如下: | ||||
""" | """ | ||||
@@ -61,4 +62,5 @@ __version__ = '0.4.5' | |||||
from .core import * | from .core import * | ||||
from . import models | from . import models | ||||
from . import modules | from . import modules | ||||
from . import embeddings | |||||
from .io import data_loader | from .io import data_loader |
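As the docstring above states, the most commonly used components can be imported straight from the top-level package; a short sketch (the ``StaticEmbedding`` name is assumed from the embeddings sub-package added in this diff):

.. code-block:: python

    # Common top-level imports; models and building blocks live in sub-packages.
    from fastNLP import DataSet, Instance, Vocabulary, Trainer, Tester
    from fastNLP import CrossEntropyLoss, AccuracyMetric
    from fastNLP.models import CNNText               # complete, ready-to-use models
    from fastNLP.modules import LSTM, MLP            # network building blocks
    from fastNLP.embeddings import StaticEmbedding   # assumed class name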
@@ -3,36 +3,40 @@ | |||||
1. 用于读入 embedding 的 :doc:`EmbedLoader <fastNLP.io.embed_loader>` 类, | 1. 用于读入 embedding 的 :doc:`EmbedLoader <fastNLP.io.embed_loader>` 类, | ||||
2. 用于读入数据的 :doc:`DataSetLoader <fastNLP.io.dataset_loader>` 类 | |||||
2. 用于读入不同格式数据的 :doc:`DataSetLoader <fastNLP.io.dataset_loader>` 类 | |||||
3. 用于保存和载入模型的类, 参考 :doc:`/fastNLP.io.model_io` | |||||
3. 用于读入不同数据集并进行预处理的 :doc:`DataLoader <fastNLP.io.data_loader>` 类 | |||||
4. 用于保存和载入模型的类, 参考 :doc:`model_io文档</fastNLP.io.model_io>` | |||||
这些类的使用方法如下: | 这些类的使用方法如下: | ||||
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
'EmbedLoader', | 'EmbedLoader', | ||||
'DataBundle', | |||||
'DataSetLoader', | |||||
'CSVLoader', | 'CSVLoader', | ||||
'JsonLoader', | 'JsonLoader', | ||||
'ModelLoader', | |||||
'ModelSaver', | |||||
'DataBundle', | |||||
'DataSetLoader', | |||||
'ConllLoader', | 'ConllLoader', | ||||
'Conll2003Loader', | 'Conll2003Loader', | ||||
'IMDBLoader', | 'IMDBLoader', | ||||
'MatchingLoader', | 'MatchingLoader', | ||||
'PeopleDailyCorpusLoader', | |||||
'SNLILoader', | 'SNLILoader', | ||||
'SSTLoader', | |||||
'SST2Loader', | |||||
'MNLILoader', | 'MNLILoader', | ||||
'MTL16Loader', | |||||
'PeopleDailyCorpusLoader', | |||||
'QNLILoader', | 'QNLILoader', | ||||
'QuoraLoader', | 'QuoraLoader', | ||||
'RTELoader', | 'RTELoader', | ||||
'SSTLoader', | |||||
'SST2Loader', | |||||
'YelpLoader', | |||||
'ModelLoader', | |||||
'ModelSaver', | |||||
] | ] | ||||
from .embed_loader import EmbedLoader | from .embed_loader import EmbedLoader | ||||
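For illustration, a sketch of the fastNLP.io entry points listed above; the file paths are placeholders and the call signatures reflect the 0.4.x API as far as known.

.. code-block:: python

    # Sketch of fastNLP.io usage; paths are placeholders, signatures as far as known.
    from fastNLP.io import CSVLoader, EmbedLoader, ModelSaver, ModelLoader

    # 1. read a headerless tab-separated file into a DataSet
    loader = CSVLoader(headers=["raw_words", "target"], sep="\t")
    # dataset = loader.load("train.tsv")                    # assumed path

    # 2. load pre-trained vectors aligned with an existing Vocabulary
    # embedding = EmbedLoader.load_with_vocab("glove.txt", vocab)

    # 3. save / restore a trained PyTorch model
    # ModelSaver("model.pkl").save_pytorch(model)
    # ModelLoader.load_pytorch(model, "model.pkl")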
@@ -1,13 +1,14 @@ | |||||
""" | """ | ||||
用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 | 用于读数据集的模块, 可以读取文本分类、序列标注、Matching任务的数据集 | ||||
这些模块的使用方法如下: | |||||
这些模块的具体介绍如下,您可以通过阅读 :doc:`教程</tutorials/tutorial_2_load_dataset>` 来进行了解。 | |||||
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
'ConllLoader', | 'ConllLoader', | ||||
'Conll2003Loader', | 'Conll2003Loader', | ||||
'IMDBLoader', | 'IMDBLoader', | ||||
'MatchingLoader', | 'MatchingLoader', | ||||
'SNLILoader', | |||||
'MNLILoader', | 'MNLILoader', | ||||
'MTL16Loader', | 'MTL16Loader', | ||||
'PeopleDailyCorpusLoader', | 'PeopleDailyCorpusLoader', | ||||
@@ -16,7 +17,6 @@ __all__ = [ | |||||
'RTELoader', | 'RTELoader', | ||||
'SSTLoader', | 'SSTLoader', | ||||
'SST2Loader', | 'SST2Loader', | ||||
'SNLILoader', | |||||
'YelpLoader', | 'YelpLoader', | ||||
] | ] | ||||
@@ -58,7 +58,7 @@ class ConllLoader(DataSetLoader): | |||||
class Conll2003Loader(ConllLoader): | class Conll2003Loader(ConllLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.dataset_loader.Conll2003Loader` | |||||
别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.data_loader.Conll2003Loader` | |||||
读取Conll2003数据 | 读取Conll2003数据 | ||||
@@ -7,7 +7,7 @@ from ...core.const import Const | |||||
class PeopleDailyCorpusLoader(DataSetLoader): | class PeopleDailyCorpusLoader(DataSetLoader): | ||||
""" | """ | ||||
别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.dataset_loader.PeopleDailyCorpusLoader` | |||||
别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.data_loader.PeopleDailyCorpusLoader` | |||||
读取人民日报数据集 | 读取人民日报数据集 | ||||
""" | """ | ||||
@@ -130,6 +130,8 @@ def _find_cycle(vertices, edges): | |||||
class GraphParser(BaseModel): | class GraphParser(BaseModel): | ||||
""" | """ | ||||
别名::class:`fastNLP.models.GraphParser` :class:`fastNLP.models.biaffine_parser.GraphParser` | |||||
别名::class:`fastNLP.models.GraphParser` :class:`fastNLP.models.biaffine_parser.GraphParser` | |||||
基于图的parser base class, 支持贪婪解码和最大生成树解码 | 基于图的parser base class, 支持贪婪解码和最大生成树解码 | ||||
""" | """ | ||||
@@ -25,14 +25,14 @@ class CNNText(torch.nn.Module): | |||||
:param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。 | ||||
:param float dropout: Dropout的大小 | :param float dropout: Dropout的大小 | ||||
""" | """ | ||||
def __init__(self, init_embed, | def __init__(self, init_embed, | ||||
num_classes, | num_classes, | ||||
kernel_nums=(30, 40, 50), | kernel_nums=(30, 40, 50), | ||||
kernel_sizes=(1, 3, 5), | kernel_sizes=(1, 3, 5), | ||||
dropout=0.5): | dropout=0.5): | ||||
super(CNNText, self).__init__() | super(CNNText, self).__init__() | ||||
# no support for pre-trained embedding currently | # no support for pre-trained embedding currently | ||||
self.embed = embedding.Embedding(init_embed) | self.embed = embedding.Embedding(init_embed) | ||||
self.conv_pool = encoder.ConvMaxpool( | self.conv_pool = encoder.ConvMaxpool( | ||||
@@ -41,7 +41,7 @@ class CNNText(torch.nn.Module): | |||||
kernel_sizes=kernel_sizes) | kernel_sizes=kernel_sizes) | ||||
self.dropout = nn.Dropout(dropout) | self.dropout = nn.Dropout(dropout) | ||||
self.fc = nn.Linear(sum(kernel_nums), num_classes) | self.fc = nn.Linear(sum(kernel_nums), num_classes) | ||||
def forward(self, words, seq_len=None): | def forward(self, words, seq_len=None): | ||||
""" | """ | ||||
@@ -58,7 +58,7 @@ class CNNText(torch.nn.Module): | |||||
x = self.dropout(x) | x = self.dropout(x) | ||||
x = self.fc(x) # [N,C] -> [N, N_class] | x = self.fc(x) # [N,C] -> [N, N_class] | ||||
return {C.OUTPUT: x} | return {C.OUTPUT: x} | ||||
def predict(self, words, seq_len=None): | def predict(self, words, seq_len=None): | ||||
""" | """ | ||||
:param torch.LongTensor words: [batch_size, seq_len],句子中word的index | :param torch.LongTensor words: [batch_size, seq_len],句子中word的index | ||||
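A toy forward/predict call matching the signatures shown above; the vocabulary size, class count and dummy tensors are assumptions for this sketch.

.. code-block:: python

    # Direct use of CNNText with dummy tensors; all sizes are made up for illustration.
    import torch
    from fastNLP.models import CNNText
    from fastNLP.core.const import Const as C

    model = CNNText(init_embed=(1000, 50), num_classes=3, dropout=0.1)
    words = torch.randint(0, 1000, (4, 12))     # [batch_size, seq_len] word indices
    seq_len = torch.tensor([12, 10, 7, 12])

    out = model(words, seq_len)                 # forward(): {'pred': [4, 3] class scores}
    pred = model.predict(words, seq_len)        # predict(): argmax class per sample
    print(out[C.OUTPUT].shape, pred[C.OUTPUT].shape)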
@@ -1,10 +1,10 @@ | |||||
""" | """ | ||||
本模块实现了两种序列标注模型 | |||||
本模块实现了几种序列标注模型 | |||||
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
"SeqLabeling", | "SeqLabeling", | ||||
"AdvSeqLabel", | "AdvSeqLabel", | ||||
"BiLSTMCRF" | |||||
# "BiLSTMCRF" | |||||
] | ] | ||||
import torch | import torch | ||||
@@ -25,7 +25,10 @@ from ..modules import ConditionalRandomField | |||||
class BiLSTMCRF(BaseModel): | class BiLSTMCRF(BaseModel): | ||||
""" | """ | ||||
结构为BiLSTM + FC + Dropout + CRF. | 结构为BiLSTM + FC + Dropout + CRF. | ||||
TODO 补充文档 | |||||
.. todo:: | |||||
继续补充文档 | |||||
:param embed: tuple: | :param embed: tuple: | ||||
:param num_classes: | :param num_classes: | ||||
:param num_layers: | :param num_layers: | ||||
@@ -15,7 +15,10 @@ from ..core.utils import seq_len_to_mask | |||||
class ESIM(BaseModel): | class ESIM(BaseModel): | ||||
"""ESIM model的一个PyTorch实现 | |||||
""" | |||||
别名::class:`fastNLP.models.ESIM` :class:`fastNLP.models.snli.ESIM` | |||||
ESIM model的一个PyTorch实现 | |||||
论文参见: https://arxiv.org/pdf/1609.06038.pdf | 论文参见: https://arxiv.org/pdf/1609.06038.pdf | ||||
:param fastNLP.TokenEmbedding init_embedding: 初始化的TokenEmbedding | :param fastNLP.TokenEmbedding init_embedding: 初始化的TokenEmbedding | ||||
@@ -34,7 +34,7 @@ class StarTransEnc(nn.Module): | |||||
:param emb_dropout: 词嵌入的dropout概率. | :param emb_dropout: 词嵌入的dropout概率. | ||||
:param dropout: 模型除词嵌入外的dropout概率. | :param dropout: 模型除词嵌入外的dropout概率. | ||||
""" | """ | ||||
def __init__(self, init_embed, | def __init__(self, init_embed, | ||||
hidden_size, | hidden_size, | ||||
num_layers, | num_layers, | ||||
@@ -54,7 +54,7 @@ class StarTransEnc(nn.Module): | |||||
head_dim=head_dim, | head_dim=head_dim, | ||||
dropout=dropout, | dropout=dropout, | ||||
max_len=max_len) | max_len=max_len) | ||||
def forward(self, x, mask): | def forward(self, x, mask): | ||||
""" | """ | ||||
:param FloatTensor x: [batch, length, hidden] 输入的序列 | :param FloatTensor x: [batch, length, hidden] 输入的序列 | ||||
@@ -79,7 +79,7 @@ class _Cls(nn.Module): | |||||
nn.Dropout(dropout), | nn.Dropout(dropout), | ||||
nn.Linear(hid_dim, num_cls), | nn.Linear(hid_dim, num_cls), | ||||
) | ) | ||||
def forward(self, x): | def forward(self, x): | ||||
h = self.fc(x) | h = self.fc(x) | ||||
return h | return h | ||||
@@ -95,7 +95,7 @@ class _NLICls(nn.Module): | |||||
nn.Dropout(dropout), | nn.Dropout(dropout), | ||||
nn.Linear(hid_dim, num_cls), | nn.Linear(hid_dim, num_cls), | ||||
) | ) | ||||
def forward(self, x1, x2): | def forward(self, x1, x2): | ||||
x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) | x = torch.cat([x1, x2, torch.abs(x1 - x2), x1 * x2], 1) | ||||
h = self.fc(x) | h = self.fc(x) | ||||
@@ -121,7 +121,7 @@ class STSeqLabel(nn.Module): | |||||
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | ||||
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | ||||
""" | """ | ||||
def __init__(self, init_embed, num_cls, | def __init__(self, init_embed, num_cls, | ||||
hidden_size=300, | hidden_size=300, | ||||
num_layers=4, | num_layers=4, | ||||
@@ -141,7 +141,7 @@ class STSeqLabel(nn.Module): | |||||
emb_dropout=emb_dropout, | emb_dropout=emb_dropout, | ||||
dropout=dropout) | dropout=dropout) | ||||
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | self.cls = _Cls(hidden_size, num_cls, cls_hidden_size) | ||||
def forward(self, words, seq_len): | def forward(self, words, seq_len): | ||||
""" | """ | ||||
@@ -154,7 +154,7 @@ class STSeqLabel(nn.Module): | |||||
output = self.cls(nodes) | output = self.cls(nodes) | ||||
output = output.transpose(1, 2) # make hidden to be dim 1 | output = output.transpose(1, 2) # make hidden to be dim 1 | ||||
return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | return {Const.OUTPUT: output} # [bsz, n_cls, seq_len] | ||||
def predict(self, words, seq_len): | def predict(self, words, seq_len): | ||||
""" | """ | ||||
@@ -186,7 +186,7 @@ class STSeqCls(nn.Module): | |||||
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | ||||
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | ||||
""" | """ | ||||
def __init__(self, init_embed, num_cls, | def __init__(self, init_embed, num_cls, | ||||
hidden_size=300, | hidden_size=300, | ||||
num_layers=4, | num_layers=4, | ||||
@@ -206,7 +206,7 @@ class STSeqCls(nn.Module): | |||||
emb_dropout=emb_dropout, | emb_dropout=emb_dropout, | ||||
dropout=dropout) | dropout=dropout) | ||||
self.cls = _Cls(hidden_size, num_cls, cls_hidden_size, dropout=dropout) | self.cls = _Cls(hidden_size, num_cls, cls_hidden_size, dropout=dropout) | ||||
def forward(self, words, seq_len): | def forward(self, words, seq_len): | ||||
""" | """ | ||||
@@ -219,7 +219,7 @@ class STSeqCls(nn.Module): | |||||
y = 0.5 * (relay + nodes.max(1)[0]) | y = 0.5 * (relay + nodes.max(1)[0]) | ||||
output = self.cls(y) # [bsz, n_cls] | output = self.cls(y) # [bsz, n_cls] | ||||
return {Const.OUTPUT: output} | return {Const.OUTPUT: output} | ||||
def predict(self, words, seq_len): | def predict(self, words, seq_len): | ||||
""" | """ | ||||
@@ -251,7 +251,7 @@ class STNLICls(nn.Module): | |||||
:param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | :param emb_dropout: 词嵌入的dropout概率. Default: 0.1 | ||||
:param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | :param dropout: 模型除词嵌入外的dropout概率. Default: 0.1 | ||||
""" | """ | ||||
def __init__(self, init_embed, num_cls, | def __init__(self, init_embed, num_cls, | ||||
hidden_size=300, | hidden_size=300, | ||||
num_layers=4, | num_layers=4, | ||||
@@ -271,7 +271,7 @@ class STNLICls(nn.Module): | |||||
emb_dropout=emb_dropout, | emb_dropout=emb_dropout, | ||||
dropout=dropout) | dropout=dropout) | ||||
self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size) | self.cls = _NLICls(hidden_size, num_cls, cls_hidden_size) | ||||
def forward(self, words1, words2, seq_len1, seq_len2): | def forward(self, words1, words2, seq_len1, seq_len2): | ||||
""" | """ | ||||
@@ -283,16 +283,16 @@ class STNLICls(nn.Module): | |||||
""" | """ | ||||
mask1 = seq_len_to_mask(seq_len1) | mask1 = seq_len_to_mask(seq_len1) | ||||
mask2 = seq_len_to_mask(seq_len2) | mask2 = seq_len_to_mask(seq_len2) | ||||
def enc(seq, mask): | def enc(seq, mask): | ||||
nodes, relay = self.enc(seq, mask) | nodes, relay = self.enc(seq, mask) | ||||
return 0.5 * (relay + nodes.max(1)[0]) | return 0.5 * (relay + nodes.max(1)[0]) | ||||
y1 = enc(words1, mask1) | y1 = enc(words1, mask1) | ||||
y2 = enc(words2, mask2) | y2 = enc(words2, mask2) | ||||
output = self.cls(y1, y2) # [bsz, n_cls] | output = self.cls(y1, y2) # [bsz, n_cls] | ||||
return {Const.OUTPUT: output} | return {Const.OUTPUT: output} | ||||
def predict(self, words1, words2, seq_len1, seq_len2): | def predict(self, words1, words2, seq_len1, seq_len2): | ||||
""" | """ | ||||
@@ -1,45 +1,52 @@ | |||||
""" | """ | ||||
大部分用于的 NLP 任务神经网络都可以看做由编码 :mod:`~fastNLP.modules.encoder` 、 | |||||
解码 :mod:`~fastNLP.modules.decoder` 两种模块组成。 | |||||
.. image:: figures/text_classification.png | .. image:: figures/text_classification.png | ||||
:mod:`~fastNLP.modules` 中实现了 fastNLP 提供的诸多模块组件,可以帮助用户快速搭建自己所需的网络。 | |||||
两种模块的功能和常见组件如下: | |||||
大部分用于 NLP 任务的神经网络都可以看做由 :mod:`embedding<fastNLP.embeddings>` 、 :mod:`~fastNLP.modules.encoder` 、 | |||||
:mod:`~fastNLP.modules.decoder` 三种模块组成。 本模块中实现了 fastNLP 提供的诸多模块组件, | |||||
可以帮助用户快速搭建自己所需的网络。几种模块的功能和常见组件如下: | |||||
.. csv-table:: | |||||
:header: "类型", "功能", "常见组件" | |||||
"embedding", 参见 :doc:`/fastNLP.embeddings` , "Elmo, Bert" | |||||
"encoder", "将输入编码为具有表示能力的向量", "CNN, LSTM, Transformer" | |||||
"decoder", "将具有某种表示意义的向量解码为需要的输出形式 ", "MLP, CRF" | |||||
"其它", "配合其它组件使用的组件", "Dropout" | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
| module type | functionality | example | | |||||
+=======================+=======================+=======================+ | |||||
| encoder | 将输入编码为具有具 | embedding, RNN, CNN, | | |||||
| | 有表示能力的向量 | transformer | | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
| decoder | 将具有某种表示意义的 | MLP, CRF | | |||||
| | 向量解码为需要的输出 | | | |||||
| | 形式 | | | |||||
+-----------------------+-----------------------+-----------------------+ | |||||
""" | """ | ||||
__all__ = [ | __all__ = [ | ||||
# "BertModel", | # "BertModel", | ||||
"ConvolutionCharEncoder", | "ConvolutionCharEncoder", | ||||
"LSTMCharEncoder", | "LSTMCharEncoder", | ||||
"ConvMaxpool", | "ConvMaxpool", | ||||
"LSTM", | "LSTM", | ||||
"StarTransformer", | "StarTransformer", | ||||
"TransformerEncoder", | "TransformerEncoder", | ||||
"VarRNN", | "VarRNN", | ||||
"VarLSTM", | "VarLSTM", | ||||
"VarGRU", | "VarGRU", | ||||
"MaxPool", | "MaxPool", | ||||
"MaxPoolWithMask", | "MaxPoolWithMask", | ||||
"AvgPool", | "AvgPool", | ||||
"AvgPoolWithMask", | |||||
"MultiHeadAttention", | "MultiHeadAttention", | ||||
"MLP", | "MLP", | ||||
"ConditionalRandomField", | "ConditionalRandomField", | ||||
"viterbi_decode", | "viterbi_decode", | ||||
"allowed_transitions", | "allowed_transitions", | ||||
"TimestepDropout", | |||||
] | ] | ||||
from . import decoder | from . import decoder | ||||
@@ -11,7 +11,7 @@ from ..utils import initial_parameter | |||||
def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): | def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): | ||||
""" | """ | ||||
别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.crf.allowed_transitions` | |||||
别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` | |||||
给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 | 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 | ||||
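A worked call with a tiny, invented BIO tag mapping, to show what the returned list contains:

.. code-block:: python

    # allowed_transitions on a toy BIO tag set (the id-to-tag mapping is invented).
    from fastNLP.modules import allowed_transitions

    id2target = {0: "B-PER", 1: "I-PER", 2: "O"}
    pairs = allowed_transitions(id2target, encoding_type="bio", include_start_end=False)
    print(pairs)
    # (0, 1) appears because B-PER may be followed by I-PER;
    # (2, 1) does not, because in BIO an I tag may not directly follow O.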
@@ -31,7 +31,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) | |||||
id_label_lst = list(id2target.items()) | id_label_lst = list(id2target.items()) | ||||
if include_start_end: | if include_start_end: | ||||
id_label_lst += [(start_idx, 'start'), (end_idx, 'end')] | id_label_lst += [(start_idx, 'start'), (end_idx, 'end')] | ||||
def split_tag_label(from_label): | def split_tag_label(from_label): | ||||
from_label = from_label.lower() | from_label = from_label.lower() | ||||
if from_label in ['start', 'end']: | if from_label in ['start', 'end']: | ||||
@@ -41,7 +41,7 @@ def allowed_transitions(id2target, encoding_type='bio', include_start_end=False) | |||||
from_tag = from_label[:1] | from_tag = from_label[:1] | ||||
from_label = from_label[2:] | from_label = from_label[2:] | ||||
return from_tag, from_label | return from_tag, from_label | ||||
for from_id, from_label in id_label_lst: | for from_id, from_label in id_label_lst: | ||||
if from_label in ['<pad>', '<unk>']: | if from_label in ['<pad>', '<unk>']: | ||||
continue | continue | ||||
@@ -93,7 +93,7 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label | |||||
return to_tag in ['end', 'b', 'o'] | return to_tag in ['end', 'b', 'o'] | ||||
else: | else: | ||||
raise ValueError("Unexpect tag {}. Expect only 'B', 'I', 'O'.".format(from_tag)) | raise ValueError("Unexpect tag {}. Expect only 'B', 'I', 'O'.".format(from_tag)) | ||||
elif encoding_type == 'bmes': | elif encoding_type == 'bmes': | ||||
""" | """ | ||||
第一行是to_tag, 第一列是from_tag,y任意条件下可转,-只有在label相同时可转,n不可转 | 第一行是to_tag, 第一列是from_tag,y任意条件下可转,-只有在label相同时可转,n不可转 | ||||
@@ -151,7 +151,7 @@ def _is_transition_allowed(encoding_type, from_tag, from_label, to_tag, to_label | |||||
class ConditionalRandomField(nn.Module): | class ConditionalRandomField(nn.Module): | ||||
""" | """ | ||||
别名::class:`fastNLP.modules.ConditionalRandomField` :class:`fastNLP.modules.decoder.crf.ConditionalRandomField` | |||||
别名::class:`fastNLP.modules.ConditionalRandomField` :class:`fastNLP.modules.decoder.ConditionalRandomField` | |||||
条件随机场。 | 条件随机场。 | ||||
提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 | 提供forward()以及viterbi_decode()两个方法,分别用于训练与inference。 | ||||
@@ -163,21 +163,21 @@ class ConditionalRandomField(nn.Module): | |||||
allowed_transitions()函数得到;如果为None,则所有跃迁均为合法 | allowed_transitions()函数得到;如果为None,则所有跃迁均为合法 | ||||
:param str initial_method: 初始化方法。见initial_parameter | :param str initial_method: 初始化方法。见initial_parameter | ||||
""" | """ | ||||
def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, | def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None, | ||||
initial_method=None): | initial_method=None): | ||||
super(ConditionalRandomField, self).__init__() | super(ConditionalRandomField, self).__init__() | ||||
self.include_start_end_trans = include_start_end_trans | self.include_start_end_trans = include_start_end_trans | ||||
self.num_tags = num_tags | self.num_tags = num_tags | ||||
# the meaning of entry in this matrix is (from_tag_id, to_tag_id) score | # the meaning of entry in this matrix is (from_tag_id, to_tag_id) score | ||||
self.trans_m = nn.Parameter(torch.randn(num_tags, num_tags)) | self.trans_m = nn.Parameter(torch.randn(num_tags, num_tags)) | ||||
if self.include_start_end_trans: | if self.include_start_end_trans: | ||||
self.start_scores = nn.Parameter(torch.randn(num_tags)) | self.start_scores = nn.Parameter(torch.randn(num_tags)) | ||||
self.end_scores = nn.Parameter(torch.randn(num_tags)) | self.end_scores = nn.Parameter(torch.randn(num_tags)) | ||||
if allowed_transitions is None: | if allowed_transitions is None: | ||||
constrain = torch.zeros(num_tags + 2, num_tags + 2) | constrain = torch.zeros(num_tags + 2, num_tags + 2) | ||||
else: | else: | ||||
@@ -185,9 +185,9 @@ class ConditionalRandomField(nn.Module): | |||||
for from_tag_id, to_tag_id in allowed_transitions: | for from_tag_id, to_tag_id in allowed_transitions: | ||||
constrain[from_tag_id, to_tag_id] = 0 | constrain[from_tag_id, to_tag_id] = 0 | ||||
self._constrain = nn.Parameter(constrain, requires_grad=False) | self._constrain = nn.Parameter(constrain, requires_grad=False) | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
def _normalizer_likelihood(self, logits, mask): | def _normalizer_likelihood(self, logits, mask): | ||||
"""Computes the (batch_size,) denominator term for the log-likelihood, which is the | """Computes the (batch_size,) denominator term for the log-likelihood, which is the | ||||
sum of the likelihoods across all possible state sequences. | sum of the likelihoods across all possible state sequences. | ||||
@@ -200,21 +200,21 @@ class ConditionalRandomField(nn.Module): | |||||
alpha = logits[0] | alpha = logits[0] | ||||
if self.include_start_end_trans: | if self.include_start_end_trans: | ||||
alpha = alpha + self.start_scores.view(1, -1) | alpha = alpha + self.start_scores.view(1, -1) | ||||
flip_mask = mask.eq(0) | flip_mask = mask.eq(0) | ||||
for i in range(1, seq_len): | for i in range(1, seq_len): | ||||
emit_score = logits[i].view(batch_size, 1, n_tags) | emit_score = logits[i].view(batch_size, 1, n_tags) | ||||
trans_score = self.trans_m.view(1, n_tags, n_tags) | trans_score = self.trans_m.view(1, n_tags, n_tags) | ||||
tmp = alpha.view(batch_size, n_tags, 1) + emit_score + trans_score | tmp = alpha.view(batch_size, n_tags, 1) + emit_score + trans_score | ||||
alpha = torch.logsumexp(tmp, 1).masked_fill(flip_mask[i].view(batch_size, 1), 0) + \ | alpha = torch.logsumexp(tmp, 1).masked_fill(flip_mask[i].view(batch_size, 1), 0) + \ | ||||
alpha.masked_fill(mask[i].byte().view(batch_size, 1), 0) | alpha.masked_fill(mask[i].byte().view(batch_size, 1), 0) | ||||
if self.include_start_end_trans: | if self.include_start_end_trans: | ||||
alpha = alpha + self.end_scores.view(1, -1) | alpha = alpha + self.end_scores.view(1, -1) | ||||
return torch.logsumexp(alpha, 1) | return torch.logsumexp(alpha, 1) | ||||
def _gold_score(self, logits, tags, mask): | def _gold_score(self, logits, tags, mask): | ||||
""" | """ | ||||
Compute the score for the gold path. | Compute the score for the gold path. | ||||
@@ -226,7 +226,7 @@ class ConditionalRandomField(nn.Module): | |||||
seq_len, batch_size, _ = logits.size() | seq_len, batch_size, _ = logits.size() | ||||
batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | ||||
seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | ||||
# trans_score [L-1, B] | # trans_score [L-1, B] | ||||
mask = mask.byte() | mask = mask.byte() | ||||
flip_mask = mask.eq(0) | flip_mask = mask.eq(0) | ||||
@@ -243,7 +243,7 @@ class ConditionalRandomField(nn.Module): | |||||
score = score + st_scores + ed_scores | score = score + st_scores + ed_scores | ||||
# return [B,] | # return [B,] | ||||
return score | return score | ||||
def forward(self, feats, tags, mask): | def forward(self, feats, tags, mask): | ||||
""" | """ | ||||
用于计算CRF的前向loss,返回值为一个batch_size的FloatTensor,可能需要mean()求得loss。 | 用于计算CRF的前向loss,返回值为一个batch_size的FloatTensor,可能需要mean()求得loss。 | ||||
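A sketch of training-time and decoding-time use of the CRF, consistent with the forward() and viterbi_decode() descriptions in this file; the tag set and tensor shapes are toy values chosen for illustration.

.. code-block:: python

    # Toy use of ConditionalRandomField: loss at training time, Viterbi at inference.
    import torch
    from fastNLP.modules import ConditionalRandomField, allowed_transitions

    num_tags, batch_size, max_len = 5, 2, 6
    id2target = {0: "B-PER", 1: "I-PER", 2: "B-LOC", 3: "I-LOC", 4: "O"}
    constraints = allowed_transitions(id2target, include_start_end=True)
    crf = ConditionalRandomField(num_tags, allowed_transitions=constraints)

    feats = torch.randn(batch_size, max_len, num_tags)   # emission scores from an encoder
    tags = torch.randint(0, num_tags, (batch_size, max_len))
    mask = torch.ones(batch_size, max_len).long()

    loss = crf(feats, tags, mask).mean()                  # forward() returns per-sample losses
    paths, scores = crf.viterbi_decode(feats, mask)       # best tag sequence per sample
    print(loss.item(), paths.shape)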
@@ -258,9 +258,9 @@ class ConditionalRandomField(nn.Module): | |||||
mask = mask.transpose(0, 1).float() | mask = mask.transpose(0, 1).float() | ||||
all_path_score = self._normalizer_likelihood(feats, mask) | all_path_score = self._normalizer_likelihood(feats, mask) | ||||
gold_path_score = self._gold_score(feats, tags, mask) | gold_path_score = self._gold_score(feats, tags, mask) | ||||
return all_path_score - gold_path_score | return all_path_score - gold_path_score | ||||
def viterbi_decode(self, logits, mask, unpad=False): | def viterbi_decode(self, logits, mask, unpad=False): | ||||
"""给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 | """给定一个特征矩阵以及转移分数矩阵,计算出最佳的路径以及对应的分数 | ||||
@@ -277,7 +277,7 @@ class ConditionalRandomField(nn.Module): | |||||
batch_size, seq_len, n_tags = logits.size() | batch_size, seq_len, n_tags = logits.size() | ||||
logits = logits.transpose(0, 1).data # L, B, H | logits = logits.transpose(0, 1).data # L, B, H | ||||
mask = mask.transpose(0, 1).data.byte() # L, B | mask = mask.transpose(0, 1).data.byte() # L, B | ||||
# dp | # dp | ||||
vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) | vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) | ||||
vscore = logits[0] | vscore = logits[0] | ||||
@@ -286,7 +286,7 @@ class ConditionalRandomField(nn.Module): | |||||
if self.include_start_end_trans: | if self.include_start_end_trans: | ||||
transitions[n_tags, :n_tags] += self.start_scores.data | transitions[n_tags, :n_tags] += self.start_scores.data | ||||
transitions[:n_tags, n_tags + 1] += self.end_scores.data | transitions[:n_tags, n_tags + 1] += self.end_scores.data | ||||
vscore += transitions[n_tags, :n_tags] | vscore += transitions[n_tags, :n_tags] | ||||
trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data | trans_score = transitions[:n_tags, :n_tags].view(1, n_tags, n_tags).data | ||||
for i in range(1, seq_len): | for i in range(1, seq_len): | ||||
@@ -297,17 +297,17 @@ class ConditionalRandomField(nn.Module): | |||||
vpath[i] = best_dst | vpath[i] = best_dst | ||||
vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ | vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ | ||||
vscore.masked_fill(mask[i].view(batch_size, 1), 0) | vscore.masked_fill(mask[i].view(batch_size, 1), 0) | ||||
if self.include_start_end_trans: | if self.include_start_end_trans: | ||||
vscore += transitions[:n_tags, n_tags + 1].view(1, -1) | vscore += transitions[:n_tags, n_tags + 1].view(1, -1) | ||||
# backtrace | # backtrace | ||||
batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | ||||
seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | ||||
lens = (mask.long().sum(0) - 1) | lens = (mask.long().sum(0) - 1) | ||||
# idxes [L, B], batched idx from seq_len-1 to 0 | # idxes [L, B], batched idx from seq_len-1 to 0 | ||||
idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len | idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len | ||||
ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) | ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) | ||||
ans_score, last_tags = vscore.max(1) | ans_score, last_tags = vscore.max(1) | ||||
ans[idxes[0], batch_idx] = last_tags | ans[idxes[0], batch_idx] = last_tags | ||||
@@ -10,7 +10,7 @@ from ..utils import initial_parameter | |||||
class MLP(nn.Module): | class MLP(nn.Module): | ||||
""" | """ | ||||
别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.mlp.MLP` | |||||
别名::class:`fastNLP.modules.MLP` :class:`fastNLP.modules.decoder.MLP` | |||||
多层感知器 | 多层感知器 | ||||
@@ -40,7 +40,7 @@ class MLP(nn.Module): | |||||
>>> print(x) | >>> print(x) | ||||
>>> print(y) | >>> print(y) | ||||
""" | """ | ||||
def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): | def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0): | ||||
super(MLP, self).__init__() | super(MLP, self).__init__() | ||||
self.hiddens = nn.ModuleList() | self.hiddens = nn.ModuleList() | ||||
@@ -51,9 +51,9 @@ class MLP(nn.Module): | |||||
self.output = nn.Linear(size_layer[i - 1], size_layer[i]) | self.output = nn.Linear(size_layer[i - 1], size_layer[i]) | ||||
else: | else: | ||||
self.hiddens.append(nn.Linear(size_layer[i - 1], size_layer[i])) | self.hiddens.append(nn.Linear(size_layer[i - 1], size_layer[i])) | ||||
self.dropout = nn.Dropout(p=dropout) | self.dropout = nn.Dropout(p=dropout) | ||||
actives = { | actives = { | ||||
'relu': nn.ReLU(), | 'relu': nn.ReLU(), | ||||
'tanh': nn.Tanh(), | 'tanh': nn.Tanh(), | ||||
@@ -82,7 +82,7 @@ class MLP(nn.Module): | |||||
else: | else: | ||||
raise ValueError("should set activation correctly: {}".format(activation)) | raise ValueError("should set activation correctly: {}".format(activation)) | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.Tensor x: the input to the MLP | :param torch.Tensor x: the input to the MLP | ||||
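The doctest above is truncated to two print calls; a slightly fuller sketch of how the constructor shown in this hunk is typically used (import path taken from the alias line)::

    import torch
    from fastNLP.modules import MLP

    # size_layer [10, 20, 5]: one hidden layer 10 -> 20, then an output layer 20 -> 5
    mlp = MLP([10, 20, 5], activation='relu', dropout=0.1)
    x = torch.randn(4, 10)
    y = mlp(x)
    print(y.shape)      # torch.Size([4, 5])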
@@ -6,7 +6,7 @@ import torch | |||||
def viterbi_decode(logits, transitions, mask=None, unpad=False): | def viterbi_decode(logits, transitions, mask=None, unpad=False): | ||||
r""" | r""" | ||||
Alias: :class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.utils.viterbi_decode` | |||||
Alias: :class:`fastNLP.modules.viterbi_decode` :class:`fastNLP.modules.decoder.viterbi_decode` | |||||
Given a feature matrix and a transition score matrix, compute the best path and the corresponding score | Given a feature matrix and a transition score matrix, compute the best path and the corresponding score | ||||
@@ -30,11 +30,11 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): | |||||
mask = mask.transpose(0, 1).data.byte() # L, B | mask = mask.transpose(0, 1).data.byte() # L, B | ||||
else: | else: | ||||
mask = logits.new_ones((seq_len, batch_size), dtype=torch.uint8) | mask = logits.new_ones((seq_len, batch_size), dtype=torch.uint8) | ||||
# dp | # dp | ||||
vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) | vpath = logits.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long) | ||||
vscore = logits[0] | vscore = logits[0] | ||||
trans_score = transitions.view(1, n_tags, n_tags).data | trans_score = transitions.view(1, n_tags, n_tags).data | ||||
for i in range(1, seq_len): | for i in range(1, seq_len): | ||||
prev_score = vscore.view(batch_size, n_tags, 1) | prev_score = vscore.view(batch_size, n_tags, 1) | ||||
@@ -44,14 +44,14 @@ def viterbi_decode(logits, transitions, mask=None, unpad=False): | |||||
vpath[i] = best_dst | vpath[i] = best_dst | ||||
vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ | vscore = best_score.masked_fill(mask[i].eq(0).view(batch_size, 1), 0) + \ | ||||
vscore.masked_fill(mask[i].view(batch_size, 1), 0) | vscore.masked_fill(mask[i].view(batch_size, 1), 0) | ||||
# backtrace | # backtrace | ||||
batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | batch_idx = torch.arange(batch_size, dtype=torch.long, device=logits.device) | ||||
seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | seq_idx = torch.arange(seq_len, dtype=torch.long, device=logits.device) | ||||
lens = (mask.long().sum(0) - 1) | lens = (mask.long().sum(0) - 1) | ||||
# idxes [L, B], batched idx from seq_len-1 to 0 | # idxes [L, B], batched idx from seq_len-1 to 0 | ||||
idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len | idxes = (lens.view(1, -1) - seq_idx.view(-1, 1)) % seq_len | ||||
ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) | ans = logits.new_empty((seq_len, batch_size), dtype=torch.long) | ||||
ans_score, last_tags = vscore.max(1) | ans_score, last_tags = vscore.max(1) | ||||
ans[idxes[0], batch_idx] = last_tags | ans[idxes[0], batch_idx] = last_tags | ||||
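A short sketch of calling the standalone viterbi_decode shown above; the alias line suggests it is importable from fastNLP.modules, and the (paths, scores) return order is an assumption::

    import torch
    from fastNLP.modules import viterbi_decode

    batch_size, seq_len, n_tags = 2, 6, 4
    logits = torch.randn(batch_size, seq_len, n_tags)
    transitions = torch.randn(n_tags, n_tags)        # transitions[i, j]: score of moving from tag i to tag j
    mask = torch.ones(batch_size, seq_len).byte()
    mask[1, 4:] = 0                                   # second sequence has only 4 real tokens

    paths, scores = viterbi_decode(logits, transitions, mask=mask, unpad=True)
    print(paths)                                      # with unpad=True: one list of tag ids per sequence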
@@ -5,10 +5,8 @@ import torch | |||||
class TimestepDropout(torch.nn.Dropout): | class TimestepDropout(torch.nn.Dropout): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.TimestepDropout` | |||||
Accepts input of shape ``[batch_size, num_timesteps, embedding_dim]`` and applies dropout with the same mask (shape ``(batch_size, embedding_dim)``) | |||||
at every timestep. | |||||
The input is expected to have shape ``(batch_size, num_timesteps, embedding_dim)``. | |||||
Dropout is applied at every timestep with one shared mask of shape ``(batch_size, embedding_dim)``. | |||||
""" | """ | ||||
def forward(self, x): | def forward(self, x): | ||||
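A small sketch of the behaviour described by the reworded docstring: one dropout mask per example, reused across all timesteps (assuming the class is importable from fastNLP.modules.dropout, the module path implied by the relative import in a later hunk)::

    import torch
    from fastNLP.modules.dropout import TimestepDropout

    drop = TimestepDropout(p=0.5)
    drop.train()
    x = torch.ones(2, 4, 8)                 # (batch_size, num_timesteps, embedding_dim)
    y = drop(x)
    print(torch.equal(y[:, 0], y[:, 1]))    # True: the same dimensions are dropped at every timestep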
@@ -1,17 +1,17 @@ | |||||
__all__ = [ | __all__ = [ | ||||
"BertModel", | |||||
# "BertModel", | |||||
"ConvolutionCharEncoder", | "ConvolutionCharEncoder", | ||||
"LSTMCharEncoder", | "LSTMCharEncoder", | ||||
"ConvMaxpool", | "ConvMaxpool", | ||||
"LSTM", | "LSTM", | ||||
"StarTransformer", | "StarTransformer", | ||||
"TransformerEncoder", | "TransformerEncoder", | ||||
"VarRNN", | "VarRNN", | ||||
"VarLSTM", | "VarLSTM", | ||||
"VarGRU", | "VarGRU", | ||||
@@ -8,8 +8,6 @@ import torch | |||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
from torch import nn | from torch import nn | ||||
from fastNLP.modules.dropout import TimestepDropout | |||||
from fastNLP.modules.utils import initial_parameter | from fastNLP.modules.utils import initial_parameter | ||||
@@ -18,7 +16,7 @@ class DotAttention(nn.Module): | |||||
.. todo:: | .. todo:: | ||||
Documentation to be added | Documentation to be added | ||||
""" | """ | ||||
def __init__(self, key_size, value_size, dropout=0.0): | def __init__(self, key_size, value_size, dropout=0.0): | ||||
super(DotAttention, self).__init__() | super(DotAttention, self).__init__() | ||||
self.key_size = key_size | self.key_size = key_size | ||||
@@ -26,7 +24,7 @@ class DotAttention(nn.Module): | |||||
self.scale = math.sqrt(key_size) | self.scale = math.sqrt(key_size) | ||||
self.drop = nn.Dropout(dropout) | self.drop = nn.Dropout(dropout) | ||||
self.softmax = nn.Softmax(dim=2) | self.softmax = nn.Softmax(dim=2) | ||||
def forward(self, Q, K, V, mask_out=None): | def forward(self, Q, K, V, mask_out=None): | ||||
""" | """ | ||||
@@ -45,7 +43,7 @@ class DotAttention(nn.Module): | |||||
class MultiHeadAttention(nn.Module): | class MultiHeadAttention(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.MultiHeadAttention` :class:`fastNLP.modules.encoder.attention.MultiHeadAttention` | |||||
Alias: :class:`fastNLP.modules.MultiHeadAttention` :class:`fastNLP.modules.encoder.MultiHeadAttention` | |||||
:param input_size: int, size of the input dimension; this is also the size of the output dimension. | :param input_size: int, size of the input dimension; this is also the size of the output dimension. | ||||
:param key_size: int, dimension of each head. | :param key_size: int, dimension of each head. | ||||
@@ -53,14 +51,14 @@ class MultiHeadAttention(nn.Module): | |||||
:param num_head: int, number of heads. | :param num_head: int, number of heads. | ||||
:param dropout: float. | :param dropout: float. | ||||
""" | """ | ||||
def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1): | def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1): | ||||
super(MultiHeadAttention, self).__init__() | super(MultiHeadAttention, self).__init__() | ||||
self.input_size = input_size | self.input_size = input_size | ||||
self.key_size = key_size | self.key_size = key_size | ||||
self.value_size = value_size | self.value_size = value_size | ||||
self.num_head = num_head | self.num_head = num_head | ||||
in_size = key_size * num_head | in_size = key_size * num_head | ||||
self.q_in = nn.Linear(input_size, in_size) | self.q_in = nn.Linear(input_size, in_size) | ||||
self.k_in = nn.Linear(input_size, in_size) | self.k_in = nn.Linear(input_size, in_size) | ||||
@@ -69,14 +67,14 @@ class MultiHeadAttention(nn.Module): | |||||
self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout) | self.attention = DotAttention(key_size=key_size, value_size=value_size, dropout=dropout) | ||||
self.out = nn.Linear(value_size * num_head, input_size) | self.out = nn.Linear(value_size * num_head, input_size) | ||||
self.reset_parameters() | self.reset_parameters() | ||||
def reset_parameters(self): | def reset_parameters(self): | ||||
sqrt = math.sqrt | sqrt = math.sqrt | ||||
nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) | nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) | ||||
nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) | nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size))) | ||||
nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size))) | nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size))) | ||||
nn.init.xavier_normal_(self.out.weight) | nn.init.xavier_normal_(self.out.weight) | ||||
def forward(self, Q, K, V, atte_mask_out=None): | def forward(self, Q, K, V, atte_mask_out=None): | ||||
""" | """ | ||||
@@ -92,7 +90,7 @@ class MultiHeadAttention(nn.Module): | |||||
q = self.q_in(Q).view(batch, sq, n_head, d_k) | q = self.q_in(Q).view(batch, sq, n_head, d_k) | ||||
k = self.k_in(K).view(batch, sk, n_head, d_k) | k = self.k_in(K).view(batch, sk, n_head, d_k) | ||||
v = self.v_in(V).view(batch, sk, n_head, d_v) | v = self.v_in(V).view(batch, sk, n_head, d_v) | ||||
# transpose q, k and v to do batch attention | # transpose q, k and v to do batch attention | ||||
q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k) | q = q.permute(2, 0, 1, 3).contiguous().view(-1, sq, d_k) | ||||
k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k) | k = k.permute(2, 0, 1, 3).contiguous().view(-1, sk, d_k) | ||||
@@ -100,7 +98,7 @@ class MultiHeadAttention(nn.Module): | |||||
if atte_mask_out is not None: | if atte_mask_out is not None: | ||||
atte_mask_out = atte_mask_out.repeat(n_head, 1, 1) | atte_mask_out = atte_mask_out.repeat(n_head, 1, 1) | ||||
atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v) | atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, sq, d_v) | ||||
# concat all heads, do output linear | # concat all heads, do output linear | ||||
atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1) | atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, sq, -1) | ||||
output = self.out(atte) | output = self.out(atte) | ||||
@@ -124,11 +122,11 @@ class BiAttention(nn.Module): | |||||
\end{array} | \end{array} | ||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
super(BiAttention, self).__init__() | super(BiAttention, self).__init__() | ||||
self.inf = 10e12 | self.inf = 10e12 | ||||
def forward(self, in_x1, in_x2, x1_len, x2_len): | def forward(self, in_x1, in_x2, x1_len, x2_len): | ||||
""" | """ | ||||
:param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] feature representation of the first sentence | :param torch.Tensor in_x1: [batch_size, x1_seq_len, hidden_size] feature representation of the first sentence | ||||
@@ -139,36 +137,36 @@ class BiAttention(nn.Module): | |||||
torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] attended feature representation for the second sentence | torch.Tensor out_x2: [batch_size, x2_seq_len, hidden_size] attended feature representation for the second sentence | ||||
""" | """ | ||||
assert in_x1.size()[0] == in_x2.size()[0] | assert in_x1.size()[0] == in_x2.size()[0] | ||||
assert in_x1.size()[2] == in_x2.size()[2] | assert in_x1.size()[2] == in_x2.size()[2] | ||||
# The batch size and hidden size must be equal. | # The batch size and hidden size must be equal. | ||||
assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1] | assert in_x1.size()[1] == x1_len.size()[1] and in_x2.size()[1] == x2_len.size()[1] | ||||
# The seq len in in_x and x_len must be equal. | # The seq len in in_x and x_len must be equal. | ||||
assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0] | assert in_x1.size()[0] == x1_len.size()[0] and x1_len.size()[0] == x2_len.size()[0] | ||||
batch_size = in_x1.size()[0] | batch_size = in_x1.size()[0] | ||||
x1_max_len = in_x1.size()[1] | x1_max_len = in_x1.size()[1] | ||||
x2_max_len = in_x2.size()[1] | x2_max_len = in_x2.size()[1] | ||||
in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len] | in_x2_t = torch.transpose(in_x2, 1, 2) # [batch_size, hidden_size, x2_seq_len] | ||||
attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len] | attention_matrix = torch.bmm(in_x1, in_x2_t) # [batch_size, x1_seq_len, x2_seq_len] | ||||
a_mask = x1_len.le(0.5).float() * -self.inf # [batch_size, x1_seq_len] | a_mask = x1_len.le(0.5).float() * -self.inf # [batch_size, x1_seq_len] | ||||
a_mask = a_mask.view(batch_size, x1_max_len, -1) | a_mask = a_mask.view(batch_size, x1_max_len, -1) | ||||
a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len] | a_mask = a_mask.expand(-1, -1, x2_max_len) # [batch_size, x1_seq_len, x2_seq_len] | ||||
b_mask = x2_len.le(0.5).float() * -self.inf | b_mask = x2_len.le(0.5).float() * -self.inf | ||||
b_mask = b_mask.view(batch_size, -1, x2_max_len) | b_mask = b_mask.view(batch_size, -1, x2_max_len) | ||||
b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len] | b_mask = b_mask.expand(-1, x1_max_len, -1) # [batch_size, x1_seq_len, x2_seq_len] | ||||
attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len] | attention_a = F.softmax(attention_matrix + a_mask, dim=2) # [batch_size, x1_seq_len, x2_seq_len] | ||||
attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len] | attention_b = F.softmax(attention_matrix + b_mask, dim=1) # [batch_size, x1_seq_len, x2_seq_len] | ||||
out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size] | out_x1 = torch.bmm(attention_a, in_x2) # [batch_size, x1_seq_len, hidden_size] | ||||
attention_b_t = torch.transpose(attention_b, 1, 2) | attention_b_t = torch.transpose(attention_b, 1, 2) | ||||
out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] | out_x2 = torch.bmm(attention_b_t, in_x1) # [batch_size, x2_seq_len, hidden_size] | ||||
return out_x1, out_x2 | return out_x1, out_x2 | ||||
@@ -182,10 +180,10 @@ class SelfAttention(nn.Module): | |||||
:param float drop: dropout probability, default 0.5 | :param float drop: dropout probability, default 0.5 | ||||
:param str initial_method: parameter initialization method | :param str initial_method: parameter initialization method | ||||
""" | """ | ||||
def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ): | def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ): | ||||
super(SelfAttention, self).__init__() | super(SelfAttention, self).__init__() | ||||
self.attention_hops = attention_hops | self.attention_hops = attention_hops | ||||
self.ws1 = nn.Linear(input_size, attention_unit, bias=False) | self.ws1 = nn.Linear(input_size, attention_unit, bias=False) | ||||
self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False) | self.ws2 = nn.Linear(attention_unit, attention_hops, bias=False) | ||||
@@ -194,7 +192,7 @@ class SelfAttention(nn.Module): | |||||
self.drop = nn.Dropout(drop) | self.drop = nn.Dropout(drop) | ||||
self.tanh = nn.Tanh() | self.tanh = nn.Tanh() | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
def _penalization(self, attention): | def _penalization(self, attention): | ||||
""" | """ | ||||
compute the penalization term for attention module | compute the penalization term for attention module | ||||
@@ -208,7 +206,7 @@ class SelfAttention(nn.Module): | |||||
mat = torch.bmm(attention, attention_t) - self.I[:attention.size(0)] | mat = torch.bmm(attention, attention_t) - self.I[:attention.size(0)] | ||||
ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5 | ret = (torch.sum(torch.sum((mat ** 2), 2), 1).squeeze() + 1e-10) ** 0.5 | ||||
return torch.sum(ret) / size[0] | return torch.sum(ret) / size[0] | ||||
def forward(self, input, input_origin): | def forward(self, input, input_origin): | ||||
""" | """ | ||||
:param torch.Tensor input: [baz, senLen, h_dim] the matrix to compute attention over | :param torch.Tensor input: [baz, senLen, h_dim] the matrix to compute attention over | ||||
@@ -218,14 +216,14 @@ class SelfAttention(nn.Module): | |||||
""" | """ | ||||
input = input.contiguous() | input = input.contiguous() | ||||
size = input.size() # [bsz, len, nhid] | size = input.size() # [bsz, len, nhid] | ||||
input_origin = input_origin.expand(self.attention_hops, -1, -1) # [hops,baz, len] | input_origin = input_origin.expand(self.attention_hops, -1, -1) # [hops,baz, len] | ||||
input_origin = input_origin.transpose(0, 1).contiguous() # [baz, hops,len] | input_origin = input_origin.transpose(0, 1).contiguous() # [baz, hops,len] | ||||
y1 = self.tanh(self.ws1(self.drop(input))) # [baz,len,dim] -->[bsz,len, attention-unit] | y1 = self.tanh(self.ws1(self.drop(input))) # [baz,len,dim] -->[bsz,len, attention-unit] | ||||
attention = self.ws2(y1).transpose(1, 2).contiguous() | attention = self.ws2(y1).transpose(1, 2).contiguous() | ||||
# [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] | # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] | ||||
attention = attention + (-999999 * (input_origin == 0).float()) # remove the weight on padding token. | attention = attention + (-999999 * (input_origin == 0).float()) # remove the weight on padding token. | ||||
attention = F.softmax(attention, 2) # [baz ,hop, len] | attention = F.softmax(attention, 2) # [baz ,hop, len] | ||||
return torch.bmm(attention, input), self._penalization(attention) # output1 --> [baz ,hop ,nhid] | return torch.bmm(attention, input), self._penalization(attention) # output1 --> [baz ,hop ,nhid] |
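A usage sketch for the MultiHeadAttention module in this file; parameter names come from the constructor shown above, and the import path from the updated alias line. Self-attention is simply the Q = K = V case::

    import torch
    from fastNLP.modules import MultiHeadAttention

    attn = MultiHeadAttention(input_size=64, key_size=16, value_size=16, num_head=4, dropout=0.1)
    x = torch.randn(2, 7, 64)       # (batch, seq_len, input_size)
    out = attn(x, x, x)             # self-attention; the output keeps the input shape
    print(out.shape)                # torch.Size([2, 7, 64])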
@@ -1,11 +1,11 @@ | |||||
""" | """ | ||||
The code on this page is largely based on (copy-pasted from) https://github.com/huggingface/pytorch-pretrained-BERT; if you find this code | The code on this page is largely based on (copy-pasted from) https://github.com/huggingface/pytorch-pretrained-BERT; if you find this code | ||||
useful, please also cite them. | useful, please also cite them. | ||||
""" | """ | ||||
__all__ = [ | |||||
"BertModel" | |||||
] | |||||
import collections | import collections | ||||
@@ -29,6 +29,7 @@ VOCAB_NAME = 'vocab.txt' | |||||
class BertConfig(object): | class BertConfig(object): | ||||
"""Configuration class to store the configuration of a `BertModel`. | """Configuration class to store the configuration of a `BertModel`. | ||||
""" | """ | ||||
def __init__(self, | def __init__(self, | ||||
vocab_size_or_config_json_file, | vocab_size_or_config_json_file, | ||||
hidden_size=768, | hidden_size=768, | ||||
@@ -67,8 +68,7 @@ class BertConfig(object): | |||||
initializing all weight matrices. | initializing all weight matrices. | ||||
layer_norm_eps: The epsilon used by LayerNorm. | layer_norm_eps: The epsilon used by LayerNorm. | ||||
""" | """ | ||||
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 | |||||
and isinstance(vocab_size_or_config_json_file, unicode)): | |||||
if isinstance(vocab_size_or_config_json_file, str): | |||||
with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: | ||||
json_config = json.loads(reader.read()) | json_config = json.loads(reader.read()) | ||||
for key, value in json_config.items(): | for key, value in json_config.items(): | ||||
@@ -153,6 +153,7 @@ class BertLayerNorm(nn.Module): | |||||
class BertEmbeddings(nn.Module): | class BertEmbeddings(nn.Module): | ||||
"""Construct the embeddings from word, position and token_type embeddings. | """Construct the embeddings from word, position and token_type embeddings. | ||||
""" | """ | ||||
def __init__(self, config): | def __init__(self, config): | ||||
super(BertEmbeddings, self).__init__() | super(BertEmbeddings, self).__init__() | ||||
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) | self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) | ||||
@@ -262,7 +263,7 @@ class BertIntermediate(nn.Module): | |||||
def __init__(self, config): | def __init__(self, config): | ||||
super(BertIntermediate, self).__init__() | super(BertIntermediate, self).__init__() | ||||
self.dense = nn.Linear(config.hidden_size, config.intermediate_size) | self.dense = nn.Linear(config.hidden_size, config.intermediate_size) | ||||
if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): | |||||
if isinstance(config.hidden_act, str): | |||||
self.intermediate_act_fn = ACT2FN[config.hidden_act] | self.intermediate_act_fn = ACT2FN[config.hidden_act] | ||||
else: | else: | ||||
self.intermediate_act_fn = config.hidden_act | self.intermediate_act_fn = config.hidden_act | ||||
@@ -334,7 +335,10 @@ class BertPooler(nn.Module): | |||||
class BertModel(nn.Module): | class BertModel(nn.Module): | ||||
"""BERT(Bidirectional Embedding Representations from Transformers). | |||||
""" | |||||
Alias: :class:`fastNLP.modules.BertModel` :class:`fastNLP.modules.encoder.BertModel` | |||||
BERT (Bidirectional Encoder Representations from Transformers). | |||||
If you want to use pretrained weight matrices, download them from the following URLs. | If you want to use pretrained weight matrices, download them from the following URLs. | ||||
sources:: | sources:: | ||||
@@ -576,6 +580,7 @@ def load_vocab(vocab_file): | |||||
index += 1 | index += 1 | ||||
return vocab | return vocab | ||||
class BasicTokenizer(object): | class BasicTokenizer(object): | ||||
"""Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" | ||||
@@ -761,8 +766,8 @@ class BertTokenizer(object): | |||||
[(ids, tok) for tok, ids in self.vocab.items()]) | [(ids, tok) for tok, ids in self.vocab.items()]) | ||||
self.do_basic_tokenize = do_basic_tokenize | self.do_basic_tokenize = do_basic_tokenize | ||||
if do_basic_tokenize: | if do_basic_tokenize: | ||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, | |||||
never_split=never_split) | |||||
self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, | |||||
never_split=never_split) | |||||
self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) | ||||
self.max_len = max_len if max_len is not None else int(1e12) | self.max_len = max_len if max_len is not None else int(1e12) | ||||
@@ -817,7 +822,7 @@ class BertTokenizer(object): | |||||
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): | ||||
if index != token_index: | if index != token_index: | ||||
print("Saving vocabulary to {}: vocabulary indices are not consecutive." | print("Saving vocabulary to {}: vocabulary indices are not consecutive." | ||||
" Please check that the vocabulary is not corrupted!".format(vocab_file)) | |||||
" Please check that the vocabulary is not corrupted!".format(vocab_file)) | |||||
index = token_index | index = token_index | ||||
writer.write(token + u'\n') | writer.write(token + u'\n') | ||||
index += 1 | index += 1 | ||||
@@ -837,13 +842,13 @@ class BertTokenizer(object): | |||||
tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) | tokenizer = cls(pretrained_model_name_or_path, *inputs, **kwargs) | ||||
return tokenizer | return tokenizer | ||||
class _WordPieceBertModel(nn.Module): | class _WordPieceBertModel(nn.Module): | ||||
""" | """ | ||||
This module computes results directly at the word_piece level. | This module computes results directly at the word_piece level. | ||||
""" | """ | ||||
def __init__(self, model_dir:str, layers:str='-1'): | |||||
def __init__(self, model_dir: str, layers: str = '-1'): | |||||
super().__init__() | super().__init__() | ||||
self.tokenzier = BertTokenizer.from_pretrained(model_dir) | self.tokenzier = BertTokenizer.from_pretrained(model_dir) | ||||
@@ -852,11 +857,11 @@ class _WordPieceBertModel(nn.Module): | |||||
encoder_layer_number = len(self.encoder.encoder.layer) | encoder_layer_number = len(self.encoder.encoder.layer) | ||||
self.layers = list(map(int, layers.split(','))) | self.layers = list(map(int, layers.split(','))) | ||||
for layer in self.layers: | for layer in self.layers: | ||||
if layer<0: | |||||
assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
if layer < 0: | |||||
assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | f"a bert model with {encoder_layer_number} layers." | ||||
else: | else: | ||||
assert layer<encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
assert layer < encoder_layer_number, f"The layer index:{layer} is out of scope for " \ | |||||
f"a bert model with {encoder_layer_number} layers." | f"a bert model with {encoder_layer_number} layers." | ||||
self._cls_index = self.tokenzier.vocab['[CLS]'] | self._cls_index = self.tokenzier.vocab['[CLS]'] | ||||
@@ -872,15 +877,16 @@ class _WordPieceBertModel(nn.Module): | |||||
:param field_name: the field (column) whose content is indexed | :param field_name: the field (column) whose content is indexed | ||||
:return: | :return: | ||||
""" | """ | ||||
def convert_words_to_word_pieces(words): | def convert_words_to_word_pieces(words): | ||||
word_pieces = [] | word_pieces = [] | ||||
for word in words: | for word in words: | ||||
tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word) | tokens = self.tokenzier.wordpiece_tokenizer.tokenize(word) | ||||
word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens) | word_piece_ids = self.tokenzier.convert_tokens_to_ids(tokens) | ||||
word_pieces.extend(word_piece_ids) | word_pieces.extend(word_piece_ids) | ||||
if word_pieces[0]!=self._cls_index: | |||||
if word_pieces[0] != self._cls_index: | |||||
word_pieces.insert(0, self._cls_index) | word_pieces.insert(0, self._cls_index) | ||||
if word_pieces[-1]!=self._sep_index: | |||||
if word_pieces[-1] != self._sep_index: | |||||
word_pieces.insert(-1, self._sep_index) | word_pieces.insert(-1, self._sep_index) | ||||
return word_pieces | return word_pieces | ||||
@@ -904,10 +910,9 @@ class _WordPieceBertModel(nn.Module): | |||||
attn_masks = word_pieces.ne(self._wordpiece_pad_index) | attn_masks = word_pieces.ne(self._wordpiece_pad_index) | ||||
bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, | bert_outputs, _ = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, | ||||
output_all_encoded_layers=True) | |||||
output_all_encoded_layers=True) | |||||
# output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | # output_layers = [self.layers] # len(self.layers) x batch_size x max_word_piece_length x hidden_size | ||||
outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) | outputs = bert_outputs[0].new_zeros((len(self.layers), batch_size, max_len, bert_outputs[0].size(-1))) | ||||
for l_index, l in enumerate(self.layers): | for l_index, l in enumerate(self.layers): | ||||
outputs[l_index] = bert_outputs[l] | outputs[l_index] = bert_outputs[l] | ||||
return outputs | return outputs | ||||
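The classes in this file keep the pytorch-pretrained-BERT interface they were copied from. A minimal sketch with a deliberately tiny, randomly initialized config; keyword names follow BertConfig.__init__ above, and the module path is an assumption (this diff drops BertModel from the package __all__, so the top-level import may not be exposed)::

    import torch
    from fastNLP.modules.encoder.bert import BertConfig, BertModel

    config = BertConfig(vocab_size_or_config_json_file=100, hidden_size=32,
                        num_hidden_layers=2, num_attention_heads=4, intermediate_size=64)
    model = BertModel(config)

    input_ids = torch.randint(1, 100, (2, 8))
    attention_mask = torch.ones_like(input_ids)
    encoded_layers, pooled = model(input_ids, attention_mask=attention_mask,
                                   output_all_encoded_layers=True)
    print(len(encoded_layers), encoded_layers[-1].shape, pooled.shape)   # 2, (2, 8, 32), (2, 32)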
@@ -11,7 +11,7 @@ from ..utils import initial_parameter | |||||
# from torch.nn.init import xavier_uniform | # from torch.nn.init import xavier_uniform | ||||
class ConvolutionCharEncoder(nn.Module): | class ConvolutionCharEncoder(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.ConvolutionCharEncoder` :class:`fastNLP.modules.encoder.char_encoder.ConvolutionCharEncoder` | |||||
Alias: :class:`fastNLP.modules.ConvolutionCharEncoder` :class:`fastNLP.modules.encoder.ConvolutionCharEncoder` | |||||
Char-level convolutional encoder. | Char-level convolutional encoder. | ||||
@@ -21,15 +21,16 @@ class ConvolutionCharEncoder(nn.Module): | |||||
:param tuple kernels: a tuple of ints; its length is the number of char-level convolutions, and the `i`-th int is the kernel size of the `i`-th convolution. | :param tuple kernels: a tuple of ints; its length is the number of char-level convolutions, and the `i`-th int is the kernel size of the `i`-th convolution. | ||||
:param initial_method: parameter initialization method, defaults to `xavier normal` | :param initial_method: parameter initialization method, defaults to `xavier normal` | ||||
""" | """ | ||||
def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None): | def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None): | ||||
super(ConvolutionCharEncoder, self).__init__() | super(ConvolutionCharEncoder, self).__init__() | ||||
self.convs = nn.ModuleList([ | self.convs = nn.ModuleList([ | ||||
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, kernels[i]//2)) | |||||
nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, | |||||
padding=(0, kernels[i] // 2)) | |||||
for i in range(len(kernels))]) | for i in range(len(kernels))]) | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.Tensor x: ``[batch_size * sent_length, word_length, char_emb_size]`` input character embeddings | :param torch.Tensor x: ``[batch_size * sent_length, word_length, char_emb_size]`` input character embeddings | ||||
@@ -40,7 +41,7 @@ class ConvolutionCharEncoder(nn.Module): | |||||
x = x.transpose(2, 3) | x = x.transpose(2, 3) | ||||
# [batch_size*sent_length, channel, height, width] | # [batch_size*sent_length, channel, height, width] | ||||
return self._convolute(x).unsqueeze(2) | return self._convolute(x).unsqueeze(2) | ||||
def _convolute(self, x): | def _convolute(self, x): | ||||
feats = [] | feats = [] | ||||
for conv in self.convs: | for conv in self.convs: | ||||
@@ -57,13 +58,13 @@ class ConvolutionCharEncoder(nn.Module): | |||||
class LSTMCharEncoder(nn.Module): | class LSTMCharEncoder(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.LSTMCharEncoder` :class:`fastNLP.modules.encoder.char_encoder.LSTMCharEncoder` | |||||
Alias: :class:`fastNLP.modules.LSTMCharEncoder` :class:`fastNLP.modules.encoder.LSTMCharEncoder` | |||||
Char-level LSTM-based encoder. | Char-level LSTM-based encoder. | ||||
""" | """ | ||||
def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): | def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): | ||||
""" | """ | ||||
:param int char_emb_size: dimension of the char-level embedding. Default: 50 | :param int char_emb_size: dimension of the char-level embedding. Default: 50 | ||||
@@ -73,14 +74,14 @@ class LSTMCharEncoder(nn.Module): | |||||
""" | """ | ||||
super(LSTMCharEncoder, self).__init__() | super(LSTMCharEncoder, self).__init__() | ||||
self.hidden_size = char_emb_size if hidden_size is None else hidden_size | self.hidden_size = char_emb_size if hidden_size is None else hidden_size | ||||
self.lstm = nn.LSTM(input_size=char_emb_size, | self.lstm = nn.LSTM(input_size=char_emb_size, | ||||
hidden_size=self.hidden_size, | hidden_size=self.hidden_size, | ||||
num_layers=1, | num_layers=1, | ||||
bias=True, | bias=True, | ||||
batch_first=True) | batch_first=True) | ||||
initial_parameter(self, initial_method) | initial_parameter(self, initial_method) | ||||
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.Tensor x: ``[ n_batch*n_word, word_length, char_emb_size]`` input character embeddings | :param torch.Tensor x: ``[ n_batch*n_word, word_length, char_emb_size]`` input character embeddings | ||||
@@ -91,6 +92,6 @@ class LSTMCharEncoder(nn.Module): | |||||
h0 = nn.init.orthogonal_(h0) | h0 = nn.init.orthogonal_(h0) | ||||
c0 = torch.empty(1, batch_size, self.hidden_size) | c0 = torch.empty(1, batch_size, self.hidden_size) | ||||
c0 = nn.init.orthogonal_(c0) | c0 = nn.init.orthogonal_(c0) | ||||
_, hidden = self.lstm(x, (h0, c0)) | _, hidden = self.lstm(x, (h0, c0)) | ||||
return hidden[0].squeeze().unsqueeze(2) | return hidden[0].squeeze().unsqueeze(2) |
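A sketch of the convolutional char encoder above; constructor defaults are the ones shown in this hunk, the import path comes from the updated alias line, and the expected output shape follows the _convolute / unsqueeze logic::

    import torch
    from fastNLP.modules import ConvolutionCharEncoder

    encoder = ConvolutionCharEncoder(char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5))
    chars = torch.randn(8 * 6, 7, 50)   # 8 sentences x 6 words, 7 chars per word, 50-dim char embeddings
    out = encoder(chars)
    print(out.shape)                    # (48, sum(feature_maps), 1) = (48, 100, 1), one vector per word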
@@ -5,9 +5,10 @@ import torch | |||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch.nn.functional as F | import torch.nn.functional as F | ||||
class ConvMaxpool(nn.Module): | class ConvMaxpool(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.conv_maxpool.ConvMaxpool` | |||||
Alias: :class:`fastNLP.modules.ConvMaxpool` :class:`fastNLP.modules.encoder.ConvMaxpool` | |||||
A layer that combines Convolution and Max-Pooling. Given an input of batch_size x max_len x input_size, it returns a batch_size x | A layer that combines Convolution and Max-Pooling. Given an input of batch_size x max_len x input_size, it returns a batch_size x | ||||
sum(output_channels) matrix. Internally, the input is first convolved by CNNs, passed through the activation, and then max-pooled over the length (max_len) | sum(output_channels) matrix. Internally, the input is first convolved by CNNs, passed through the activation, and then max-pooled over the length (max_len) | ||||
@@ -18,12 +19,12 @@ class ConvMaxpool(nn.Module): | |||||
:param int,tuple(int) kernel_sizes: kernel size(s) for the output channels. | :param int,tuple(int) kernel_sizes: kernel size(s) for the output channels. | ||||
:param str activation: the convolution output is passed through this activation before max-pooling. Supports relu, sigmoid, tanh | :param str activation: the convolution output is passed through this activation before max-pooling. Supports relu, sigmoid, tanh | ||||
""" | """ | ||||
def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"): | def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"): | ||||
super(ConvMaxpool, self).__init__() | super(ConvMaxpool, self).__init__() | ||||
for kernel_size in kernel_sizes: | for kernel_size in kernel_sizes: | ||||
assert kernel_size%2==1, "kernel size has to be odd numbers." | |||||
assert kernel_size % 2 == 1, "kernel size has to be odd numbers." | |||||
# convolution | # convolution | ||||
if isinstance(kernel_sizes, (list, tuple, int)): | if isinstance(kernel_sizes, (list, tuple, int)): | ||||
@@ -36,22 +37,22 @@ class ConvMaxpool(nn.Module): | |||||
" of kernel_sizes." | " of kernel_sizes." | ||||
else: | else: | ||||
raise ValueError("The type of out_channels and kernel_sizes should be the same.") | raise ValueError("The type of out_channels and kernel_sizes should be the same.") | ||||
self.convs = nn.ModuleList([nn.Conv1d( | self.convs = nn.ModuleList([nn.Conv1d( | ||||
in_channels=in_channels, | in_channels=in_channels, | ||||
out_channels=oc, | out_channels=oc, | ||||
kernel_size=ks, | kernel_size=ks, | ||||
stride=1, | stride=1, | ||||
padding=ks//2, | |||||
padding=ks // 2, | |||||
dilation=1, | dilation=1, | ||||
groups=1, | groups=1, | ||||
bias=None) | bias=None) | ||||
for oc, ks in zip(out_channels, kernel_sizes)]) | for oc, ks in zip(out_channels, kernel_sizes)]) | ||||
else: | else: | ||||
raise Exception( | raise Exception( | ||||
'Incorrect kernel sizes: should be list, tuple or int') | 'Incorrect kernel sizes: should be list, tuple or int') | ||||
# activation function | # activation function | ||||
if activation == 'relu': | if activation == 'relu': | ||||
self.activation = F.relu | self.activation = F.relu | ||||
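A usage sketch for ConvMaxpool: one odd-sized 1-d convolution per entry of out_channels, followed by max-pooling over the length dimension. The import path comes from the alias line; the forward signature is assumed from the released API::

    import torch
    from fastNLP.modules import ConvMaxpool

    conv_pool = ConvMaxpool(in_channels=128, out_channels=[30, 40, 50], kernel_sizes=[1, 3, 5])
    x = torch.randn(4, 20, 128)     # (batch_size, max_len, input_size)
    out = conv_pool(x)
    print(out.shape)                # (4, 120): sum(out_channels) features per sentence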
@@ -10,9 +10,10 @@ import torch | |||||
import torch.nn as nn | import torch.nn as nn | ||||
import torch.nn.utils.rnn as rnn | import torch.nn.utils.rnn as rnn | ||||
class LSTM(nn.Module): | class LSTM(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.lstm.LSTM` | |||||
Alias: :class:`fastNLP.modules.LSTM` :class:`fastNLP.modules.encoder.LSTM` | |||||
LSTM module, a light wrapper around the PyTorch LSTM. When seq_len is provided, pack_padded_sequence is used automatically; by default the forget gate bias is initialized | LSTM module, a light wrapper around the PyTorch LSTM. When seq_len is provided, pack_padded_sequence is used automatically; by default the forget gate bias is initialized | ||||
to 1, and it handles the issues of using an LSTM inside DataParallel. | to 1, and it handles the issues of using an LSTM inside DataParallel. | ||||
@@ -26,7 +27,7 @@ class LSTM(nn.Module): | |||||
:(batch, seq, feature). Default: ``False`` | :(batch, seq, feature). Default: ``False`` | ||||
:param bias: if ``False``, the model will not use bias. Default: ``True`` | :param bias: if ``False``, the model will not use bias. Default: ``True`` | ||||
""" | """ | ||||
def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, | def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, | ||||
bidirectional=False, bias=True): | bidirectional=False, bias=True): | ||||
super(LSTM, self).__init__() | super(LSTM, self).__init__() | ||||
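A sketch of the LSTM wrapper; passing seq_len triggers the pack_padded_sequence path mentioned in the docstring. The seq_len keyword and the (output, (h, c)) return structure are assumptions based on the released API::

    import torch
    from fastNLP.modules import LSTM

    lstm = LSTM(input_size=32, hidden_size=64, num_layers=1, batch_first=True, bidirectional=True)
    x = torch.randn(3, 10, 32)
    seq_len = torch.tensor([10, 7, 4])          # true lengths of the padded sequences
    output, (h, c) = lstm(x, seq_len=seq_len)
    print(output.shape)                         # (3, 10, 128) for a bidirectional model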
@@ -10,7 +10,7 @@ import torch.nn as nn | |||||
class MaxPool(nn.Module): | class MaxPool(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.MaxPool` :class:`fastNLP.modules.encoder.pooling.MaxPool` | |||||
Alias: :class:`fastNLP.modules.MaxPool` :class:`fastNLP.modules.encoder.MaxPool` | |||||
Max-pooling module. | Max-pooling module. | ||||
@@ -21,9 +21,9 @@ class MaxPool(nn.Module): | |||||
:param kernel_size: window size for max pooling; defaults to the tensor's last k dimensions, where k is ``dimension`` | :param kernel_size: window size for max pooling; defaults to the tensor's last k dimensions, where k is ``dimension`` | ||||
:param ceil_mode: | :param ceil_mode: | ||||
""" | """ | ||||
def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False): | def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False): | ||||
super(MaxPool, self).__init__() | super(MaxPool, self).__init__() | ||||
assert (1 <= dimension) and (dimension <= 3) | assert (1 <= dimension) and (dimension <= 3) | ||||
self.dimension = dimension | self.dimension = dimension | ||||
@@ -32,7 +32,7 @@ class MaxPool(nn.Module): | |||||
self.dilation = dilation | self.dilation = dilation | ||||
self.kernel_size = kernel_size | self.kernel_size = kernel_size | ||||
self.ceil_mode = ceil_mode | self.ceil_mode = ceil_mode | ||||
def forward(self, x): | def forward(self, x): | ||||
if self.dimension == 1: | if self.dimension == 1: | ||||
pooling = nn.MaxPool1d( | pooling = nn.MaxPool1d( | ||||
@@ -59,15 +59,15 @@ class MaxPool(nn.Module): | |||||
class MaxPoolWithMask(nn.Module): | class MaxPoolWithMask(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.MaxPoolWithMask` :class:`fastNLP.modules.encoder.pooling.MaxPoolWithMask` | |||||
Alias: :class:`fastNLP.modules.MaxPoolWithMask` :class:`fastNLP.modules.encoder.MaxPoolWithMask` | |||||
Max pooling with a mask matrix; positions where the mask value is 0 are ignored during max-pooling. | Max pooling with a mask matrix; positions where the mask value is 0 are ignored during max-pooling. | ||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
super(MaxPoolWithMask, self).__init__() | super(MaxPoolWithMask, self).__init__() | ||||
self.inf = 10e12 | self.inf = 10e12 | ||||
def forward(self, tensor, mask, dim=1): | def forward(self, tensor, mask, dim=1): | ||||
""" | """ | ||||
:param torch.FloatTensor tensor: [batch_size, seq_len, channels] the input tensor | :param torch.FloatTensor tensor: [batch_size, seq_len, channels] the input tensor | ||||
@@ -82,11 +82,11 @@ class MaxPoolWithMask(nn.Module): | |||||
class KMaxPool(nn.Module): | class KMaxPool(nn.Module): | ||||
"""K max-pooling module.""" | """K max-pooling module.""" | ||||
def __init__(self, k=1): | def __init__(self, k=1): | ||||
super(KMaxPool, self).__init__() | super(KMaxPool, self).__init__() | ||||
self.k = k | self.k = k | ||||
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.Tensor x: [N, C, L] the input tensor | :param torch.Tensor x: [N, C, L] the input tensor | ||||
@@ -99,16 +99,16 @@ class KMaxPool(nn.Module): | |||||
class AvgPool(nn.Module): | class AvgPool(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.AvgPool` :class:`fastNLP.modules.encoder.pooling.AvgPool` | |||||
Alias: :class:`fastNLP.modules.AvgPool` :class:`fastNLP.modules.encoder.AvgPool` | |||||
Given an input of shape [batch_size, max_len, hidden_size], performs avg pooling over the last dimension. The output is [batch_size, hidden_size] | Given an input of shape [batch_size, max_len, hidden_size], performs avg pooling over the last dimension. The output is [batch_size, hidden_size] | ||||
""" | """ | ||||
def __init__(self, stride=None, padding=0): | def __init__(self, stride=None, padding=0): | ||||
super(AvgPool, self).__init__() | super(AvgPool, self).__init__() | ||||
self.stride = stride | self.stride = stride | ||||
self.padding = padding | self.padding = padding | ||||
def forward(self, x): | def forward(self, x): | ||||
""" | """ | ||||
:param torch.Tensor x: [N, C, L] the input tensor | :param torch.Tensor x: [N, C, L] the input tensor | ||||
@@ -126,16 +126,16 @@ class AvgPool(nn.Module): | |||||
class AvgPoolWithMask(nn.Module): | class AvgPoolWithMask(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.AvgPoolWithMask` :class:`fastNLP.modules.encoder.pooling.AvgPoolWithMask` | |||||
Alias: :class:`fastNLP.modules.AvgPoolWithMask` :class:`fastNLP.modules.encoder.AvgPoolWithMask` | |||||
Given an input of shape [batch_size, max_len, hidden_size], performs avg pooling over the last dimension. The output is [batch_size, hidden_size]; the pooling | Given an input of shape [batch_size, max_len, hidden_size], performs avg pooling over the last dimension. The output is [batch_size, hidden_size]; the pooling | ||||
only takes positions where the mask is 1 into account | only takes positions where the mask is 1 into account | ||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
super(AvgPoolWithMask, self).__init__() | super(AvgPoolWithMask, self).__init__() | ||||
self.inf = 10e12 | self.inf = 10e12 | ||||
def forward(self, tensor, mask, dim=1): | def forward(self, tensor, mask, dim=1): | ||||
""" | """ | ||||
:param torch.FloatTensor tensor: [batch_size, seq_len, channels] the input tensor | :param torch.FloatTensor tensor: [batch_size, seq_len, channels] the input tensor | ||||
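A sketch of the masked pooling modules above; the mask marks real tokens with 1, and masked positions are excluded from the max / average. Imports follow the alias lines::

    import torch
    from fastNLP.modules import MaxPoolWithMask, AvgPoolWithMask

    tensor = torch.randn(2, 5, 16)                              # (batch_size, seq_len, channels)
    mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 1, 1, 1]])
    max_pooled = MaxPoolWithMask()(tensor, mask)                # padded positions never win the max
    avg_pooled = AvgPoolWithMask()(tensor, mask)                # average only over positions with mask == 1
    print(max_pooled.shape, avg_pooled.shape)                   # both (2, 16)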
@@ -13,7 +13,7 @@ from torch.nn import functional as F | |||||
class StarTransformer(nn.Module): | class StarTransformer(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.StarTransformer` :class:`fastNLP.modules.encoder.star_transformer.StarTransformer` | |||||
Alias: :class:`fastNLP.modules.StarTransformer` :class:`fastNLP.modules.encoder.StarTransformer` | |||||
The encoder part of the Star-Transformer. Takes a 3-d text input and returns a text encoding of the same length | The encoder part of the Star-Transformer. Takes a 3-d text input and returns a text encoding of the same length | ||||
@@ -29,11 +29,11 @@ class StarTransformer(nn.Module): | |||||
the model adds a position embedding to the input sequence. | the model adds a position embedding to the input sequence. | ||||
If `None`, the position-embedding step is skipped. Default: `None` | If `None`, the position-embedding step is skipped. Default: `None` | ||||
""" | """ | ||||
def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None): | def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None): | ||||
super(StarTransformer, self).__init__() | super(StarTransformer, self).__init__() | ||||
self.iters = num_layers | self.iters = num_layers | ||||
self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)]) | self.norm = nn.ModuleList([nn.LayerNorm(hidden_size, eps=1e-6) for _ in range(self.iters)]) | ||||
# self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1) | # self.emb_fc = nn.Conv2d(hidden_size, hidden_size, 1) | ||||
self.emb_drop = nn.Dropout(dropout) | self.emb_drop = nn.Dropout(dropout) | ||||
@@ -43,12 +43,12 @@ class StarTransformer(nn.Module): | |||||
self.star_att = nn.ModuleList( | self.star_att = nn.ModuleList( | ||||
[_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0) | [_MSA2(hidden_size, nhead=num_head, head_dim=head_dim, dropout=0.0) | ||||
for _ in range(self.iters)]) | for _ in range(self.iters)]) | ||||
if max_len is not None: | if max_len is not None: | ||||
self.pos_emb = nn.Embedding(max_len, hidden_size) | self.pos_emb = nn.Embedding(max_len, hidden_size) | ||||
else: | else: | ||||
self.pos_emb = None | self.pos_emb = None | ||||
def forward(self, data, mask): | def forward(self, data, mask): | ||||
""" | """ | ||||
:param FloatTensor data: [batch, length, hidden] the input sequence | :param FloatTensor data: [batch, length, hidden] the input sequence | ||||
@@ -58,15 +58,15 @@ class StarTransformer(nn.Module): | |||||
[batch, hidden] the global relay node; see the paper for details | [batch, hidden] the global relay node; see the paper for details | ||||
""" | """ | ||||
def norm_func(f, x): | def norm_func(f, x): | ||||
# B, H, L, 1 | # B, H, L, 1 | ||||
return f(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) | return f(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) | ||||
B, L, H = data.size() | B, L, H = data.size() | ||||
mask = (mask == 0) # flip the mask for masked_fill_ | mask = (mask == 0) # flip the mask for masked_fill_ | ||||
smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1) | smask = torch.cat([torch.zeros(B, 1, ).byte().to(mask), mask], 1) | ||||
embs = data.permute(0, 2, 1)[:, :, :, None] # B H L 1 | embs = data.permute(0, 2, 1)[:, :, :, None] # B H L 1 | ||||
if self.pos_emb and False: | if self.pos_emb and False: | ||||
P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device) \ | P = self.pos_emb(torch.arange(L, dtype=torch.long, device=embs.device) \ | ||||
@@ -80,13 +80,13 @@ class StarTransformer(nn.Module): | |||||
for i in range(self.iters): | for i in range(self.iters): | ||||
ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2) | ax = torch.cat([r_embs, relay.expand(B, H, 1, L)], 2) | ||||
nodes = F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax)) | nodes = F.leaky_relu(self.ring_att[i](norm_func(self.norm[i], nodes), ax=ax)) | ||||
#nodes = F.leaky_relu(self.ring_att[i](nodes, ax=ax)) | |||||
# nodes = F.leaky_relu(self.ring_att[i](nodes, ax=ax)) | |||||
relay = F.leaky_relu(self.star_att[i](relay, torch.cat([relay, nodes], 2), smask)) | relay = F.leaky_relu(self.star_att[i](relay, torch.cat([relay, nodes], 2), smask)) | ||||
nodes = nodes.masked_fill_(ex_mask, 0) | nodes = nodes.masked_fill_(ex_mask, 0) | ||||
nodes = nodes.view(B, H, L).permute(0, 2, 1) | nodes = nodes.view(B, H, L).permute(0, 2, 1) | ||||
return nodes, relay.view(B, H) | return nodes, relay.view(B, H) | ||||
@@ -99,19 +99,19 @@ class _MSA1(nn.Module): | |||||
self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | ||||
self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | ||||
self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | ||||
self.drop = nn.Dropout(dropout) | self.drop = nn.Dropout(dropout) | ||||
# print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | ||||
self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | ||||
def forward(self, x, ax=None): | def forward(self, x, ax=None): | ||||
# x: B, H, L, 1, ax : B, H, X, L append features | # x: B, H, L, 1, ax : B, H, X, L append features | ||||
nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | ||||
B, H, L, _ = x.shape | B, H, L, _ = x.shape | ||||
q, k, v = self.WQ(x), self.WK(x), self.WV(x) # x: (B,H,L,1) | q, k, v = self.WQ(x), self.WK(x), self.WV(x) # x: (B,H,L,1) | ||||
if ax is not None: | if ax is not None: | ||||
aL = ax.shape[2] | aL = ax.shape[2] | ||||
ak = self.WK(ax).view(B, nhead, head_dim, aL, L) | ak = self.WK(ax).view(B, nhead, head_dim, aL, L) | ||||
@@ -124,12 +124,12 @@ class _MSA1(nn.Module): | |||||
if ax is not None: | if ax is not None: | ||||
k = torch.cat([k, ak], 3) | k = torch.cat([k, ak], 3) | ||||
v = torch.cat([v, av], 3) | v = torch.cat([v, av], 3) | ||||
alphas = self.drop(F.softmax((q * k).sum(2, keepdim=True) / NP.sqrt(head_dim), 3)) # B N L 1 U | alphas = self.drop(F.softmax((q * k).sum(2, keepdim=True) / NP.sqrt(head_dim), 3)) # B N L 1 U | ||||
att = (alphas * v).sum(3).view(B, nhead * head_dim, L, 1) | att = (alphas * v).sum(3).view(B, nhead * head_dim, L, 1) | ||||
ret = self.WO(att) | ret = self.WO(att) | ||||
return ret | return ret | ||||
@@ -141,19 +141,19 @@ class _MSA2(nn.Module): | |||||
self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | self.WK = nn.Conv2d(nhid, nhead * head_dim, 1) | ||||
self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | self.WV = nn.Conv2d(nhid, nhead * head_dim, 1) | ||||
self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | self.WO = nn.Conv2d(nhead * head_dim, nhid, 1) | ||||
self.drop = nn.Dropout(dropout) | self.drop = nn.Dropout(dropout) | ||||
# print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | # print('NUM_HEAD', nhead, 'DIM_HEAD', head_dim) | ||||
self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | self.nhid, self.nhead, self.head_dim, self.unfold_size = nhid, nhead, head_dim, 3 | ||||
def forward(self, x, y, mask=None): | def forward(self, x, y, mask=None): | ||||
# x: B, H, 1, 1, 1 y: B H L 1 | # x: B, H, 1, 1, 1 y: B H L 1 | ||||
nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | nhid, nhead, head_dim, unfold_size = self.nhid, self.nhead, self.head_dim, self.unfold_size | ||||
B, H, L, _ = y.shape | B, H, L, _ = y.shape | ||||
q, k, v = self.WQ(x), self.WK(y), self.WV(y) | q, k, v = self.WQ(x), self.WK(y), self.WV(y) | ||||
q = q.view(B, nhead, 1, head_dim) # B, H, 1, 1 -> B, N, 1, h | q = q.view(B, nhead, 1, head_dim) # B, H, 1, 1 -> B, N, 1, h | ||||
k = k.view(B, nhead, head_dim, L) # B, H, L, 1 -> B, N, h, L | k = k.view(B, nhead, head_dim, L) # B, H, L, 1 -> B, N, h, L | ||||
v = v.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h | v = v.view(B, nhead, head_dim, L).permute(0, 1, 3, 2) # B, H, L, 1 -> B, N, L, h | ||||
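A usage sketch of the StarTransformer encoder; constructor arguments follow the __init__ shown above, and the two return values are the per-token states and the relay node from forward. The parts of forward not shown in this hunk are assumed to match the released implementation::

    import torch
    from fastNLP.modules import StarTransformer

    encoder = StarTransformer(hidden_size=64, num_layers=2, num_head=4, head_dim=16, dropout=0.1)
    data = torch.randn(2, 9, 64)        # (batch, length, hidden)
    mask = torch.ones(2, 9).byte()
    mask[1, 6:] = 0                     # second sequence has only 6 real tokens
    nodes, relay = encoder(data, mask)
    print(nodes.shape, relay.shape)     # (2, 9, 64) and (2, 64)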
@@ -9,7 +9,7 @@ from ..dropout import TimestepDropout | |||||
class TransformerEncoder(nn.Module): | class TransformerEncoder(nn.Module): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.TransformerEncoder` :class:`fastNLP.modules.encoder.transformer.TransformerEncoder` | |||||
Alias: :class:`fastNLP.modules.TransformerEncoder` :class:`fastNLP.modules.encoder.TransformerEncoder` | |||||
The transformer encoder module, without the embedding layer | The transformer encoder module, without the embedding layer | ||||
@@ -22,7 +22,7 @@ class TransformerEncoder(nn.Module): | |||||
:param int num_head: number of heads. | :param int num_head: number of heads. | ||||
:param float dropout: dropout probability. Default: 0.1 | :param float dropout: dropout probability. Default: 0.1 | ||||
""" | """ | ||||
class SubLayer(nn.Module): | class SubLayer(nn.Module): | ||||
def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): | def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1): | ||||
super(TransformerEncoder.SubLayer, self).__init__() | super(TransformerEncoder.SubLayer, self).__init__() | ||||
@@ -33,7 +33,7 @@ class TransformerEncoder(nn.Module): | |||||
nn.Linear(inner_size, model_size), | nn.Linear(inner_size, model_size), | ||||
TimestepDropout(dropout), ) | TimestepDropout(dropout), ) | ||||
self.norm2 = nn.LayerNorm(model_size) | self.norm2 = nn.LayerNorm(model_size) | ||||
def forward(self, input, seq_mask=None, atte_mask_out=None): | def forward(self, input, seq_mask=None, atte_mask_out=None): | ||||
""" | """ | ||||
@@ -48,11 +48,11 @@ class TransformerEncoder(nn.Module): | |||||
output = self.norm2(output + norm_atte) | output = self.norm2(output + norm_atte) | ||||
output *= seq_mask | output *= seq_mask | ||||
return output | return output | ||||
def __init__(self, num_layers, **kargs): | def __init__(self, num_layers, **kargs): | ||||
super(TransformerEncoder, self).__init__() | super(TransformerEncoder, self).__init__() | ||||
self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) | self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)]) | ||||
def forward(self, x, seq_mask=None): | def forward(self, x, seq_mask=None): | ||||
""" | """ | ||||
:param x: [batch, seq_len, model_size] the input sequence | :param x: [batch, seq_len, model_size] the input sequence | ||||
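TransformerEncoder builds num_layers copies of the SubLayer shown above, so every SubLayer constructor argument is passed through **kargs. A minimal sketch; the internal reshaping of seq_mask inside forward is assumed to match the released implementation::

    import torch
    from fastNLP.modules import TransformerEncoder

    encoder = TransformerEncoder(num_layers=2, model_size=64, inner_size=128,
                                 key_size=16, value_size=16, num_head=4, dropout=0.1)
    x = torch.randn(2, 9, 64)           # (batch, seq_len, model_size)
    seq_mask = torch.ones(2, 9)         # 1 for real tokens, 0 for padding
    out = encoder(x, seq_mask=seq_mask)
    print(out.shape)                    # (2, 9, 64)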
@@ -28,14 +28,14 @@ class VarRnnCellWrapper(nn.Module): | |||||
""" | """ | ||||
Wrapper for normal RNN Cells, make it support variational dropout | Wrapper for normal RNN Cells, make it support variational dropout | ||||
""" | """ | ||||
def __init__(self, cell, hidden_size, input_p, hidden_p): | def __init__(self, cell, hidden_size, input_p, hidden_p): | ||||
super(VarRnnCellWrapper, self).__init__() | super(VarRnnCellWrapper, self).__init__() | ||||
self.cell = cell | self.cell = cell | ||||
self.hidden_size = hidden_size | self.hidden_size = hidden_size | ||||
self.input_p = input_p | self.input_p = input_p | ||||
self.hidden_p = hidden_p | self.hidden_p = hidden_p | ||||
def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False): | def forward(self, input_x, hidden, mask_x, mask_h, is_reversed=False): | ||||
""" | """ | ||||
:param PackedSequence input_x: [seq_len, batch_size, input_size] | :param PackedSequence input_x: [seq_len, batch_size, input_size] | ||||
@@ -47,13 +47,13 @@ class VarRnnCellWrapper(nn.Module): | |||||
hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] | hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] | ||||
for other RNN, h_n, [batch_size, hidden_size] | for other RNN, h_n, [batch_size, hidden_size] | ||||
""" | """ | ||||
def get_hi(hi, h0, size): | def get_hi(hi, h0, size): | ||||
h0_size = size - hi.size(0) | h0_size = size - hi.size(0) | ||||
if h0_size > 0: | if h0_size > 0: | ||||
return torch.cat([hi, h0[:h0_size]], dim=0) | return torch.cat([hi, h0[:h0_size]], dim=0) | ||||
return hi[:size] | return hi[:size] | ||||
is_lstm = isinstance(hidden, tuple) | is_lstm = isinstance(hidden, tuple) | ||||
input, batch_sizes = input_x.data, input_x.batch_sizes | input, batch_sizes = input_x.data, input_x.batch_sizes | ||||
output = [] | output = [] | ||||
@@ -64,7 +64,7 @@ class VarRnnCellWrapper(nn.Module): | |||||
else: | else: | ||||
batch_iter = batch_sizes | batch_iter = batch_sizes | ||||
idx = 0 | idx = 0 | ||||
if is_lstm: | if is_lstm: | ||||
hn = (hidden[0].clone(), hidden[1].clone()) | hn = (hidden[0].clone(), hidden[1].clone()) | ||||
else: | else: | ||||
@@ -91,7 +91,7 @@ class VarRnnCellWrapper(nn.Module): | |||||
hi = cell(input_i, hi) | hi = cell(input_i, hi) | ||||
hn[:size] = hi | hn[:size] = hi | ||||
output.append(hi) | output.append(hi) | ||||
if is_reversed: | if is_reversed: | ||||
output = list(reversed(output)) | output = list(reversed(output)) | ||||
output = torch.cat(output, dim=0) | output = torch.cat(output, dim=0) | ||||
@@ -117,7 +117,7 @@ class VarRNNBase(nn.Module): | |||||
:param hidden_dropout: dropout probability for each hidden state. Default: 0 | :param hidden_dropout: dropout probability for each hidden state. Default: 0 | ||||
:param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False`` | :param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False`` | ||||
""" | """ | ||||
def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, | def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, | ||||
bias=True, batch_first=False, | bias=True, batch_first=False, | ||||
input_dropout=0, hidden_dropout=0, bidirectional=False): | input_dropout=0, hidden_dropout=0, bidirectional=False): | ||||
@@ -141,7 +141,7 @@ class VarRNNBase(nn.Module): | |||||
cell, self.hidden_size, input_dropout, hidden_dropout)) | cell, self.hidden_size, input_dropout, hidden_dropout)) | ||||
initial_parameter(self) | initial_parameter(self) | ||||
self.is_lstm = (self.mode == "LSTM") | self.is_lstm = (self.mode == "LSTM") | ||||
def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h): | def _forward_one(self, n_layer, n_direction, input, hx, mask_x, mask_h): | ||||
is_lstm = self.is_lstm | is_lstm = self.is_lstm | ||||
idx = self.num_directions * n_layer + n_direction | idx = self.num_directions * n_layer + n_direction | ||||
@@ -150,7 +150,7 @@ class VarRNNBase(nn.Module): | |||||
output_x, hidden_x = cell( | output_x, hidden_x = cell( | ||||
input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) | input, hi, mask_x, mask_h, is_reversed=(n_direction == 1)) | ||||
return output_x, hidden_x | return output_x, hidden_x | ||||
def forward(self, x, hx=None): | def forward(self, x, hx=None): | ||||
""" | """ | ||||
@@ -170,13 +170,13 @@ class VarRNNBase(nn.Module): | |||||
else: | else: | ||||
max_batch_size = int(x.batch_sizes[0]) | max_batch_size = int(x.batch_sizes[0]) | ||||
x, batch_sizes = x.data, x.batch_sizes | x, batch_sizes = x.data, x.batch_sizes | ||||
if hx is None: | if hx is None: | ||||
hx = x.new_zeros(self.num_layers * self.num_directions, | hx = x.new_zeros(self.num_layers * self.num_directions, | ||||
max_batch_size, self.hidden_size, requires_grad=True) | max_batch_size, self.hidden_size, requires_grad=True) | ||||
if is_lstm: | if is_lstm: | ||||
hx = (hx, hx.new_zeros(hx.size(), requires_grad=True)) | hx = (hx, hx.new_zeros(hx.size(), requires_grad=True)) | ||||
mask_x = x.new_ones((max_batch_size, self.input_size)) | mask_x = x.new_ones((max_batch_size, self.input_size)) | ||||
mask_out = x.new_ones( | mask_out = x.new_ones( | ||||
(max_batch_size, self.hidden_size * self.num_directions)) | (max_batch_size, self.hidden_size * self.num_directions)) | ||||
@@ -185,7 +185,7 @@ class VarRNNBase(nn.Module): | |||||
training=self.training, inplace=True) | training=self.training, inplace=True) | ||||
nn.functional.dropout(mask_out, p=self.hidden_dropout, | nn.functional.dropout(mask_out, p=self.hidden_dropout, | ||||
training=self.training, inplace=True) | training=self.training, inplace=True) | ||||
hidden = x.new_zeros( | hidden = x.new_zeros( | ||||
(self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) | (self.num_layers * self.num_directions, max_batch_size, self.hidden_size)) | ||||
if is_lstm: | if is_lstm: | ||||
@@ -207,22 +207,22 @@ class VarRNNBase(nn.Module): | |||||
else: | else: | ||||
hidden[idx] = hidden_x | hidden[idx] = hidden_x | ||||
x = torch.cat(output_list, dim=-1) | x = torch.cat(output_list, dim=-1) | ||||
if is_lstm: | if is_lstm: | ||||
hidden = (hidden, cellstate) | hidden = (hidden, cellstate) | ||||
if is_packed: | if is_packed: | ||||
output = PackedSequence(x, batch_sizes) | output = PackedSequence(x, batch_sizes) | ||||
else: | else: | ||||
x = PackedSequence(x, batch_sizes) | x = PackedSequence(x, batch_sizes) | ||||
output, _ = pad_packed_sequence(x, batch_first=self.batch_first) | output, _ = pad_packed_sequence(x, batch_first=self.batch_first) | ||||
return output, hidden | return output, hidden | ||||
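The forward pass above implements the variational-dropout trick: mask_x and mask_out are sampled once per batch (via the in-place nn.functional.dropout calls) and then reused at every time step, instead of drawing a fresh mask at each step as ordinary dropout would. Below is a minimal, illustrative-only sketch of that idea using a plain nn.RNNCell rather than the cell wrapper used here; the function name and sizes are hypothetical, not part of this diff:

    import torch
    import torch.nn as nn

    def var_dropout_unroll(cell, x, h, p=0.3, training=True):
        # x: (seq_len, batch, input_size); cell: e.g. nn.RNNCell(input_size, hidden_size)
        # One Bernoulli mask per sequence, shared across all time steps.
        mask = nn.functional.dropout(torch.ones_like(x[0]), p=p, training=training)
        outputs = []
        for x_t in x:
            h = cell(x_t * mask, h)   # the same mask is applied at every step
            outputs.append(h)
        return torch.stack(outputs), h

    # Example call (hypothetical sizes):
    # out, h = var_dropout_unroll(nn.RNNCell(10, 20), torch.randn(5, 3, 10), torch.zeros(3, 20))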
class VarLSTM(VarRNNBase): | class VarLSTM(VarRNNBase): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.VarLSTM` :class:`fastNLP.modules.encoder.variational_rnn.VarLSTM` | |||||
Alias: :class:`fastNLP.modules.VarLSTM` :class:`fastNLP.modules.encoder.VarLSTM` | |||||
Variational Dropout LSTM. | Variational Dropout LSTM. | ||||
@@ -236,18 +236,18 @@ class VarLSTM(VarRNNBase): | |||||
:param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | :param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | ||||
:param bidirectional: if ``True``, use a bidirectional LSTM. Default: ``False`` | :param bidirectional: if ``True``, use a bidirectional LSTM. Default: ``False`` | ||||
""" | """ | ||||
def __init__(self, *args, **kwargs): | def __init__(self, *args, **kwargs): | ||||
super(VarLSTM, self).__init__( | super(VarLSTM, self).__init__( | ||||
mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) | mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) | ||||
def forward(self, x, hx=None): | def forward(self, x, hx=None): | ||||
return super(VarLSTM, self).forward(x, hx) | return super(VarLSTM, self).forward(x, hx) | ||||
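A hedged usage sketch for the VarLSTM defined above: the constructor keywords mirror the parameters listed in its docstring, and the import path follows the new alias (fastNLP.modules.VarLSTM); the tensor sizes and dropout values are illustrative assumptions, not values from this diff:

    import torch
    from fastNLP.modules import VarLSTM

    lstm = VarLSTM(input_size=10, hidden_size=20, num_layers=2,
                   batch_first=True, input_dropout=0.3, hidden_dropout=0.3,
                   bidirectional=True)
    x = torch.randn(4, 7, 10)          # (batch, seq_len, input_size) since batch_first=True
    output, (h_n, c_n) = lstm(x)       # output: (4, 7, 40); h_n, c_n: (4, 4, 20)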
class VarRNN(VarRNNBase): | class VarRNN(VarRNNBase): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.VarRNN` :class:`fastNLP.modules.encoder.variational_rnn.VarRNN` | |||||
Alias: :class:`fastNLP.modules.VarRNN` :class:`fastNLP.modules.encoder.VarRNN` | |||||
Variational Dropout RNN. | Variational Dropout RNN. | ||||
@@ -261,18 +261,18 @@ class VarRNN(VarRNNBase): | |||||
:param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | :param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | ||||
:param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False`` | :param bidirectional: if ``True``, use a bidirectional RNN. Default: ``False`` | ||||
""" | """ | ||||
def __init__(self, *args, **kwargs): | def __init__(self, *args, **kwargs): | ||||
super(VarRNN, self).__init__( | super(VarRNN, self).__init__( | ||||
mode="RNN", Cell=nn.RNNCell, *args, **kwargs) | mode="RNN", Cell=nn.RNNCell, *args, **kwargs) | ||||
def forward(self, x, hx=None): | def forward(self, x, hx=None): | ||||
return super(VarRNN, self).forward(x, hx) | return super(VarRNN, self).forward(x, hx) | ||||
class VarGRU(VarRNNBase): | class VarGRU(VarRNNBase): | ||||
""" | """ | ||||
Alias: :class:`fastNLP.modules.VarGRU` :class:`fastNLP.modules.encoder.variational_rnn.VarGRU` | |||||
Alias: :class:`fastNLP.modules.VarGRU` :class:`fastNLP.modules.encoder.VarGRU` | |||||
Variational Dropout GRU. | Variational Dropout GRU. | ||||
@@ -286,10 +286,10 @@ class VarGRU(VarRNNBase): | |||||
:param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | :param hidden_dropout: dropout probability applied to each hidden state. Default: 0 | ||||
:param bidirectional: if ``True``, use a bidirectional GRU. Default: ``False`` | :param bidirectional: if ``True``, use a bidirectional GRU. Default: ``False`` | ||||
""" | """ | ||||
def __init__(self, *args, **kwargs): | def __init__(self, *args, **kwargs): | ||||
super(VarGRU, self).__init__( | super(VarGRU, self).__init__( | ||||
mode="GRU", Cell=nn.GRUCell, *args, **kwargs) | mode="GRU", Cell=nn.GRUCell, *args, **kwargs) | ||||
def forward(self, x, hx=None): | def forward(self, x, hx=None): | ||||
return super(VarGRU, self).forward(x, hx) | return super(VarGRU, self).forward(x, hx) |
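Since VarRNNBase.forward also accepts a torch.nn.utils.rnn.PackedSequence (the is_packed branch above re-wraps the result), VarGRU and VarRNN can be fed packed input directly. A small sketch under that assumption; the lengths and sizes below are made up for illustration:

    import torch
    from torch.nn.utils.rnn import pack_padded_sequence
    from fastNLP.modules import VarGRU

    gru = VarGRU(input_size=8, hidden_size=16, hidden_dropout=0.2)
    x = torch.randn(5, 3, 8)                      # (seq_len, batch, input_size)
    packed = pack_padded_sequence(x, lengths=[5, 4, 2])
    output, h_n = gru(packed)                     # output is a PackedSequence; h_n: (1, 3, 16)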