From 202bde4bfd3ac26e1155f7f6e2c6e021b5cfa3d7 Mon Sep 17 00:00:00 2001
From: ChenXin <will131@foxmail.com>
Date: Tue, 10 Sep 2019 22:56:50 +0800
Subject: [PATCH] Move constructor parameter docs from class docstrings into __init__ docstrings

---
 fastNLP/modules/decoder/crf.py              | 22 +++---
 fastNLP/modules/decoder/mlp.py              | 15 ++--
 fastNLP/modules/encoder/attention.py        | 27 ++++---
 fastNLP/modules/encoder/char_encoder.py     | 13 +--
 fastNLP/modules/encoder/conv_maxpool.py     | 11 ++-
 fastNLP/modules/encoder/lstm.py             | 21 ++---
 fastNLP/modules/encoder/pooling.py          | 16 ++--
 fastNLP/modules/encoder/star_transformer.py | 19 +++--
 fastNLP/modules/encoder/transformer.py      | 17 ++--
 fastNLP/modules/encoder/variational_rnn.py  | 88 ++++++++++++---------
 10 files changed, 145 insertions(+), 104 deletions(-)

diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py
index aeb73d76..669501e9 100644
--- a/fastNLP/modules/decoder/crf.py
+++ b/fastNLP/modules/decoder/crf.py
@@ -5,13 +5,15 @@ __all__ = [
     "allowed_transitions"
 ]
 
+from typing import Union
+
 import torch
 from torch import nn
 
 from ..utils import initial_parameter
-from ...core.vocabulary import Vocabulary
 from ...core.metrics import _get_encoding_type_from_tag_vocab, _check_tag_vocab_and_encoding_type
-from typing import Union
+from ...core.vocabulary import Vocabulary
+
 
 def allowed_transitions(tag_vocab:Union[Vocabulary, dict], encoding_type=None, include_start_end=False):
     """
@@ -168,17 +170,19 @@ class ConditionalRandomField(nn.Module):
     """
     条件随机场。提供forward()以及viterbi_decode()两个方法，分别用于训练与inference。
 
-    :param int num_tags: 标签的数量
-    :param bool include_start_end_trans: 是否考虑各个tag作为开始以及结尾的分数。
-    :param List[Tuple[from_tag_id(int), to_tag_id(int)]] allowed_transitions: 内部的Tuple[from_tag_id(int),
-                               to_tag_id(int)]视为允许发生的跃迁，其他没有包含的跃迁认为是禁止跃迁，可以通过
-                               allowed_transitions()函数得到；如果为None，则所有跃迁均为合法
-    :param str initial_method: 初始化方法。见initial_parameter
     """
 
     def __init__(self, num_tags, include_start_end_trans=False, allowed_transitions=None,
                  initial_method=None):
-
+        """
+        
+        :param int num_tags: 标签的数量
+        :param bool include_start_end_trans: 是否考虑各个tag作为开始以及结尾的分数。
+        :param List[Tuple[from_tag_id(int), to_tag_id(int)]] allowed_transitions: 内部的Tuple[from_tag_id(int),
+                                   to_tag_id(int)]视为允许发生的跃迁，其他没有包含的跃迁认为是禁止跃迁，可以通过
+                                   allowed_transitions()函数得到；如果为None，则所有跃迁均为合法
+        :param str initial_method: 初始化方法。见initial_parameter
+        """
         super(ConditionalRandomField, self).__init__()
 
         self.include_start_end_trans = include_start_end_trans
diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py
index 3e594de1..0f23f481 100644
--- a/fastNLP/modules/decoder/mlp.py
+++ b/fastNLP/modules/decoder/mlp.py
@@ -14,12 +14,6 @@ class MLP(nn.Module):
     """
     多层感知器
 
-    :param List[int] size_layer: 一个int的列表，用来定义MLP的层数，列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1
-    :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表，用来定义每一个隐层的激活函数，字符串包括relu，tanh和
-        sigmoid，默认值为relu
-    :param Union[str,func] output_activation:  字符串或者函数，用来定义输出层的激活函数，默认值为None，表示输出层没有激活函数
-    :param str initial_method: 参数初始化方式
-    :param float dropout: dropout概率，默认值为0
     
     .. note::
         隐藏层的激活函数通过activation定义。一个str/function或者一个str/function的list可以被传入activation。
@@ -42,6 +36,15 @@ class MLP(nn.Module):
     """
 
     def __init__(self, size_layer, activation='relu', output_activation=None, initial_method=None, dropout=0.0):
+        """
+        
+        :param List[int] size_layer: 一个int的列表，用来定义MLP的层数，列表中的数字为每一层的hidden数目。MLP的层数为 len(size_layer) - 1
+        :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表，用来定义每一个隐层的激活函数，字符串包括relu，tanh和
+            sigmoid，默认值为relu
+        :param Union[str,func] output_activation:  字符串或者函数，用来定义输出层的激活函数，默认值为None，表示输出层没有激活函数
+        :param str initial_method: 参数初始化方式
+        :param float dropout: dropout概率，默认值为0
+        """
         super(MLP, self).__init__()
         self.hiddens = nn.ModuleList()
         self.output = None
diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py
index 0d832653..32f59c22 100644
--- a/fastNLP/modules/encoder/attention.py
+++ b/fastNLP/modules/encoder/attention.py
@@ -46,14 +46,17 @@ class DotAttention(nn.Module):
 class MultiHeadAttention(nn.Module):
     """
 
-    :param input_size: int, 输入维度的大小。同时也是输出维度的大小。
-    :param key_size: int, 每个head的维度大小。
-    :param value_size: int，每个head中value的维度。
-    :param num_head: int，head的数量。
-    :param dropout: float。
     """
 
     def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
+        """
+        
+        :param input_size: int, 输入维度的大小。同时也是输出维度的大小。
+        :param key_size: int, 每个head的维度大小。
+        :param value_size: int，每个head中value的维度。
+        :param num_head: int，head的数量。
+        :param dropout: float。
+        """
         super(MultiHeadAttention, self).__init__()
         self.input_size = input_size
         self.key_size = key_size
@@ -169,15 +172,17 @@ class BiAttention(nn.Module):
 class SelfAttention(nn.Module):
     """
     Self Attention Module.
-    
-    :param int input_size: 输入tensor的hidden维度
-    :param int attention_unit: 输出tensor的hidden维度
-    :param int attention_hops:
-    :param float drop: dropout概率，默认值为0.5
-    :param str initial_method: 初始化参数方法
     """
 
     def __init__(self, input_size, attention_unit=300, attention_hops=10, drop=0.5, initial_method=None, ):
+        """
+        
+        :param int input_size: 输入tensor的hidden维度
+        :param int attention_unit: 输出tensor的hidden维度
+        :param int attention_hops:
+        :param float drop: dropout概率，默认值为0.5
+        :param str initial_method: 初始化参数方法
+        """
         super(SelfAttention, self).__init__()
 
         self.attention_hops = attention_hops
diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py
index dc73f447..786a2467 100644
--- a/fastNLP/modules/encoder/char_encoder.py
+++ b/fastNLP/modules/encoder/char_encoder.py
@@ -15,14 +15,17 @@ class ConvolutionCharEncoder(nn.Module):
     """
     char级别的卷积编码器.
     
-    :param int char_emb_size: char级别embedding的维度. Default: 50
-        :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50.
-    :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter.
-    :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核.
-    :param initial_method: 初始化参数的方式, 默认为`xavier normal`
     """
 
     def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(1, 3, 5), initial_method=None):
+        """
+        
+        :param int char_emb_size: char级别embedding的维度. Default: 50
+            :例: 有26个字符, 每一个的embedding是一个50维的向量, 所以输入的向量维度为50.
+        :param tuple feature_maps: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的filter.
+        :param tuple kernels: 一个由int组成的tuple. tuple的长度是char级别卷积操作的数目, 第`i`个int表示第`i`个卷积操作的卷积核.
+        :param initial_method: 初始化参数的方式, 默认为`xavier normal`
+        """
         super(ConvolutionCharEncoder, self).__init__()
         self.convs = nn.ModuleList([
             nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True,
diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py
index bf629eba..f19a92f3 100644
--- a/fastNLP/modules/encoder/conv_maxpool.py
+++ b/fastNLP/modules/encoder/conv_maxpool.py
@@ -14,13 +14,16 @@ class ConvMaxpool(nn.Module):
     sum(output_channels) 大小的matrix。在内部，是先使用CNN给输入做卷积，然后经过activation激活层，在通过在长度(max_len)
     这一维进行max_pooling。最后得到每个sample的一个向量表示。
 
-    :param int in_channels: 输入channel的大小，一般是embedding的维度; 或encoder的output维度
-    :param int,tuple(int) out_channels: 输出channel的数量。如果为list，则需要与kernel_sizes的数量保持一致
-    :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。
-    :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh
     """
 
     def __init__(self, in_channels, out_channels, kernel_sizes, activation="relu"):
+        """
+        
+        :param int in_channels: 输入channel的大小，一般是embedding的维度; 或encoder的output维度
+        :param int,tuple(int) out_channels: 输出channel的数量。如果为list，则需要与kernel_sizes的数量保持一致
+        :param int,tuple(int) kernel_sizes: 输出channel的kernel大小。
+        :param str activation: Convolution后的结果将通过该activation后再经过max-pooling。支持relu, sigmoid, tanh
+        """
         super(ConvMaxpool, self).__init__()
 
         for kernel_size in kernel_sizes:
diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py
index 1dd1f0df..06b437ef 100644
--- a/fastNLP/modules/encoder/lstm.py
+++ b/fastNLP/modules/encoder/lstm.py
@@ -15,20 +15,23 @@ import torch.nn.utils.rnn as rnn
 class LSTM(nn.Module):
     """
     LSTM 模块, 轻量封装的Pytorch LSTM. 在提供seq_len的情况下，将自动使用pack_padded_sequence; 同时默认将forget gate的bias初始化
-        为1; 且可以应对DataParallel中LSTM的使用问题。
+    为1; 且可以应对DataParallel中LSTM的使用问题。
 
-    :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态 `h` 的特征维度.
-    :param num_layers: rnn的层数. Default: 1
-    :param dropout: 层间dropout概率. Default: 0
-    :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
-    :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
-        :(batch, seq, feature). Default: ``False``
-    :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
     """
 
     def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True,
                  bidirectional=False, bias=True):
+        """
+        
+        :param input_size:  输入 `x` 的特征维度
+        :param hidden_size: 隐状态 `h` 的特征维度.
+        :param num_layers: rnn的层数. Default: 1
+        :param dropout: 层间dropout概率. Default: 0
+        :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
+        :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
+            (batch, seq, feature). Default: ``True``
+        :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
+        """
         super(LSTM, self).__init__()
         self.batch_first = batch_first
         self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias=bias, batch_first=batch_first,
diff --git a/fastNLP/modules/encoder/pooling.py b/fastNLP/modules/encoder/pooling.py
index c248601d..789b6d26 100644
--- a/fastNLP/modules/encoder/pooling.py
+++ b/fastNLP/modules/encoder/pooling.py
@@ -14,16 +14,18 @@ class MaxPool(nn.Module):
     """
     Max-pooling模块。
     
-    :param stride: 窗口移动大小，默认为kernel_size
-    :param padding: padding的内容，默认为0
-    :param dilation: 控制窗口内元素移动距离的大小
-    :param dimension: MaxPool的维度，支持1，2，3维。
-    :param kernel_size: max pooling的窗口大小，默认为tensor最后k维，其中k为dimension
-    :param ceil_mode:
     """
 
     def __init__(self, stride=None, padding=0, dilation=1, dimension=1, kernel_size=None, ceil_mode=False):
-
+        """
+        
+        :param stride: 窗口移动大小，默认为kernel_size
+        :param padding: padding的内容，默认为0
+        :param dilation: 控制窗口内元素移动距离的大小
+        :param dimension: MaxPool的维度，支持1，2，3维。
+        :param kernel_size: max pooling的窗口大小，默认为tensor最后k维，其中k为dimension
+        :param ceil_mode:
+        """
         super(MaxPool, self).__init__()
         assert (1 <= dimension) and (dimension <= 3)
         self.dimension = dimension
diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py
index bb47d9b5..d4cc66f7 100644
--- a/fastNLP/modules/encoder/star_transformer.py
+++ b/fastNLP/modules/encoder/star_transformer.py
@@ -18,17 +18,20 @@ class StarTransformer(nn.Module):
 
     paper: https://arxiv.org/abs/1902.09113
 
-    :param int hidden_size: 输入维度的大小。同时也是输出维度的大小。
-    :param int num_layers: star-transformer的层数
-    :param int num_head: head的数量。
-    :param int head_dim: 每个head的维度大小。
-    :param float dropout: dropout 概率. Default: 0.1
-    :param int max_len: int or None, 如果为int，输入序列的最大长度，
-        模型会为输入序列加上position embedding。
-        若为`None`，忽略加上position embedding的步骤. Default: `None`
     """
 
     def __init__(self, hidden_size, num_layers, num_head, head_dim, dropout=0.1, max_len=None):
+        """
+        
+        :param int hidden_size: 输入维度的大小。同时也是输出维度的大小。
+        :param int num_layers: star-transformer的层数
+        :param int num_head: head的数量。
+        :param int head_dim: 每个head的维度大小。
+        :param float dropout: dropout 概率. Default: 0.1
+        :param int max_len: int or None, 如果为int，输入序列的最大长度，
+            模型会为输入序列加上position embedding。
+            若为`None`，忽略加上position embedding的步骤. Default: `None`
+        """
         super(StarTransformer, self).__init__()
         self.iters = num_layers
 
diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py
index 3d97c306..323091b0 100644
--- a/fastNLP/modules/encoder/transformer.py
+++ b/fastNLP/modules/encoder/transformer.py
@@ -12,13 +12,6 @@ class TransformerEncoder(nn.Module):
     """
     transformer的encoder模块，不包含embedding层
 
-    :param int num_layers: transformer的层数
-    :param int model_size: 输入维度的大小。同时也是输出维度的大小。
-    :param int inner_size: FFN层的hidden大小
-    :param int key_size: 每个head的维度大小。
-    :param int value_size: 每个head中value的维度。
-    :param int num_head: head的数量。
-    :param float dropout: dropout概率. Default: 0.1
     """
 
     class SubLayer(nn.Module):
@@ -53,6 +46,16 @@ class TransformerEncoder(nn.Module):
             return input
 
     def __init__(self, num_layers, **kargs):
+        """
+        
+        :param int num_layers: transformer的层数
+        :param int model_size: 输入维度的大小。同时也是输出维度的大小。
+        :param int inner_size: FFN层的hidden大小
+        :param int key_size: 每个head的维度大小。
+        :param int value_size: 每个head中value的维度。
+        :param int num_head: head的数量。
+        :param float dropout: dropout概率. Default: 0.1
+        """
         super(TransformerEncoder, self).__init__()
         self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)])
         self.norm = nn.LayerNorm(kargs['model_size'], eps=1e-6)
diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py
index 17e2ad23..5f4a5534 100644
--- a/fastNLP/modules/encoder/variational_rnn.py
+++ b/fastNLP/modules/encoder/variational_rnn.py
@@ -106,22 +106,25 @@ class VarRNNBase(nn.Module):
     论文参考: `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016)
     https://arxiv.org/abs/1512.05287`.
 
-    :param mode: rnn 模式, (lstm or not)
-    :param Cell: rnn cell 类型, (lstm, gru, etc)
-    :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态 `h` 的特征维度
-    :param num_layers: rnn的层数. Default: 1
-    :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
-    :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
-        (batch, seq, feature). Default: ``False``
-    :param input_dropout: 对输入的dropout概率. Default: 0
-    :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
-    :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
     """
 
     def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1,
                  bias=True, batch_first=False,
                  input_dropout=0, hidden_dropout=0, bidirectional=False):
+        """
+        
+        :param mode: rnn 模式, (lstm or not)
+        :param Cell: rnn cell 类型, (lstm, gru, etc)
+        :param input_size:  输入 `x` 的特征维度
+        :param hidden_size: 隐状态 `h` 的特征维度
+        :param num_layers: rnn的层数. Default: 1
+        :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
+        :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
+            (batch, seq, feature). Default: ``False``
+        :param input_dropout: 对输入的dropout概率. Default: 0
+        :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
+        :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
+        """
         super(VarRNNBase, self).__init__()
         self.mode = mode
         self.input_size = input_size
@@ -225,18 +228,21 @@ class VarLSTM(VarRNNBase):
     """
     Variational Dropout LSTM.
 
-    :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态  `h`  的特征维度
-    :param num_layers: rnn的层数. Default: 1
-    :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
-    :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
-        (batch, seq, feature). Default: ``False``
-    :param input_dropout: 对输入的dropout概率. Default: 0
-    :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
-    :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False``
     """
 
     def __init__(self, *args, **kwargs):
+        """
+        
+        :param input_size:  输入 `x` 的特征维度
+        :param hidden_size: 隐状态  `h`  的特征维度
+        :param num_layers: rnn的层数. Default: 1
+        :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
+        :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
+            (batch, seq, feature). Default: ``False``
+        :param input_dropout: 对输入的dropout概率. Default: 0
+        :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
+        :param bidirectional: 若为 ``True``, 使用双向的LSTM. Default: ``False``
+        """
         super(VarLSTM, self).__init__(
             mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs)
 
@@ -248,18 +254,21 @@ class VarRNN(VarRNNBase):
     """
     Variational Dropout RNN.
 
-    :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态 `h` 的特征维度
-    :param num_layers: rnn的层数. Default: 1
-    :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
-    :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
-        (batch, seq, feature). Default: ``False``
-    :param input_dropout: 对输入的dropout概率. Default: 0
-    :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
-    :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
     """
 
     def __init__(self, *args, **kwargs):
+        """
+        
+        :param input_size:  输入 `x` 的特征维度
+        :param hidden_size: 隐状态 `h` 的特征维度
+        :param num_layers: rnn的层数. Default: 1
+        :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
+        :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
+            (batch, seq, feature). Default: ``False``
+        :param input_dropout: 对输入的dropout概率. Default: 0
+        :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
+        :param bidirectional: 若为 ``True``, 使用双向的RNN. Default: ``False``
+        """
         super(VarRNN, self).__init__(
             mode="RNN", Cell=nn.RNNCell, *args, **kwargs)
 
@@ -271,18 +280,21 @@ class VarGRU(VarRNNBase):
     """
     Variational Dropout GRU.
 
-    :param input_size:  输入 `x` 的特征维度
-    :param hidden_size: 隐状态 `h` 的特征维度
-    :param num_layers: rnn的层数. Default: 1
-    :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
-    :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
-        (batch, seq, feature). Default: ``False``
-    :param input_dropout: 对输入的dropout概率. Default: 0
-    :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
-    :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False``
     """
 
     def __init__(self, *args, **kwargs):
+        """
+        
+        :param input_size:  输入 `x` 的特征维度
+        :param hidden_size: 隐状态 `h` 的特征维度
+        :param num_layers: rnn的层数. Default: 1
+        :param bias: 如果为 ``False``, 模型将不会使用bias. Default: ``True``
+        :param batch_first: 若为 ``True``, 输入和输出 ``Tensor`` 形状为
+            (batch, seq, feature). Default: ``False``
+        :param input_dropout: 对输入的dropout概率. Default: 0
+        :param hidden_dropout: 对每个隐状态的dropout概率. Default: 0
+        :param bidirectional: 若为 ``True``, 使用双向的GRU. Default: ``False``
+        """
         super(VarGRU, self).__init__(
             mode="GRU", Cell=nn.GRUCell, *args, **kwargs)