diff --git a/.travis.yml b/.travis.yml index 11239eb4..559fc86e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,8 @@ python: # command to install dependencies install: - pip install --quiet -r requirements.txt - - pip install pytest pytest-cov + - pip install "pytest>=3.6" + - pip install pytest-cov # command to run tests script: - pytest --cov=./ diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 442d1154..7b3b6d11 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -10,10 +10,10 @@ class Batch(object): for batch_x, batch_y in Batch(data_set, batch_size=16, sampler=SequentialSampler()): # ... - :param dataset: a DataSet object - :param batch_size: int, the size of the batch - :param sampler: a Sampler object - :param as_numpy: bool. If True, return Numpy array. Otherwise, return torch tensors. + :param DataSet dataset: a DataSet object + :param int batch_size: the size of the batch + :param Sampler sampler: a Sampler object + :param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors. """ diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index b01c233a..ccfa1169 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -3,7 +3,9 @@ import os class BaseLoader(object): + """Base loader for all loaders. + """ def __init__(self): super(BaseLoader, self).__init__() @@ -32,7 +34,9 @@ class BaseLoader(object): class DataLoaderRegister: - """"register for data sets""" + """Register for all data sets. + + """ _readers = {} @classmethod diff --git a/fastNLP/io/config_io.py b/fastNLP/io/config_io.py index 52c5e789..8be59a35 100644 --- a/fastNLP/io/config_io.py +++ b/fastNLP/io/config_io.py @@ -6,7 +6,11 @@ from fastNLP.io.base_loader import BaseLoader class ConfigLoader(BaseLoader): - """loader for configuration files""" + """Loader for configuration. + + :param str data_path: path to the config file + + """ def __init__(self, data_path=None): super(ConfigLoader, self).__init__() @@ -19,13 +23,15 @@ class ConfigLoader(BaseLoader): @staticmethod def load_config(file_path, sections): - """ - :param file_path: the path of config file - :param sections: the dict of {section_name(string): Section instance} - Example: + """Load section(s) of configuration into the ``sections`` provided. Returns nothing. + + :param str file_path: the path of the config file + :param dict sections: the dict of ``{section_name(string): ConfigSection object}`` + Example:: + + test_args = ConfigSection() ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args}) - :return: return nothing, but the value of attributes are saved in sessions + """ assert isinstance(sections, dict) cfg = configparser.ConfigParser() @@ -60,9 +66,12 @@ class ConfigLoader(BaseLoader): class ConfigSection(object): + """ConfigSection is the data structure storing all key-value pairs in one section in a config file.
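+ + A minimal usage sketch (``load_config`` is a static method; the config path follows the example in ``load_config`` above):: + + section = ConfigSection() + ConfigLoader.load_config("./data_for_tests/config", {"POS_test": section}) + # key-value pairs of the [POS_test] section are now attributes of ``section``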
+ + """ def __init__(self): - pass + super(ConfigSection, self).__init__() def __getitem__(self, key): """ @@ -132,25 +141,12 @@ class ConfigSection(object): return self.__dict__ -if __name__ == "__main__": - config = ConfigLoader('there is no data') - - section = {'General': ConfigSection(), 'My': ConfigSection(), 'A': ConfigSection()} - """ - General and My can be found in config file, so the attr and - value will be updated - A cannot be found in config file, so nothing will be done - """ - - config.load_config("../../test/data_for_tests/config", section) - for s in section: - print(s) - for attr in section[s].__dict__.keys(): - print(s, attr, getattr(section[s], attr), type(getattr(section[s], attr))) - - class ConfigSaver(object): + """ConfigSaver is used to save config file and solve related conflicts. + + :param str file_path: path to the config file + """ def __init__(self, file_path): self.file_path = file_path if not os.path.exists(self.file_path): @@ -244,9 +240,8 @@ class ConfigSaver(object): def save_config_file(self, section_name, section): """This is the function to be called to change the config file with a single section and its name. - :param section_name: The name of section what needs to be changed and saved. - :param section: The section with key and value what needs to be changed and saved. - :return: + :param str section_name: The name of section what needs to be changed and saved. + :param ConfigSection section: The section with key and value what needs to be changed and saved. """ section_file = self._get_section(section_name) if len(section_file.__dict__.keys()) == 0: # the section not in the file before diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index fedf8058..87682751 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -9,11 +9,12 @@ def convert_seq_dataset(data): """Create an DataSet instance that contains no labels. :param data: list of list of strings, [num_examples, *]. - :: - [ - [word_11, word_12, ...], - ... - ] + Example:: + + [ + [word_11, word_12, ...], + ... + ] :return: a DataSet. """ @@ -24,15 +25,16 @@ def convert_seq_dataset(data): def convert_seq2tag_dataset(data): - """Convert list of data into DataSet + """Convert list of data into DataSet. :param data: list of list of strings, [num_examples, *]. - :: - [ - [ [word_11, word_12, ...], label_1 ], - [ [word_21, word_22, ...], label_2 ], - ... - ] + Example:: + + [ + [ [word_11, word_12, ...], label_1 ], + [ [word_21, word_22, ...], label_2 ], + ... + ] :return: a DataSet. """ @@ -43,15 +45,16 @@ def convert_seq2tag_dataset(data): def convert_seq2seq_dataset(data): - """Convert list of data into DataSet + """Convert list of data into DataSet. :param data: list of list of strings, [num_examples, *]. - :: - [ - [ [word_11, word_12, ...], [label_1, label_1, ...] ], - [ [word_21, word_22, ...], [label_2, label_1, ...] ], - ... - ] + Example:: + + [ + [ [word_11, word_12, ...], [label_1, label_1, ...] ], + [ [word_21, word_22, ...], [label_2, label_1, ...] ], + ... + ] :return: a DataSet. """ @@ -62,20 +65,31 @@ def convert_seq2seq_dataset(data): class DataSetLoader: - """"loader for data sets""" + """Interface for all DataSetLoaders. + + """ def load(self, path): - """ load data in `path` into a dataset + """Load data from a given file. + + :param str path: file path + :return: a DataSet object """ raise NotImplementedError def convert(self, data): - """convert list of data into dataset + """Optional operation to build a DataSet. 
+ + :param data: inner data structure (user-defined) to represent the data. + :return: a DataSet object """ raise NotImplementedError class NativeDataSetLoader(DataSetLoader): + """A simple example of a DataSetLoader. + + """ def __init__(self): super(NativeDataSetLoader, self).__init__() @@ -90,6 +104,9 @@ DataLoaderRegister.set_reader(NativeDataSetLoader, 'read_naive') class RawDataSetLoader(DataSetLoader): + """A simple example of a raw data reader. + + """ def __init__(self): super(RawDataSetLoader, self).__init__() @@ -108,37 +125,35 @@ DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata') class POSDataSetLoader(DataSetLoader): - """Dataset Loader for POS Tag datasets. - - In these datasets, each line are divided by '\t' - while the first Col is the vocabulary and the second - Col is the label. - Different sentence are divided by an empty line. - e.g: - Tom label1 - and label2 - Jerry label1 - . label3 - (separated by an empty line) - Hello label4 - world label5 - ! label3 - In this file, there are two sentence "Tom and Jerry ." - and "Hello world !". Each word has its own label from label1 - to label5. + """Dataset Loader for a POS Tag dataset. + + In these datasets, each line is divided by "\t". The first column is the word and the second + column is the label. Different sentences are divided by an empty line. + E.g:: + + Tom label1 + and label2 + Jerry label1 + . label3 + (separated by an empty line) + Hello label4 + world label5 + ! label3 + + In this example, there are two sentences "Tom and Jerry ." and "Hello world !". Each word has its own label. """ - def __init__(self): super(POSDataSetLoader, self).__init__() def load(self, data_path): """ :return data: three-level list Example:: [ [ [word_11, word_12, ...], [label_1, label_1, ...] ], [ [word_21, word_22, ...], [label_2, label_1, ...] ], ... ] """ with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() @@ -188,17 +203,17 @@ class TokenizeDataSetLoader(DataSetLoader): super(TokenizeDataSetLoader, self).__init__() def load(self, data_path, max_seq_len=32): - """ - load pku dataset for Chinese word segmentation + """Load pku dataset for Chinese word segmentation. CWS (Chinese Word Segmentation) pku training dataset format: - 1. Each line is a sentence. - 2. Each word in a sentence is separated by space. + 1. Each line is a sentence. + 2. Each word in a sentence is separated by space. This function convert the pku dataset into three-level lists with labels . - B: beginning of a word - M: middle of a word - E: ending of a word - S: single character + B: beginning of a word + M: middle of a word + E: ending of a word + S: single character + :param str data_path: path to the data set. :param max_seq_len: int, the maximum length of a sequence. If a sequence is longer than it, split it into several sequences.
:return: three-level lists @@ -239,7 +254,7 @@ class TokenizeDataSetLoader(DataSetLoader): class ClassDataSetLoader(DataSetLoader): - """Loader for classification data sets""" + """Loader for a dummy classification data set""" def __init__(self): super(ClassDataSetLoader, self).__init__() @@ -254,11 +269,9 @@ class ClassDataSetLoader(DataSetLoader): @staticmethod def parse(lines): """ - Params - lines: lines from dataset - Return - list(list(list())): the three level of lists are - words, sentence, and dataset + + :param list lines: lines from dataset + :return: a 3-D list; its three levels are words, sentences, and the dataset, respectively. """ dataset = list() for line in lines: @@ -280,15 +293,9 @@ class ConllLoader(DataSetLoader): """loader for conll format files""" def __init__(self): - """ - :param str data_path: the path to the conll data set - """ super(ConllLoader, self).__init__() def load(self, data_path): - """ - :return: list lines: all lines in a conll file - """ with open(data_path, "r", encoding="utf-8") as f: lines = f.readlines() data = self.parse(lines) @@ -297,7 +304,7 @@ class ConllLoader(DataSetLoader): @staticmethod def parse(lines): """ - :param list lines:a list containing all lines in a conll file. + :param list lines: a list containing all lines in a conll file. :return: a 3D list """ sentences = list() @@ -320,8 +327,8 @@ class ConllLoader(DataSetLoader): class LMDataSetLoader(DataSetLoader): """Language Model Dataset Loader - This loader produces data for language model training in a supervised way. - That means it has X and Y. + This loader produces data for language model training in a supervised way. + That means it has X and Y. """ @@ -467,6 +474,7 @@ class Conll2003Loader(DataSetLoader): return dataset + class SNLIDataSetLoader(DataSetLoader): """A data set loader for SNLI data set. @@ -478,8 +486,8 @@ class SNLIDataSetLoader(DataSetLoader): def load(self, path_list): """ - :param path_list: A list of file name, in the order of premise file, hypothesis file, and label file. - :return: data_set: A DataSet object. + :param list path_list: A list of file names, in the order of premise file, hypothesis file, and label file. + :return: A DataSet object. """ assert len(path_list) == 3 line_set = [] @@ -507,12 +515,14 @@ class SNLIDataSetLoader(DataSetLoader): """Convert a 3D list to a DataSet object. :param data: A 3D tensor. - [ - [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ], - [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ], - ... - ] - :return: data_set: A DataSet object. + Example:: + [ + [ [premise_word_11, premise_word_12, ...], [hypothesis_word_11, hypothesis_word_12, ...], [label_1] ], + [ [premise_word_21, premise_word_22, ...], [hypothesis_word_21, hypothesis_word_22, ...], [label_2] ], + ... + ] + + :return: A DataSet object. """ data_set = DataSet() diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 2eb48f93..aae29ab5 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -38,7 +38,7 @@ class EmbedLoader(BaseLoader): :param str emb_file: the pre-trained embedding file path :param str emb_type: the pre-trained embedding data format - :return dict embedding: `{str: np.array}` + :return: a dict of ``{str: np.array}`` """ if emb_type == 'glove': return EmbedLoader._load_glove(emb_file) @@ -53,8 +53,9 @@ class EmbedLoader(BaseLoader): :param str emb_file: the pre-trained embedding file path.
:param str emb_type: the pre-trained embedding format, support glove now :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding - :return embedding_tensor: Tensor of shape (len(word_dict), emb_dim) - vocab: input vocab or vocab built by pre-train + :return (embedding_tensor, vocab): + embedding_tensor - Tensor of shape (len(word_dict), emb_dim); + vocab - the input vocab, or a vocab built from the pre-trained embedding """ pretrain = EmbedLoader._load_pretrain(emb_file, emb_type) @@ -95,7 +96,7 @@ class EmbedLoader(BaseLoader): :param int emb_dim: the dimension of the embedding. Should be the same as pre-trained embedding. :param str emb_file: the pre-trained embedding file path. :param Vocabulary vocab: a mapping from word to index, can be provided by user or built from pre-trained embedding - :return numpy.ndarray embedding_matrix: + :return: the embedding matrix, numpy.ndarray """ if vocab is None: diff --git a/fastNLP/io/logger.py b/fastNLP/io/logger.py index 19b2b2c3..9e9730db 100644 --- a/fastNLP/io/logger.py +++ b/fastNLP/io/logger.py @@ -3,15 +3,16 @@ import os def create_logger(logger_name, log_path, log_format=None, log_level=logging.INFO): - """Return a logger. + """Create a logger. - :param logger_name: str - :param log_path: str + :param str logger_name: + :param str log_path: :param log_format: :param log_level: :return: logger - to use a logger: + To use a logger:: + logger.debug("this is a debug message") logger.info("this is a info message") logger.warning("this is a warning message") diff --git a/fastNLP/io/model_io.py b/fastNLP/io/model_io.py index e1264b47..422eb919 100644 --- a/fastNLP/io/model_io.py +++ b/fastNLP/io/model_io.py @@ -13,10 +13,10 @@ class ModelLoader(BaseLoader): @staticmethod def load_pytorch(empty_model, model_path): - """ - Load model parameters from .pkl files into the empty PyTorch model. + """Load model parameters from ".pkl" files into the empty PyTorch model. + :param empty_model: a PyTorch model with initialized parameters. - :param model_path: str, the path to the saved model. + :param str model_path: the path to the saved model. """ empty_model.load_state_dict(torch.load(model_path)) @@ -24,30 +24,30 @@ def load_pytorch_model(model_path): """Load the entire model. + :param str model_path: the path to the saved model. """ return torch.load(model_path) class ModelSaver(object): """Save a model + + :param str save_path: the path to the saving directory. Example:: + saver = ModelSaver("./save/model_ckpt_100.pkl") saver.save_pytorch(model) """ def __init__(self, save_path): - """ - - :param save_path: str, the path to the saving directory. - """ self.save_path = save_path def save_pytorch(self, model, param_only=True): - """Save a pytorch model into .pkl file. + """Save a pytorch model into ".pkl" file. :param model: a PyTorch model - :param param_only: bool, whether only to save the model parameters or the entire model. + :param bool param_only: whether only to save the model parameters or the entire model.
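+ + Example (a minimal sketch; ``model`` is assumed to be an initialized PyTorch model, following the class-level example above):: + + saver = ModelSaver("./save/model_ckpt_100.pkl") + saver.save_pytorch(model, param_only=True)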
""" if param_only is True: diff --git a/fastNLP/modules/aggregator/attention.py b/fastNLP/modules/aggregator/attention.py index 882807f8..edd82f31 100644 --- a/fastNLP/modules/aggregator/attention.py +++ b/fastNLP/modules/aggregator/attention.py @@ -1,11 +1,12 @@ +import math + import torch from torch import nn -import math + from fastNLP.modules.utils import mask_softmax class Attention(torch.nn.Module): - def __init__(self, normalize=False): super(Attention, self).__init__() self.normalize = normalize @@ -19,9 +20,9 @@ class Attention(torch.nn.Module): def _atten_forward(self, query, memory): raise NotImplementedError + class DotAtte(nn.Module): def __init__(self, key_size, value_size): - # TODO never test super(DotAtte, self).__init__() self.key_size = key_size self.value_size = value_size @@ -41,10 +42,9 @@ class DotAtte(nn.Module): output = nn.functional.softmax(output, dim=2) return torch.matmul(output, V) + class MultiHeadAtte(nn.Module): def __init__(self, input_size, output_size, key_size, value_size, num_atte): - raise NotImplementedError - # TODO never test super(MultiHeadAtte, self).__init__() self.in_linear = nn.ModuleList() for i in range(num_atte * 3): diff --git a/fastNLP/modules/aggregator/self_attention.py b/fastNLP/modules/aggregator/self_attention.py index 981f34c6..b0f03791 100644 --- a/fastNLP/modules/aggregator/self_attention.py +++ b/fastNLP/modules/aggregator/self_attention.py @@ -7,13 +7,14 @@ from fastNLP.modules.utils import initial_parameter class SelfAttention(nn.Module): - """ - Self Attention Module. + """Self Attention Module. - Args: - input_size: int, the size for the input vector - dim: int, the width of weight matrix. - num_vec: int, the number of encoded vectors + :param int input_size: + :param int attention_unit: + :param int attention_hops: + :param float drop: + :param str initial_method: + :param bool use_cuda: """ def __init__(self, input_size, attention_unit=350, attention_hops=10, drop=0.5, initial_method=None, @@ -48,7 +49,7 @@ class SelfAttention(nn.Module): def forward(self, input, input_origin): """ :param input: the matrix to do attention. [baz, senLen, h_dim] - :param inp: then token index include pad token( 0 ) [baz , senLen] + :param inp: then token index include pad token( 0 ) [baz , senLen] :return output1: the input matrix after attention operation [baz, multi-head , h_dim] :return output2: the attention penalty term, a scalar [1] """ @@ -59,8 +60,8 @@ class SelfAttention(nn.Module): input_origin = input_origin.transpose(0, 1).contiguous() # [baz, hops,len] y1 = self.tanh(self.ws1(self.drop(input))) # [baz,len,dim] -->[bsz,len, attention-unit] - attention = self.ws2(y1).transpose(1, - 2).contiguous() # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] + attention = self.ws2(y1).transpose(1, 2).contiguous() + # [bsz,len, attention-unit]--> [bsz, len, hop]--> [baz,hop,len] attention = attention + (-999999 * (input_origin == 0).float()) # remove the weight on padding token. 
attention = F.softmax(attention, 2) # [baz ,hop, len] diff --git a/fastNLP/modules/decoder/CRF.py b/fastNLP/modules/decoder/CRF.py index 55d3faa4..9973275b 100644 --- a/fastNLP/modules/decoder/CRF.py +++ b/fastNLP/modules/decoder/CRF.py @@ -21,11 +21,13 @@ def seq_len_to_byte_mask(seq_lens): class ConditionalRandomField(nn.Module): - def __init__(self, tag_size, include_start_end_trans=False ,initial_method = None): - """ - :param tag_size: int, num of tags - :param include_start_end_trans: bool, whether to include start/end tag - """ + """ + :param int tag_size: num of tags + :param bool include_start_end_trans: whether to include start/end tag + :param str initial_method: method for initialization + """ + + def __init__(self, tag_size, include_start_end_trans=False, initial_method=None): super(ConditionalRandomField, self).__init__() self.include_start_end_trans = include_start_end_trans @@ -39,6 +41,7 @@ class ConditionalRandomField(nn.Module): # self.reset_parameter() initial_parameter(self, initial_method) + def reset_parameter(self): nn.init.xavier_normal_(self.trans_m) if self.include_start_end_trans: @@ -46,12 +49,12 @@ class ConditionalRandomField(nn.Module): nn.init.normal_(self.end_scores) def _normalizer_likelihood(self, logits, mask): - """ - Computes the (batch_size,) denominator term for the log-likelihood, which is the + """Computes the (batch_size,) denominator term for the log-likelihood, which is the sum of the likelihoods across all possible state sequences. - :param logits:FloatTensor, max_len x batch_size x tag_size - :param mask:ByteTensor, max_len x batch_size - :return:FloatTensor, batch_size + + :param FloatTensor logits: [max_len, batch_size, tag_size] + :param ByteTensor mask: [max_len, batch_size] + :return: FloatTensor, [batch_size,] """ seq_len, batch_size, n_tags = logits.size() alpha = logits[0] @@ -70,8 +73,8 @@ class ConditionalRandomField(nn.Module): return log_sum_exp(alpha, 1) def _glod_score(self, logits, tags, mask): - """ - Compute the score for the gold path. + """Compute the score for the gold path. + :param logits: FloatTensor, max_len x batch_size x tag_size :param tags: LongTensor, max_len x batch_size :param mask: ByteTensor, max_len x batch_size @@ -97,12 +100,12 @@ class ConditionalRandomField(nn.Module): return score def forward(self, feats, tags, mask): - """ - Calculate the neg log likelihood - :param feats:FloatTensor, batch_size x max_len x tag_size - :param tags:LongTensor, batch_size x max_len - :param mask:ByteTensor batch_size x max_len - :return:FloatTensor, batch_size + """Calculate the neg log likelihood + + :param FloatTensor feats: [batch_size, max_len, tag_size] + :param LongTensor tags: [batch_size, max_len] + :param ByteTensor mask: [batch_size, max_len] + :return: FloatTensor, [batch_size,] """ feats = feats.transpose(0, 1) tags = tags.transpose(0, 1).long() @@ -113,11 +116,11 @@ class ConditionalRandomField(nn.Module): return all_path_score - gold_path_score def viterbi_decode(self, data, mask, get_score=False): - """ - Given a feats matrix, return best decode path and best score. - :param data:FloatTensor, batch_size x max_len x tag_size - :param mask:ByteTensor batch_size x max_len - :param get_score: bool, whether to output the decode score. + """Given a feats matrix, return best decode path and best score. + + :param FloatTensor data: [batch_size, max_len, tag_size] + :param ByteTensor mask: [batch_size, max_len] + :param bool get_score: whether to output the decode score. 
:return: scores, paths """ batch_size, seq_len, n_tags = data.size() diff --git a/fastNLP/modules/decoder/MLP.py b/fastNLP/modules/decoder/MLP.py index 0470e91b..2b518a12 100644 --- a/fastNLP/modules/decoder/MLP.py +++ b/fastNLP/modules/decoder/MLP.py @@ -1,21 +1,23 @@ import torch import torch.nn as nn + from fastNLP.modules.utils import initial_parameter class MLP(nn.Module): - def __init__(self, size_layer, activation='relu', initial_method=None, dropout=0.0): - """Multilayer Perceptrons as a decoder + """Multilayer Perceptrons as a decoder. - :param size_layer: list of int, define the size of MLP layers. - :param activation: str or function, the activation function for hidden layers. - :param initial_method: str, the name of init method. - :param dropout: float, the probability of dropout. + :param list size_layer: a list of int, defining the size of each MLP layer. + :param str activation: str or function, the activation function for hidden layers. + :param str initial_method: the name of initialization method. + :param float dropout: the probability of dropout. - .. note:: - There is no activation function applying on output layer. + .. note:: + There is no activation function applied to the output layer. - """ + """ + + def __init__(self, size_layer, activation='relu', initial_method=None, dropout=0.0): super(MLP, self).__init__() self.hiddens = nn.ModuleList() self.output = None diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py index 8cef4d09..34cf9e90 100644 --- a/fastNLP/modules/dropout.py +++ b/fastNLP/modules/dropout.py @@ -2,8 +2,8 @@ import torch class TimestepDropout(torch.nn.Dropout): - """This module accepts a `[batch_size, num_timesteps, embedding_dim)]` and use a single - dropout mask of shape `(batch_size, embedding_dim)` to apply on every time step. + """This module accepts a ``[batch_size, num_timesteps, embedding_dim]`` tensor and uses a single + dropout mask of shape ``(batch_size, embedding_dim)``, applied at every time step. """ def forward(self, x): diff --git a/fastNLP/modules/encoder/char_embedding.py b/fastNLP/modules/encoder/char_embedding.py index 249a73ad..057d080c 100644 --- a/fastNLP/modules/encoder/char_embedding.py +++ b/fastNLP/modules/encoder/char_embedding.py @@ -1,5 +1,4 @@ import torch -import torch.nn.functional as F from torch import nn from fastNLP.modules.utils import initial_parameter @@ -7,17 +6,17 @@ from fastNLP.modules.utils import initial_parameter # from torch.nn.init import xavier_uniform class ConvCharEmbedding(nn.Module): + """Character-level Embedding with CNN. + + :param int char_emb_size: the size of character level embedding. Default: 50 + say 26 characters, each embedded to 50 dim vector, then the input_size is 50. + :param tuple feature_maps: tuple of int. The length of the tuple is the number of convolution operations + over characters. The i-th integer is the number of filters (dim of out channels) for the i-th + convolution. + :param tuple kernels: tuple of int. The width of each kernel. + """ def __init__(self, char_emb_size=50, feature_maps=(40, 30, 30), kernels=(3, 4, 5), initial_method=None): - """ - Character Level Word Embedding - :param char_emb_size: the size of character level embedding. Default: 50 - say 26 characters, each embedded to 50 dim vector, then the input_size is 50. - :param feature_maps: tuple of int. The length of the tuple is the number of convolution operations - over characters. The i-th integer is the number of filters (dim of out channels) for the i-th - convolution. - :param kernels: tuple of int.
The width of each kernel. - """ super(ConvCharEmbedding, self).__init__() self.convs = nn.ModuleList([ nn.Conv2d(1, feature_maps[i], kernel_size=(char_emb_size, kernels[i]), bias=True, padding=(0, 4)) @@ -27,8 +26,8 @@ class ConvCharEmbedding(nn.Module): def forward(self, x): """ - :param x: [batch_size * sent_length, word_length, char_emb_size] - :return: [batch_size * sent_length, sum(feature_maps), 1] + :param x: ``[batch_size * sent_length, word_length, char_emb_size]`` + :return: feature map of shape [batch_size * sent_length, sum(feature_maps), 1] """ x = x.contiguous().view(x.size(0), 1, x.size(1), x.size(2)) # [batch_size*sent_length, channel, width, height] @@ -51,13 +50,12 @@ class ConvCharEmbedding(nn.Module): class LSTMCharEmbedding(nn.Module): - """ - Character Level Word Embedding with LSTM with a single layer. - :param char_emb_size: int, the size of character level embedding. Default: 50 + """Character-level Embedding with LSTM. + + :param int char_emb_size: the size of character level embedding. Default: 50 say 26 characters, each embedded to 50 dim vector, then the input_size is 50. - :param hidden_size: int, the number of hidden units. Default: equal to char_emb_size. + :param int hidden_size: the number of hidden units. Default: equal to char_emb_size. """ - def __init__(self, char_emb_size=50, hidden_size=None, initial_method=None): super(LSTMCharEmbedding, self).__init__() self.hidden_size = char_emb_size if hidden_size is None else hidden_size @@ -71,7 +69,7 @@ class LSTMCharEmbedding(nn.Module): def forward(self, x): """ - :param x:[ n_batch*n_word, word_length, char_emb_size] + :param x: ``[ n_batch*n_word, word_length, char_emb_size]`` :return: [ n_batch*n_word, char_emb_size] """ batch_size = x.shape[0] diff --git a/fastNLP/modules/encoder/conv.py b/fastNLP/modules/encoder/conv.py index 68536e5d..42254a8b 100644 --- a/fastNLP/modules/encoder/conv.py +++ b/fastNLP/modules/encoder/conv.py @@ -3,20 +3,30 @@ import torch import torch.nn as nn -from torch.nn.init import xavier_uniform_ -# import torch.nn.functional as F from fastNLP.modules.utils import initial_parameter + +# import torch.nn.functional as F + + class Conv(nn.Module): - """ - Basic 1-d convolution module. - initialize with xavier_uniform - """ + """Basic 1-d convolution module, initialized with xavier_uniform. + :param int in_channels: + :param int out_channels: + :param tuple kernel_size: + :param int stride: + :param int padding: + :param int dilation: + :param int groups: + :param bool bias: + :param str activation: + :param str initial_method: + """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, - groups=1, bias=True, activation='relu',initial_method = None ): + groups=1, bias=True, activation='relu', initial_method=None): super(Conv, self).__init__() self.conv = nn.Conv1d( in_channels=in_channels, diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 42994bcd..8b035871 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -4,17 +4,27 @@ import torch import torch.nn as nn import torch.nn.functional as F -from torch.nn.init import xavier_uniform_ + from fastNLP.modules.utils import initial_parameter + class ConvMaxpool(nn.Module): - """ - Convolution and max-pooling module with multiple kernel sizes. - """ + """Convolution and max-pooling module with multiple kernel sizes. 
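+ + For each kernel size, the input is convolved and each feature map is max-pooled over time; the pooled features are concatenated as the output (the usual conv-maxpool design, inferred here from the parameters below).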
+ :param int in_channels: + :param int out_channels: + :param tuple kernel_sizes: + :param int stride: + :param int padding: + :param int dilation: + :param int groups: + :param bool bias: + :param str activation: + :param str initial_method: + """ def __init__(self, in_channels, out_channels, kernel_sizes, stride=1, padding=0, dilation=1, - groups=1, bias=True, activation='relu',initial_method = None ): + groups=1, bias=True, activation="relu", initial_method=None): super(ConvMaxpool, self).__init__() # convolution diff --git a/fastNLP/modules/encoder/embedding.py b/fastNLP/modules/encoder/embedding.py index 73ddd77a..7bcffb8e 100644 --- a/fastNLP/modules/encoder/embedding.py +++ b/fastNLP/modules/encoder/embedding.py @@ -2,16 +2,13 @@ import torch.nn as nn class Embedding(nn.Module): - """ - A simple lookup table - Args: - nums : the size of the lookup table - dims : the size of each vector - padding_idx : pads the tensor with zeros whenever it encounters this index - sparse : If True, gradient matrix will be a sparse tensor. In this case, - only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used - """ + """A simple lookup table. + :param int nums: the size of the lookup table + :param int dims: the size of each vector + :param int padding_idx: pads the tensor with zeros whenever it encounters this index + :param bool sparse: If True, gradient matrix will be a sparse tensor. In this case, only optim.SGD(cuda and cpu) and optim.Adagrad(cpu) can be used + """ def __init__(self, nums, dims, padding_idx=0, sparse=False, init_emb=None, dropout=0.0): super(Embedding, self).__init__() self.embed = nn.Embedding(nums, dims, padding_idx, sparse=sparse) diff --git a/fastNLP/modules/encoder/linear.py b/fastNLP/modules/encoder/linear.py index 399e15d3..2dc31eea 100644 --- a/fastNLP/modules/encoder/linear.py +++ b/fastNLP/modules/encoder/linear.py @@ -5,15 +5,12 @@ from fastNLP.modules.utils import initial_parameter class Linear(nn.Module): """ - Linear module - Args: - input_size : input size - hidden_size : hidden size - num_layers : number of hidden layers - dropout : dropout rate - bidirectional : If True, becomes a bidirectional RNN - """ + :param int input_size: input size + :param int output_size: output size + :param bool bias: + :param str initial_method: + """ def __init__(self, input_size, output_size, bias=True, initial_method=None): super(Linear, self).__init__() self.linear = nn.Linear(input_size, output_size, bias) diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index a0b42442..48c67a64 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -6,14 +6,16 @@ from fastNLP.modules.utils import initial_parameter class LSTM(nn.Module): """Long Short Term Memory - Args: - input_size : input size - hidden_size : hidden size - num_layers : number of hidden layers. Default: 1 - dropout : dropout rate. Default: 0.5 - bidirectional : If True, becomes a bidirectional RNN. Default: False. 
+ :param int input_size: + :param int hidden_size: + :param int num_layers: + :param float dropout: + :param bool batch_first: + :param bool bidirectional: + :param bool bias: + :param str initial_method: + :param bool get_hidden: """ - def __init__(self, input_size, hidden_size=100, num_layers=1, dropout=0.0, batch_first=True, bidirectional=False, bias=True, initial_method=None, get_hidden=False): super(LSTM, self).__init__() diff --git a/fastNLP/modules/encoder/masked_rnn.py b/fastNLP/modules/encoder/masked_rnn.py index c1ef15d0..321546c4 100644 --- a/fastNLP/modules/encoder/masked_rnn.py +++ b/fastNLP/modules/encoder/masked_rnn.py @@ -5,6 +5,8 @@ import torch.nn as nn import torch.nn.functional as F from fastNLP.modules.utils import initial_parameter + + def MaskedRecurrent(reverse=False): def forward(input, hidden, cell, mask, train=True, dropout=0): """ @@ -254,16 +256,16 @@ class MaskedRNNBase(nn.Module): return output, hidden def step(self, input, hx=None, mask=None): - ''' - execute one step forward (only for one-directional RNN). - Args: - input (batch, input_size): input tensor of this step. - hx (num_layers, batch, hidden_size): the hidden state of last step. - mask (batch): the mask tensor of this step. - Returns: - output (batch, hidden_size): tensor containing the output of this step from the last layer of RNN. - hn (num_layers, batch, hidden_size): tensor containing the hidden state of this step - ''' + """Execute one step forward (only for one-directional RNN). + + :param Tensor input: input tensor of this step. (batch, input_size) + :param Tensor hx: the hidden state of last step. (num_layers, batch, hidden_size) + :param Tensor mask: the mask tensor of this step. (batch, ) + :returns: + **output** (batch, hidden_size), tensor containing the output of this step from the last layer of RNN. + **hn** (num_layers, batch, hidden_size), tensor containing the hidden state of this step + + """ assert not self.bidirectional, "step only cannot be applied to bidirectional RNN." # aha, typo! batch_size = input.size(0) lstm = self.Cell is nn.LSTMCell @@ -285,25 +287,23 @@ class MaskedRNN(MaskedRNNBase): r"""Applies a multi-layer Elman RNN with costomized non-linearity to an input sequence. For each element in the input sequence, each layer computes the following - function: - .. math:: - h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh}) + function. :math:`h_t = \tanh(w_{ih} * x_t + b_{ih} + w_{hh} * h_{(t-1)} + b_{hh})` + where :math:`h_t` is the hidden state at time `t`, and :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer. If nonlinearity='relu', then `ReLU` is used instead of `tanh`. - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' - bias: If False, then the layer does not use bias weights b_ih and b_hh. - Default: True - batch_first: If True, then the input and output tensors are provided - as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each - RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False + + + :param int input_size: The number of expected features in the input x + :param int hidden_size: The number of features in the hidden state h + :param int num_layers: Number of recurrent layers. 
+ :param str nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' + :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True + :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) + :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer + :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, h_0 - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence. @@ -327,32 +327,33 @@ class MaskedLSTM(MaskedRNNBase): r"""Applies a multi-layer long short-term memory (LSTM) RNN to an input sequence. For each element in the input sequence, each layer computes the following - function: + function. + .. math:: - \begin{array}{ll} - i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ - f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ - g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\ - o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ - c_t = f_t * c_{(t-1)} + i_t * g_t \\ - h_t = o_t * \tanh(c_t) - \end{array} + + \begin{array}{ll} + i_t = \mathrm{sigmoid}(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\ + f_t = \mathrm{sigmoid}(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\ + g_t = \tanh(W_{ig} x_t + b_{ig} + W_{hc} h_{(t-1)} + b_{hg}) \\ + o_t = \mathrm{sigmoid}(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\ + c_t = f_t * c_{(t-1)} + i_t * g_t \\ + h_t = o_t * \tanh(c_t) + \end{array} + where :math:`h_t` is the hidden state at time `t`, :math:`c_t` is the cell state at time `t`, :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer, and :math:`i_t`, :math:`f_t`, :math:`g_t`, :math:`o_t` are the input, forget, cell, and out gates, respectively. - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - bias: If False, then the layer does not use bias weights b_ih and b_hh. - Default: True - batch_first: If True, then the input and output tensors are provided - as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each - RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False + + :param int input_size: The number of expected features in the input x + :param int hidden_size: The number of features in the hidden state h + :param int num_layers: Number of recurrent layers. + :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True + :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) + :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer + :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, (h_0, c_0) - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence. @@ -380,29 +381,30 @@ class MaskedGRU(MaskedRNNBase): r"""Applies a multi-layer gated recurrent unit (GRU) RNN to an input sequence. For each element in the input sequence, each layer computes the following function: + ..
math:: + \begin{array}{ll} r_t = \mathrm{sigmoid}(W_{ir} x_t + b_{ir} + W_{hr} h_{(t-1)} + b_{hr}) \\ z_t = \mathrm{sigmoid}(W_{iz} x_t + b_{iz} + W_{hz} h_{(t-1)} + b_{hz}) \\ n_t = \tanh(W_{in} x_t + b_{in} + r_t * (W_{hn} h_{(t-1)}+ b_{hn})) \\ h_t = (1 - z_t) * n_t + z_t * h_{(t-1)} \\ \end{array} + where :math:`h_t` is the hidden state at time `t`, :math:`x_t` is the hidden state of the previous layer at time `t` or :math:`input_t` for the first layer, and :math:`r_t`, :math:`z_t`, :math:`n_t` are the reset, input, and new gates, respectively. - Args: - input_size: The number of expected features in the input x - hidden_size: The number of features in the hidden state h - num_layers: Number of recurrent layers. - nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' - bias: If False, then the layer does not use bias weights b_ih and b_hh. - Default: True - batch_first: If True, then the input and output tensors are provided - as (batch, seq, feature) - dropout: If non-zero, introduces a dropout layer on the outputs of each - RNN layer except the last layer - bidirectional: If True, becomes a bidirectional RNN. Default: False + + :param int input_size: The number of expected features in the input x + :param int hidden_size: The number of features in the hidden state h + :param int num_layers: Number of recurrent layers. + :param str nonlinearity: The non-linearity to use ['tanh'|'relu']. Default: 'tanh' + :param bool bias: If False, then the layer does not use bias weights b_ih and b_hh. Default: True + :param bool batch_first: If True, then the input and output tensors are provided as (batch, seq, feature) + :param float dropout: If non-zero, introduces a dropout layer on the outputs of each RNN layer except the last layer + :param bool bidirectional: If True, becomes a bidirectional RNN. Default: False + Inputs: input, mask, h_0 - **input** (seq_len, batch, input_size): tensor containing the features of the input sequence.
diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index 46badcfe..615a6f34 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -1,10 +1,9 @@ -import torch from torch import nn -import torch.nn.functional as F from ..aggregator.attention import MultiHeadAtte from ..other_modules import LayerNormalization + class TransformerEncoder(nn.Module): class SubLayer(nn.Module): def __init__(self, input_size, output_size, key_size, value_size, num_atte): @@ -12,8 +11,8 @@ class TransformerEncoder(nn.Module): self.atte = MultiHeadAtte(input_size, output_size, key_size, value_size, num_atte) self.norm1 = LayerNormalization(output_size) self.ffn = nn.Sequential(nn.Linear(output_size, output_size), - nn.ReLU(), - nn.Linear(output_size, output_size)) + nn.ReLU(), + nn.Linear(output_size, output_size)) self.norm2 = LayerNormalization(output_size) def forward(self, input, seq_mask): @@ -28,5 +27,3 @@ class TransformerEncoder(nn.Module): def forward(self, x, seq_mask=None): return self.layers(x, seq_mask) - - diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index f4a37cf4..6c1b0d46 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,5 +1,3 @@ -import math - import torch import torch.nn as nn from torch.nn.utils.rnn import PackedSequence @@ -9,15 +7,17 @@ from fastNLP.modules.utils import initial_parameter try: from torch import flip except ImportError: - def flip(x, dims): + def flip(x, dims): indices = [slice(None)] * x.dim() for dim in dims: indices[dim] = torch.arange(x.size(dim) - 1, -1, -1, dtype=torch.long, device=x.device) return x[tuple(indices)] + class VarRnnCellWrapper(nn.Module): """Wrapper for normal RNN Cells, make it support variational dropout """ + def __init__(self, cell, hidden_size, input_p, hidden_p): super(VarRnnCellWrapper, self).__init__() self.cell = cell @@ -32,9 +32,9 @@ class VarRnnCellWrapper(nn.Module): for other RNN, h_0, [batch_size, hidden_size] :param mask_x: [batch_size, input_size] dropout mask for input :param mask_h: [batch_size, hidden_size] dropout mask for hidden - :return output: [seq_len, bacth_size, hidden_size] - hidden: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size] - for other RNN, h_n, [batch_size, hidden_size] + :return: (output, hidden) + **output**: [seq_len, batch_size, hidden_size]. + **hidden**: for LSTM, tuple of (h_n, c_n), [batch_size, hidden_size]; For other RNN, h_n, [batch_size, hidden_size]. """ is_lstm = isinstance(hidden, tuple) input = input * mask_x.unsqueeze(0) if mask_x is not None else input @@ -56,6 +56,7 @@ class VarRNNBase(nn.Module): refer to `A Theoretically Grounded Application of Dropout in Recurrent Neural Networks (Yarin Gal and Zoubin Ghahramani, 2016) https://arxiv.org/abs/1512.05287`. """ + def __init__(self, mode, Cell, input_size, hidden_size, num_layers=1, bias=True, batch_first=False, input_dropout=0, hidden_dropout=0, bidirectional=False): @@ -138,17 +139,22 @@ class VarRNNBase(nn.Module): class VarLSTM(VarRNNBase): """Variational Dropout LSTM. """ + def __init__(self, *args, **kwargs): super(VarLSTM, self).__init__(mode="LSTM", Cell=nn.LSTMCell, *args, **kwargs) + class VarRNN(VarRNNBase): """Variational Dropout RNN. """ + def __init__(self, *args, **kwargs): super(VarRNN, self).__init__(mode="RNN", Cell=nn.RNNCell, *args, **kwargs) + class VarGRU(VarRNNBase): """Variational Dropout GRU.
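+ + Example (a minimal sketch; constructor arguments are forwarded to ``VarRNNBase``):: + + gru = VarGRU(input_size=100, hidden_size=200, batch_first=True)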
""" + def __init__(self, *args, **kwargs): super(VarGRU, self).__init__(mode="GRU", Cell=nn.GRUCell, *args, **kwargs) diff --git a/fastNLP/modules/other_modules.py b/fastNLP/modules/other_modules.py index 5cd10e7e..92a08ba1 100644 --- a/fastNLP/modules/other_modules.py +++ b/fastNLP/modules/other_modules.py @@ -29,8 +29,11 @@ class GroupNorm(nn.Module): class LayerNormalization(nn.Module): - """ Layer normalization module """ + """ + :param int layer_size: + :param float eps: default=1e-3 + """ def __init__(self, layer_size, eps=1e-3): super(LayerNormalization, self).__init__() @@ -52,12 +55,11 @@ class LayerNormalization(nn.Module): class BiLinear(nn.Module): def __init__(self, n_left, n_right, n_out, bias=True): """ - Args: - n_left: size of left input - n_right: size of right input - n_out: size of output - bias: If set to False, the layer will not learn an additive bias. - Default: True + + :param int n_left: size of left input + :param int n_right: size of right input + :param int n_out: size of output + :param bool bias: If set to False, the layer will not learn an additive bias. Default: True """ super(BiLinear, self).__init__() self.n_left = n_left @@ -83,12 +85,9 @@ class BiLinear(nn.Module): def forward(self, input_left, input_right): """ - Args: - input_left: Tensor - the left input tensor with shape = [batch1, batch2, ..., left_features] - input_right: Tensor - the right input tensor with shape = [batch1, batch2, ..., right_features] - Returns: + :param Tensor input_left: the left input tensor with shape = [batch1, batch2, ..., left_features] + :param Tensor input_right: the right input tensor with shape = [batch1, batch2, ..., right_features] + """ left_size = input_left.size() right_size = input_right.size() @@ -118,16 +117,11 @@ class BiLinear(nn.Module): class BiAffine(nn.Module): def __init__(self, n_enc, n_dec, n_labels, biaffine=True, **kwargs): """ - Args: - n_enc: int - the dimension of the encoder input. - n_dec: int - the dimension of the decoder input. - n_labels: int - the number of labels of the crf layer - biaffine: bool - if apply bi-affine parameter. - **kwargs: + + :param int n_enc: the dimension of the encoder input. + :param int n_dec: the dimension of the decoder input. + :param int n_labels: the number of labels of the crf layer + :param bool biaffine: if apply bi-affine parameter. 
""" super(BiAffine, self).__init__() self.n_enc = n_enc @@ -154,17 +148,12 @@ class BiAffine(nn.Module): def forward(self, input_d, input_e, mask_d=None, mask_e=None): """ - Args: - input_d: Tensor - the decoder input tensor with shape = [batch, length_decoder, input_size] - input_e: Tensor - the child input tensor with shape = [batch, length_encoder, input_size] - mask_d: Tensor or None - the mask tensor for decoder with shape = [batch, length_decoder] - mask_e: Tensor or None - the mask tensor for encoder with shape = [batch, length_encoder] - Returns: Tensor - the energy tensor with shape = [batch, num_label, length, length] + + :param Tensor input_d: the decoder input tensor with shape = [batch, length_decoder, input_size] + :param Tensor input_e: the child input tensor with shape = [batch, length_encoder, input_size] + :param mask_d: Tensor or None, the mask tensor for decoder with shape = [batch, length_decoder] + :param mask_e: Tensor or None, the mask tensor for encoder with shape = [batch, length_encoder] + :returns: Tensor, the energy tensor with shape = [batch, num_label, length, length] """ assert input_d.size(0) == input_e.size(0), 'batch sizes of encoder and decoder are requires to be equal.' batch, length_decoder, _ = input_d.size() diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 5056e181..5287bca4 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -15,7 +15,7 @@ def initial_parameter(net, initial_method=None): """A method used to initialize the weights of PyTorch models. :param net: a PyTorch model - :param initial_method: str, one of the following initializations + :param str initial_method: one of the following initializations. - xavier_uniform - xavier_normal (default) @@ -79,7 +79,7 @@ def seq_mask(seq_len, max_len): :param seq_len: list or torch.Tensor, the lengths of sequences in a batch. :param max_len: int, the maximum sequence length in a batch. - :return mask: torch.LongTensor, [batch_size, max_len] + :return: mask, torch.LongTensor, [batch_size, max_len] """ if not isinstance(seq_len, torch.Tensor):