From ed6fd60aa9ee4f689d688a5de2efe5a3c2121895 Mon Sep 17 00:00:00 2001 From: wyg <1505116161@qq.com> Date: Fri, 23 Aug 2019 14:47:46 +0800 Subject: [PATCH 01/19] [verify] char_cnn use pipe --- .../text_classification/model/BertTC.py | 24 ++++++++++ .../text_classification/train_char_cnn.py | 45 +++++++++++++++---- 2 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 reproduction/text_classification/model/BertTC.py diff --git a/reproduction/text_classification/model/BertTC.py b/reproduction/text_classification/model/BertTC.py new file mode 100644 index 00000000..702c0cd1 --- /dev/null +++ b/reproduction/text_classification/model/BertTC.py @@ -0,0 +1,24 @@ +from fastNLP.embeddings import BertEmbedding +import torch +import torch.nn as nn +from fastNLP.core.const import Const as C + +class BertTC(nn.Module): + def __init__(self, vocab,num_class,bert_model_dir_or_name,fine_tune=False): + super(BertTC, self).__init__() + self.embed=BertEmbedding(vocab, requires_grad=fine_tune, + model_dir_or_name=bert_model_dir_or_name,include_cls_sep=True) + self.classifier = nn.Linear(self.embed.embedding_dim, num_class) + + def forward(self, words): + embedding_cls=self.embed(words)[:,0] + output=self.classifier(embedding_cls) + return {C.OUTPUT: output} + + def predict(self,words): + return self.forward(words) + +if __name__=="__main__": + ta=torch.tensor([[1,2,3],[4,5,6],[7,8,9]]) + tb=ta[:,0] + print(tb) diff --git a/reproduction/text_classification/train_char_cnn.py b/reproduction/text_classification/train_char_cnn.py index 3482de70..6b56608a 100644 --- a/reproduction/text_classification/train_char_cnn.py +++ b/reproduction/text_classification/train_char_cnn.py @@ -8,6 +8,7 @@ sys.path.append('../..') from fastNLP.core.const import Const as C import torch.nn as nn from fastNLP.io.data_loader import YelpLoader +from fastNLP.io.pipe.classification import YelpFullPipe,YelpPolarityPipe,SST2Pipe,IMDBPipe #from data.sstLoader import sst2Loader from model.char_cnn import CharacterLevelCNN from fastNLP import CrossEntropyLoss, AccuracyMetric @@ -46,6 +47,8 @@ class Config(): extra_characters='' max_length=1014 weight_decay = 1e-5 + to_lower=True + tokenizer = 'spacy' # 使用spacy进行分词 char_cnn_config={ "alphabet": { @@ -111,12 +114,35 @@ ops=Config ##1.task相关信息:利用dataloader载入dataInfo #dataloader=SST2Loader() #dataloader=IMDBLoader() -dataloader=YelpLoader(fine_grained=True) -datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) +# dataloader=YelpLoader(fine_grained=True) +# datainfo=dataloader.process(ops.datapath,char_level_op=True,split_dev_op=False) char_vocab=ops.char_cnn_config["alphabet"]["en"]["lower"]["alphabet"] ops.number_of_characters=len(char_vocab) ops.embedding_dim=ops.number_of_characters +# load data set +if ops.task == 'yelp_p': + data_bundle = YelpPolarityPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'yelp_f': + data_bundle = YelpFullPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'imdb': + data_bundle = IMDBPipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +elif ops.task == 'sst-2': + data_bundle = SST2Pipe(lower=ops.to_lower, tokenizer=ops.tokenizer).process_from_file() +else: + raise RuntimeError(f'NOT support {ops.task} task yet!') + + +def wordtochar(words): + chars = [] + for word in words: + #word = word.lower() + for char in word: + chars.append(char) + chars.append('') + chars.pop() + return chars + #chartoindex def chartoindex(chars): 
max_seq_len=ops.max_length @@ -136,13 +162,14 @@ def chartoindex(chars): char_index_list=[zero_index]*max_seq_len return char_index_list -for dataset in datainfo.datasets.values(): +for dataset in data_bundle.datasets.values(): + dataset.apply_field(wordtochar, field_name="raw_words", new_field_name='chars') dataset.apply_field(chartoindex,field_name='chars',new_field_name='chars') -datainfo.datasets['train'].set_input('chars') -datainfo.datasets['test'].set_input('chars') -datainfo.datasets['train'].set_target('target') -datainfo.datasets['test'].set_target('target') +data_bundle.datasets['train'].set_input('chars') +data_bundle.datasets['test'].set_input('chars') +data_bundle.datasets['train'].set_target('target') +data_bundle.datasets['test'].set_target('target') ##2. 定义/组装模型,这里可以随意,就如果是fastNLP封装好的,类似CNNText就直接用初始化调用就好了,这里只是给出一个伪框架表示占位,在这里建立符合fastNLP输入输出规范的model class ModelFactory(nn.Module): @@ -165,7 +192,7 @@ class ModelFactory(nn.Module): ## 2.或直接复用fastNLP的模型 #vocab=datainfo.vocabs['words'] -vocab_label=datainfo.vocabs['target'] +vocab_label=data_bundle.vocabs['target'] ''' # emded_char=CNNCharEmbedding(vocab) # embed_word = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) @@ -212,5 +239,5 @@ if __name__=="__main__": #print(vocab_label) #print(datainfo.datasets["train"]) - train(model,datainfo,loss,metric,optimizer,num_epochs=ops.train_epoch) + train(model,data_bundle,loss,metric,optimizer,num_epochs=ops.train_epoch) \ No newline at end of file From d6c597d32e66121a4f24c3fdbf6f5f0a9ee6e56e Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sun, 25 Aug 2019 11:13:25 +0800 Subject: [PATCH 02/19] add __doc__ & __all__ in module 'embeddings' --- fastNLP/embeddings/__init__.py | 1 - fastNLP/embeddings/bert_embedding.py | 158 ++++++++++++--------- fastNLP/embeddings/char_embedding.py | 68 +++++---- fastNLP/embeddings/contextual_embedding.py | 29 ++-- fastNLP/embeddings/elmo_embedding.py | 77 +++++----- fastNLP/embeddings/embedding.py | 56 ++++---- fastNLP/embeddings/stack_embedding.py | 24 +++- fastNLP/embeddings/static_embedding.py | 61 ++++---- fastNLP/embeddings/utils.py | 16 ++- 9 files changed, 277 insertions(+), 213 deletions(-) diff --git a/fastNLP/embeddings/__init__.py b/fastNLP/embeddings/__init__.py index 37881f17..8a970e25 100644 --- a/fastNLP/embeddings/__init__.py +++ b/fastNLP/embeddings/__init__.py @@ -18,7 +18,6 @@ __all__ = [ "get_embeddings", ] - from .embedding import Embedding, TokenEmbedding from .static_embedding import StaticEmbedding from .elmo_embedding import ElmoEmbedding diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 6a10c489..e8844aa1 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "BertEmbedding", + "BertWordPieceEncoder" +] import os import collections @@ -13,6 +22,7 @@ from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings + class BertEmbedding(ContextualEmbedding): """ 别名::class:`fastNLP.embeddings.BertEmbedding` :class:`fastNLP.embeddings.bert_embedding.BertEmbedding` @@ -54,11 +64,12 @@ class BertEmbedding(ContextualEmbedding): word pieces后的内容,并将第512个word piece置为[SEP]。超过长度的部分的encode结果直接全部置零。一般仅有只使用[CLS] 来进行分类的任务将auto_truncate置为True。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en-base-uncased', layers: str='-1', - pool_method: str='first', word_dropout=0, dropout=0, include_cls_sep: bool=False, - pooled_cls=True, requires_grad: bool=False, auto_truncate:bool=False): + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', + pool_method: str = 'first', word_dropout=0, dropout=0, include_cls_sep: bool = False, + pooled_cls=True, requires_grad: bool = False, auto_truncate: bool = False): super(BertEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: if 'cn' in model_dir_or_name.lower() and pool_method not in ('first', 'last'): @@ -71,21 +82,21 @@ class BertEmbedding(ContextualEmbedding): model_dir = os.path.abspath(os.path.expanduser(model_dir_or_name)) else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self._word_sep_index = None if '[SEP]' in vocab: self._word_sep_index = vocab['[SEP]'] - + self.model = _WordBertModel(model_dir=model_dir, vocab=vocab, layers=layers, pool_method=pool_method, include_cls_sep=include_cls_sep, pooled_cls=pooled_cls, auto_truncate=auto_truncate, min_freq=2) - + self.requires_grad = requires_grad - self._embed_size = len(self.model.layers)*self.model.encoder.hidden_size - + self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size + def _delete_model_weights(self): del self.model - + def forward(self, words): """ 计算words的bert embedding表示。计算之前会在每句话的开始增加[CLS]在结束增加[SEP], 并根据include_cls_sep判断要不要 @@ -100,9 +111,9 @@ class BertEmbedding(ContextualEmbedding): return self.dropout(outputs) outputs = self.model(words) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -120,7 +131,7 @@ class BertEmbedding(ContextualEmbedding): if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) return words - + @property def requires_grad(self): """ @@ -129,12 +140,12 @@ class BertEmbedding(ContextualEmbedding): :return: """ requires_grads = set([param.requires_grad for name, param in self.named_parameters() - if 'word_pieces_lengths' not in name]) + if 'word_pieces_lengths' not in name]) if len(requires_grads) == 1: return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -155,10 +166,11 @@ class BertWordPieceEncoder(nn.Module): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 :param bool requires_grad: 是否需要gradient。 """ - def __init__(self, model_dir_or_name: str='en-base-uncased', layers: str='-1', pooled_cls: bool = False, - word_dropout=0, dropout=0, requires_grad: bool=False): + + def __init__(self, model_dir_or_name: str = 'en-base-uncased', layers: str = '-1', pooled_cls: bool = 
False, + word_dropout=0, dropout=0, requires_grad: bool = False): super().__init__() - + if model_dir_or_name.lower() in PRETRAINED_BERT_MODEL_DIR: model_url = _get_embedding_url('bert', model_dir_or_name.lower()) model_dir = cached_path(model_url, name='embedding') @@ -167,7 +179,7 @@ class BertWordPieceEncoder(nn.Module): model_dir = model_dir_or_name else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index @@ -175,7 +187,7 @@ class BertWordPieceEncoder(nn.Module): self.requires_grad = requires_grad self.word_dropout = word_dropout self.dropout_layer = nn.Dropout(dropout) - + @property def requires_grad(self): """ @@ -187,24 +199,24 @@ class BertWordPieceEncoder(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): param.requires_grad = value - + @property def embed_size(self): return self._embed_size - + @property def embedding_dim(self): return self._embed_size - + @property def num_embedding(self): return self.model.encoder.config.vocab_size - + def index_datasets(self, *datasets, field_name, add_cls_sep=True): """ 使用bert的tokenizer新生成word_pieces列加入到datasets中,并将他们设置为input,且将word_pieces这一列的pad value设置为了 @@ -216,7 +228,7 @@ class BertWordPieceEncoder(nn.Module): :return: """ self.model.index_dataset(*datasets, field_name=field_name, add_cls_sep=add_cls_sep) - + def forward(self, word_pieces, token_type_ids=None): """ 计算words的bert embedding表示。传入的words中应该自行包含[CLS]与[SEP]的tag。 @@ -233,13 +245,13 @@ class BertWordPieceEncoder(nn.Module): token_type_ids = sep_mask_cumsum.fmod(2) if token_type_ids[0, 0].item(): # 如果开头是奇数,则需要flip一下结果,因为需要保证开头为0 token_type_ids = token_type_ids.eq(0).long() - + word_pieces = self.drop_word(word_pieces) outputs = self.model(word_pieces, token_type_ids) outputs = torch.cat([*outputs], dim=-1) - + return self.dropout_layer(outputs) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -260,10 +272,10 @@ class BertWordPieceEncoder(nn.Module): class _WordBertModel(nn.Module): - def __init__(self, model_dir:str, vocab:Vocabulary, layers:str='-1', pool_method:str='first', - include_cls_sep:bool=False, pooled_cls:bool=False, auto_truncate:bool=False, min_freq=2): + def __init__(self, model_dir: str, vocab: Vocabulary, layers: str = '-1', pool_method: str = 'first', + include_cls_sep: bool = False, pooled_cls: bool = False, auto_truncate: bool = False, min_freq=2): super().__init__() - + self.tokenzier = BertTokenizer.from_pretrained(model_dir) self.encoder = BertModel.from_pretrained(model_dir) self._max_position_embeddings = self.encoder.config.max_position_embeddings @@ -271,23 +283,23 @@ class _WordBertModel(nn.Module): encoder_layer_number = len(self.encoder.encoder.layer) self.layers = list(map(int, layers.split(','))) for layer in self.layers: - if layer<0: - assert -layer<=encoder_layer_number, f"The layer index:{layer} is out of scope for " \ - f"a bert model with {encoder_layer_number} layers." + if layer < 0: + assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \ + f"a bert model with {encoder_layer_number} layers." 
else: - assert layer=min_freq and not vocab._is_word_no_create_entry(word): #出现次数大于这个次数才新增 + if index != vocab.unknown_idx and word_pieces[0] == '[UNK]': # 说明这个词不在原始的word里面 + if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry( + word): # 出现次数大于这个次数才新增 word_piece_dict[word] = 1 # 新增一个值 continue for word_piece in word_pieces: @@ -327,7 +340,7 @@ class _WordBertModel(nn.Module): new_word_piece_vocab[token] = len(new_word_piece_vocab) self.tokenzier._reinit_on_new_vocab(new_word_piece_vocab) self.encoder.embeddings.word_embeddings = embed - + word_to_wordpieces = [] word_pieces_lengths = [] for word, index in vocab: @@ -347,7 +360,7 @@ class _WordBertModel(nn.Module): self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) print("Successfully generate word pieces.") - + def forward(self, words): """ @@ -358,34 +371,37 @@ class _WordBertModel(nn.Module): batch_size, max_word_len = words.size() word_mask = words.ne(self._word_pad_index) # 为1的地方有word seq_len = word_mask.sum(dim=-1) - batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), 0) # batch_size x max_len + batch_word_pieces_length = self.word_pieces_lengths[words].masked_fill(word_mask.eq(0), + 0) # batch_size x max_len word_pieces_lengths = batch_word_pieces_length.sum(dim=-1) # batch_size word_piece_length = batch_word_pieces_length.sum(dim=-1).max().item() # 表示word piece的长度(包括padding) - if word_piece_length+2>self._max_position_embeddings: + if word_piece_length + 2 > self._max_position_embeddings: if self.auto_truncate: - word_pieces_lengths = word_pieces_lengths.masked_fill(word_pieces_lengths+2>self._max_position_embeddings, - self._max_position_embeddings-2) + word_pieces_lengths = word_pieces_lengths.masked_fill( + word_pieces_lengths + 2 > self._max_position_embeddings, + self._max_position_embeddings - 2) else: - raise RuntimeError("After split words into word pieces, the lengths of word pieces are longer than the " - f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") - + raise RuntimeError( + "After split words into word pieces, the lengths of word pieces are longer than the " + f"maximum allowed sequence length:{self._max_position_embeddings} of bert.") + # +2是由于需要加入[CLS]与[SEP] - word_pieces = words.new_full((batch_size, min(word_piece_length+2, self._max_position_embeddings)), + word_pieces = words.new_full((batch_size, min(word_piece_length + 2, self._max_position_embeddings)), fill_value=self._wordpiece_pad_index) attn_masks = torch.zeros_like(word_pieces) # 1. 
获取words的word_pieces的id,以及对应的span范围 word_indexes = words.cpu().numpy() for i in range(batch_size): word_pieces_i = list(chain(*self.word_to_wordpieces[word_indexes[i, :seq_len[i]]])) - if self.auto_truncate and len(word_pieces_i)>self._max_position_embeddings-2: - word_pieces_i = word_pieces_i[:self._max_position_embeddings-2] - word_pieces[i, 1:word_pieces_lengths[i]+1] = torch.LongTensor(word_pieces_i) - attn_masks[i, :word_pieces_lengths[i]+2].fill_(1) + if self.auto_truncate and len(word_pieces_i) > self._max_position_embeddings - 2: + word_pieces_i = word_pieces_i[:self._max_position_embeddings - 2] + word_pieces[i, 1:word_pieces_lengths[i] + 1] = torch.LongTensor(word_pieces_i) + attn_masks[i, :word_pieces_lengths[i] + 2].fill_(1) # 添加[cls]和[sep] word_pieces[:, 0].fill_(self._cls_index) batch_indexes = torch.arange(batch_size).to(words) - word_pieces[batch_indexes, word_pieces_lengths+1] = self._sep_index - if self._has_sep_in_vocab: #但[SEP]在vocab中出现应该才会需要token_ids + word_pieces[batch_indexes, word_pieces_lengths + 1] = self._sep_index + if self._has_sep_in_vocab: # 但[SEP]在vocab中出现应该才会需要token_ids sep_mask = word_pieces.eq(self._sep_index) # batch_size x max_len sep_mask_cumsum = sep_mask.flip(dims=[-1]).cumsum(dim=-1).flip(dims=[-1]) token_type_ids = sep_mask_cumsum.fmod(2) @@ -396,9 +412,9 @@ class _WordBertModel(nn.Module): # 2. 获取hidden的结果,根据word_pieces进行对应的pool计算 # all_outputs: [batch_size x max_len x hidden_size, batch_size x max_len x hidden_size, ...] bert_outputs, pooled_cls = self.encoder(word_pieces, token_type_ids=token_type_ids, attention_mask=attn_masks, - output_all_encoded_layers=True) + output_all_encoded_layers=True) # output_layers = [self.layers] # len(self.layers) x batch_size x real_word_piece_length x hidden_size - + if self.include_cls_sep: outputs = bert_outputs[-1].new_zeros(len(self.layers), batch_size, max_word_len + 2, bert_outputs[-1].size(-1)) @@ -414,7 +430,7 @@ class _WordBertModel(nn.Module): real_word_piece_length = output_layer.size(1) - 2 if word_piece_length > real_word_piece_length: # 如果实际上是截取出来的 paddings = output_layer.new_zeros(batch_size, - word_piece_length-real_word_piece_length, + word_piece_length - real_word_piece_length, output_layer.size(2)) output_layer = torch.cat((output_layer, paddings), dim=1).contiguous() # 从word_piece collapse到word的表示 @@ -423,27 +439,27 @@ class _WordBertModel(nn.Module): if self.pool_method == 'first': for i in range(batch_size): i_word_pieces_cum_length = batch_word_pieces_cum_length[i, :seq_len[i]] # 每个word的start位置 - outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size + outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[ + i, i_word_pieces_cum_length] # num_layer x batch_size x len x hidden_size elif self.pool_method == 'last': for i in range(batch_size): - i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i]+1] - 1 # 每个word的end + i_word_pieces_cum_length = batch_word_pieces_cum_length[i, 1:seq_len[i] + 1] - 1 # 每个word的end outputs[l_index, i, s_shift:outputs_seq_len[i]] = truncate_output_layer[i, i_word_pieces_cum_length] elif self.pool_method == 'max': for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + 
outputs[l_index, i, j + s_shift], _ = torch.max(truncate_output_layer[i, start:end], dim=-2) else: for i in range(batch_size): for j in range(seq_len[i]): - start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j+1] - outputs[l_index, i, j+s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) + start, end = batch_word_pieces_cum_length[i, j], batch_word_pieces_cum_length[i, j + 1] + outputs[l_index, i, j + s_shift] = torch.mean(truncate_output_layer[i, start:end], dim=-2) if self.include_cls_sep: - if l in (len(bert_outputs)-1, -1) and self.pooled_cls: + if l in (len(bert_outputs) - 1, -1) and self.pooled_cls: outputs[l_index, :, 0] = pooled_cls else: outputs[l_index, :, 0] = output_layer[:, 0] - outputs[l_index, batch_indexes, seq_len+s_shift] = output_layer[batch_indexes, seq_len+s_shift] + outputs[l_index, batch_indexes, seq_len + s_shift] = output_layer[batch_indexes, seq_len + s_shift] # 3. 最终的embedding结果 return outputs - diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 520e85e6..24c84314 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -3,6 +3,10 @@ 词的index而不需要使用词语中的char的index来获取表达。 """ +__all__ = [ + "CNNCharEmbedding", + "LSTMCharEmbedding" +] import torch import torch.nn as nn @@ -16,6 +20,7 @@ from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab from .utils import get_embeddings + class CNNCharEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.CNNCharEmbedding` :class:`fastNLP.embeddings.char_embedding.CNNCharEmbedding` @@ -49,14 +54,15 @@ class CNNCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. """ - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, filter_nums: List[int]=(40, 30, 20), kernel_sizes: List[int]=(5, 3, 1), - pool_method: str='max', activation='relu', min_char_freq: int=2, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, filter_nums: List[int] = (40, 30, 20), kernel_sizes: List[int] = (5, 3, 1), + pool_method: str = 'max', activation='relu', min_char_freq: int = 2, pre_train_char_embed: str = None): super(CNNCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + for kernel in kernel_sizes: assert kernel % 2 == 1, "Only odd kernel is allowed." 
- + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -74,7 +80,7 @@ class CNNCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -96,14 +102,14 @@ class CNNCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, model_dir_or_name=pre_train_char_embed) else: self.char_embedding = get_embeddings((len(self.char_vocab), char_emb_size)) - + self.convs = nn.ModuleList([nn.Conv1d( char_emb_size, filter_nums[i], kernel_size=kernel_sizes[i], bias=True, padding=kernel_sizes[i] // 2) for i in range(len(kernel_sizes))]) self._embed_size = embed_size self.fc = nn.Linear(sum(filter_nums), embed_size) self.reset_parameters() - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -114,14 +120,14 @@ class CNNCharEmbedding(TokenEmbedding): words = self.drop_word(words) batch_size, max_len = words.size() chars = self.words_to_chars_embedding[words] # batch_size x max_len x max_word_len - word_lengths = self.word_lengths[words] # batch_size x max_len + word_lengths = self.word_lengths[words] # batch_size x max_len max_word_len = word_lengths.max() chars = chars[:, :, :max_word_len] # 为1的地方为mask chars_masks = chars.eq(self.char_pad_index) # batch_size x max_len x max_word_len 如果为0, 说明是padding的位置了 chars = self.char_embedding(chars) # batch_size x max_len x max_word_len x embed_size chars = self.dropout(chars) - reshaped_chars = chars.reshape(batch_size*max_len, max_word_len, -1) + reshaped_chars = chars.reshape(batch_size * max_len, max_word_len, -1) reshaped_chars = reshaped_chars.transpose(1, 2) # B' x E x M conv_chars = [conv(reshaped_chars).transpose(1, 2).reshape(batch_size, max_len, max_word_len, -1) for conv in self.convs] @@ -129,13 +135,13 @@ class CNNCharEmbedding(TokenEmbedding): conv_chars = self.activation(conv_chars) if self.pool_method == 'max': conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) - chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) + chars, _ = torch.max(conv_chars, dim=-2) # batch_size x max_len x sum(filters) else: conv_chars = conv_chars.masked_fill(chars_masks.unsqueeze(-1), 0) - chars = torch.sum(conv_chars, dim=-2)/chars_masks.eq(0).sum(dim=-1, keepdim=True).float() + chars = torch.sum(conv_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() chars = self.fc(chars) return self.dropout(chars) - + @property def requires_grad(self): """ @@ -151,21 +157,21 @@ class CNNCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能加入到requires_grad中 continue param.requires_grad = value - + def reset_parameters(self): for name, param in self.named_parameters(): if 'words_to_chars_embedding' in name or 'word_lengths' in name: # 这个不能reset continue if 'char_embedding' in name: continue - if param.data.dim()>1: + if param.data.dim() > 1: nn.init.xavier_uniform_(param, 1) else: nn.init.uniform_(param, -1, 1) @@ -203,13 +209,15 @@ class LSTMCharEmbedding(TokenEmbedding): (文件夹下应该只有一个以.txt作为后缀的文件)或文件路径;第二种是传入embedding的名称,第二种情况将自动查看缓存中是否存在该模型, 没有的话将自动下载。如果输入为None则使用embedding_dim的维度随机初始化一个embedding. 
""" - def __init__(self, vocab: Vocabulary, embed_size: int=50, char_emb_size: int=50, word_dropout:float=0, - dropout:float=0, hidden_size=50,pool_method: str='max', activation='relu', min_char_freq: int=2, - bidirectional=True, pre_train_char_embed: str=None): + + def __init__(self, vocab: Vocabulary, embed_size: int = 50, char_emb_size: int = 50, word_dropout: float = 0, + dropout: float = 0, hidden_size=50, pool_method: str = 'max', activation='relu', + min_char_freq: int = 2, + bidirectional=True, pre_train_char_embed: str = None): super(LSTMCharEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + assert hidden_size % 2 == 0, "Only even kernel is allowed." - + assert pool_method in ('max', 'avg') self.pool_method = pool_method # activation function @@ -227,7 +235,7 @@ class LSTMCharEmbedding(TokenEmbedding): else: raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - + print("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) @@ -249,14 +257,14 @@ class LSTMCharEmbedding(TokenEmbedding): self.char_embedding = StaticEmbedding(self.char_vocab, pre_train_char_embed) else: self.char_embedding = nn.Embedding(len(self.char_vocab), char_emb_size) - + self.fc = nn.Linear(hidden_size, embed_size) hidden_size = hidden_size // 2 if bidirectional else hidden_size - + self.lstm = LSTM(char_emb_size, hidden_size, bidirectional=bidirectional, batch_first=True) self._embed_size = embed_size self.bidirectional = bidirectional - + def forward(self, words): """ 输入words的index后,生成对应的words的表示。 @@ -278,7 +286,7 @@ class LSTMCharEmbedding(TokenEmbedding): char_seq_len = chars_masks.eq(0).sum(dim=-1).reshape(batch_size * max_len) lstm_chars = self.lstm(reshaped_chars, char_seq_len)[0].reshape(batch_size, max_len, max_word_len, -1) # B x M x M x H - + lstm_chars = self.activation(lstm_chars) if self.pool_method == 'max': lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), float('-inf')) @@ -286,11 +294,11 @@ class LSTMCharEmbedding(TokenEmbedding): else: lstm_chars = lstm_chars.masked_fill(chars_masks.unsqueeze(-1), 0) chars = torch.sum(lstm_chars, dim=-2) / chars_masks.eq(0).sum(dim=-1, keepdim=True).float() - + chars = self.fc(chars) - + return self.dropout(chars) - + @property def requires_grad(self): """ @@ -307,7 +315,7 @@ class LSTMCharEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 152b0ab9..2a1e2f82 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -1,3 +1,12 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "ContextualEmbedding" +] + from abc import abstractmethod import torch @@ -8,16 +17,12 @@ from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding -__all__ = [ - "ContextualEmbedding" -] - class ContextualEmbedding(TokenEmbedding): - def __init__(self, vocab: Vocabulary, word_dropout:float=0.0, dropout:float=0.0): + def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): super(ContextualEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - - def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool=True): + + def add_sentence_cache(self, *datasets, batch_size=32, device='cpu', delete_weights: bool = True): """ 由于动态embedding生成比较耗时,所以可以把每句话embedding缓存下来,这样就不需要每次都运行生成过程。 @@ -34,7 +39,7 @@ class ContextualEmbedding(TokenEmbedding): except Exception as e: print(f"Exception happens at {index} dataset.") raise e - + sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) @@ -54,7 +59,7 @@ class ContextualEmbedding(TokenEmbedding): word_embeds = self(words).detach().cpu().numpy() for b in range(words.size(0)): length = seq_len_from_behind[b] - if length==0: + if length == 0: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b] else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] @@ -65,7 +70,7 @@ class ContextualEmbedding(TokenEmbedding): self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() - + def _get_sent_reprs(self, words): """ 获取sentence的表示,如果有缓存,则返回缓存的值; 没有缓存则返回None @@ -88,12 +93,12 @@ class ContextualEmbedding(TokenEmbedding): embeds[i, :len(embed)] = torch.FloatTensor(embed).to(words.device) return embeds return None - + @abstractmethod def _delete_model_weights(self): """删除计算表示的模型以节省资源""" raise NotImplementedError - + def remove_sentence_cache(self): """ 删除缓存的句子表示. 删除之后如果模型权重没有被删除,将开始使用动态计算权重。 diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index 24cd052e..fb5388fd 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -1,6 +1,13 @@ +""" +.. 
todo:: + doc +""" -import os +__all__ = [ + "ElmoEmbedding" +] +import os import torch import torch.nn as nn import torch.nn.functional as F @@ -49,11 +56,11 @@ class ElmoEmbedding(ContextualEmbedding): :param cache_word_reprs: 可以选择对word的表示进行cache; 设置为True的话,将在初始化的时候为每个word生成对应的embedding, 并删除character encoder,之后将直接使用cache的embedding。默认为False。 """ - + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', layers: str = '2', requires_grad: bool = False, word_dropout=0.0, dropout=0.0, cache_word_reprs: bool = False): super(ElmoEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - + # 根据model_dir_or_name检查是否存在并下载 if model_dir_or_name.lower() in PRETRAINED_ELMO_MODEL_DIR: model_url = _get_embedding_url('elmo', model_dir_or_name.lower()) @@ -64,7 +71,7 @@ class ElmoEmbedding(ContextualEmbedding): else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") self.model = _ElmoModel(model_dir, vocab, cache_word_reprs=cache_word_reprs) - + if layers == 'mix': self.layer_weights = nn.Parameter(torch.zeros(self.model.config['lstm']['n_layers'] + 1), requires_grad=requires_grad) @@ -79,16 +86,16 @@ class ElmoEmbedding(ContextualEmbedding): self.layers = layers self._get_outputs = self._get_layer_outputs self._embed_size = len(self.layers) * self.model.config['lstm']['projection_dim'] * 2 - + self.requires_grad = requires_grad - + def _get_mixed_outputs(self, outputs): # outputs: num_layers x batch_size x max_len x hidden_size # return: batch_size x max_len x hidden_size weights = F.softmax(self.layer_weights + 1 / len(outputs), dim=0).to(outputs) outputs = torch.einsum('l,lbij->bij', weights, outputs) return self.gamma.to(outputs) * outputs - + def set_mix_weights_requires_grad(self, flag=True): """ 当初始化ElmoEmbedding时layers被设置为mix时,可以通过调用该方法设置mix weights是否可训练。如果layers不是mix,调用 @@ -100,15 +107,15 @@ class ElmoEmbedding(ContextualEmbedding): if hasattr(self, 'layer_weights'): self.layer_weights.requires_grad = flag self.gamma.requires_grad = flag - + def _get_layer_outputs(self, outputs): if len(self.layers) == 1: outputs = outputs[self.layers[0]] else: outputs = torch.cat(tuple([*outputs[self.layers]]), dim=-1) - + return outputs - + def forward(self, words: torch.LongTensor): """ 计算words的elmo embedding表示。根据elmo文章中介绍的ELMO实际上是有2L+1层结果,但是为了让结果比较容易拆分,token的 @@ -125,12 +132,12 @@ class ElmoEmbedding(ContextualEmbedding): outputs = self.model(words) outputs = self._get_outputs(outputs) return self.dropout(outputs) - + def _delete_model_weights(self): for name in ['layers', 'model', 'layer_weights', 'gamma']: if hasattr(self, name): delattr(self, name) - + @property def requires_grad(self): """ @@ -144,7 +151,7 @@ class ElmoEmbedding(ContextualEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for name, param in self.named_parameters(): @@ -162,7 +169,7 @@ class _ElmoModel(nn.Module): (4) 设计一个保存token的embedding,允许缓存word的表示。 """ - + def __init__(self, model_dir: str, vocab: Vocabulary = None, cache_word_reprs: bool = False): super(_ElmoModel, self).__init__() self.model_dir = model_dir @@ -187,14 +194,14 @@ class _ElmoModel(nn.Module): config = json.load(config_f) self.weight_file = os.path.join(model_dir, weight_file) self.config = config - + OOV_TAG = '' PAD_TAG = '' BOS_TAG = '' EOS_TAG = '' BOW_TAG = '' EOW_TAG = '' - + # For the model trained with character-based word encoder. 
char_lexicon = {} with codecs.open(os.path.join(model_dir, 'char.dic'), 'r', encoding='utf-8') as fpi: @@ -204,29 +211,29 @@ class _ElmoModel(nn.Module): tokens.insert(0, '\u3000') token, i = tokens char_lexicon[token] = int(i) - + # 做一些sanity check for special_word in [PAD_TAG, OOV_TAG, BOW_TAG, EOW_TAG]: assert special_word in char_lexicon, f"{special_word} not found in char.dic." - + # 从vocab中构建char_vocab char_vocab = Vocabulary(unknown=OOV_TAG, padding=PAD_TAG) # 需要保证在里面 char_vocab.add_word_lst([BOW_TAG, EOW_TAG, BOS_TAG, EOS_TAG]) - + for word, index in vocab: char_vocab.add_word_lst(list(word)) - + self.bos_index, self.eos_index, self._pad_index = len(vocab), len(vocab) + 1, vocab.padding_idx # 根据char_lexicon调整, 多设置一位,是预留给word padding的(该位置的char表示为全0表示) char_emb_layer = nn.Embedding(len(char_vocab) + 1, int(config['char_cnn']['embedding']['dim']), padding_idx=len(char_vocab)) - + # 读入预训练权重 这里的elmo_model 包含char_cnn和 lstm 的 state_dict elmo_model = torch.load(os.path.join(self.model_dir, weight_file), map_location='cpu') - + char_embed_weights = elmo_model["char_cnn"]['char_emb_layer.weight'] - + found_char_count = 0 for char, index in char_vocab: # 调整character embedding if char in char_lexicon: @@ -235,11 +242,11 @@ class _ElmoModel(nn.Module): else: index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - + print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] - + self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab) + 2, max_chars), fill_value=len(char_vocab), dtype=torch.long), @@ -258,20 +265,20 @@ class _ElmoModel(nn.Module): char_vocab.to_index(EOW_TAG)] char_ids += [char_vocab.to_index(PAD_TAG)] * (max_chars - len(char_ids)) self.words_to_chars_embedding[index] = torch.LongTensor(char_ids) - + self.char_vocab = char_vocab - + self.token_embedder = ConvTokenEmbedder( config, self.weight_file, None, char_emb_layer) elmo_model["char_cnn"]['char_emb_layer.weight'] = char_emb_layer.weight self.token_embedder.load_state_dict(elmo_model["char_cnn"]) - + self.output_dim = config['lstm']['projection_dim'] - + # lstm encoder self.encoder = ElmobiLm(config) self.encoder.load_state_dict(elmo_model["lstm"]) - + if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 print("Start to generate cache word representations.") @@ -280,7 +287,7 @@ class _ElmoModel(nn.Module): word_size = self.words_to_chars_embedding.size(0) num_batches = word_size // batch_size + \ int(word_size % batch_size != 0) - + self.cached_word_embedding = nn.Embedding(word_size, config['lstm']['projection_dim']) with torch.no_grad(): @@ -291,12 +298,12 @@ class _ElmoModel(nn.Module): word_reprs = self.token_embedder(words.unsqueeze(1), chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - + print("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: print("There is no need to cache word representations, since no character information is used.") - + def forward(self, words): """ @@ -321,7 +328,7 @@ class _ElmoModel(nn.Module): else: chars = None token_embedding = self.token_embedder(expanded_words, chars) # batch_size x max_len x embed_dim - + encoder_output = self.encoder(token_embedding, seq_len) if encoder_output.size(2) < max_len + 2: num_layers, _, output_len, hidden_size = encoder_output.size() @@ -332,7 +339,7 @@ class _ElmoModel(nn.Module): token_embedding = token_embedding.masked_fill(mask, 0) token_embedding = torch.cat((token_embedding, token_embedding), dim=2).view(1, sz[1], sz[2], sz[3]) encoder_output = torch.cat((token_embedding, encoder_output), dim=0) - + # 删除, . 这里没有精确地删除,但应该也不会影响最后的结果了。 encoder_output = encoder_output[:, :, 1:-1] return encoder_output diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 8b746c0d..7ac841ce 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -3,6 +3,10 @@ """ +__all__ = [ + "Embedding", + "TokenEmbedding" +] import torch.nn as nn from abc import abstractmethod @@ -33,11 +37,11 @@ class Embedding(nn.Module): :param float dropout: 对Embedding的输出的dropout。 :param int unk_index: drop word时替换为的index。fastNLP的Vocabulary的unk_index默认为1。 """ - + def __init__(self, init_embed, word_dropout=0, dropout=0.0, unk_index=None): - + super(Embedding, self).__init__() - + self.embed = get_embeddings(init_embed) self.dropout = nn.Dropout(dropout) @@ -48,44 +52,44 @@ class Embedding(nn.Module): self._embed_size = self.embed.embedding_dim else: self._embed_size = self.embed.weight.size(1) - if word_dropout>0 and not isinstance(unk_index, int): + if word_dropout > 0 and not isinstance(unk_index, int): raise ValueError("When drop word is set, you need to pass in the unk_index.") else: self._embed_size = self.embed.embed_size unk_index = self.embed.get_word_vocab().unknown_idx self.unk_index = unk_index self.word_dropout = word_dropout - + def forward(self, words): """ :param torch.LongTensor words: [batch, seq_len] :return: torch.Tensor : [batch, seq_len, embed_dim] """ - if self.word_dropout>0 and self.training: + if self.word_dropout > 0 and self.training: mask = torch.ones_like(words).float() * self.word_dropout mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self.unk_index) words = self.embed(words) return self.dropout(words) - + @property - def num_embedding(self)->int: + def num_embedding(self) -> int: if isinstance(self.embed, nn.Embedding): return self.embed.weight.size(0) else: return self.embed.num_embedding - + def __len__(self): return len(self.embed) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def requires_grad(self): """ @@ -96,14 +100,14 @@ class Embedding(nn.Module): return self.embed.weight.requires_grad else: return self.embed.requires_grad - + @requires_grad.setter def requires_grad(self, value): if not isinstance(self.embed, TokenEmbedding): self.embed.weight.requires_grad = value else: self.embed.requires_grad = value - + @property def size(self): if isinstance(self.embed, TokenEmbedding): @@ -120,12 +124,12 @@ class TokenEmbedding(nn.Module): assert vocab.padding is not None, "Vocabulary must have a padding entry." 
self._word_vocab = vocab self._word_pad_index = vocab.padding_idx - if word_dropout>0: + if word_dropout > 0: assert vocab.unknown is not None, "Vocabulary must have unknown entry when you want to drop a word." self.word_dropout = word_dropout self._word_unk_index = vocab.unknown_idx self.dropout_layer = nn.Dropout(dropout) - + def drop_word(self, words): """ 按照设定随机将words设置为unknown_index。 @@ -138,7 +142,7 @@ class TokenEmbedding(nn.Module): mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 words = words.masked_fill(mask, self._word_unk_index) return words - + def dropout(self, words): """ 对embedding后的word表示进行drop。 @@ -147,7 +151,7 @@ class TokenEmbedding(nn.Module): :return: """ return self.dropout_layer(words) - + @property def requires_grad(self): """ @@ -159,23 +163,23 @@ class TokenEmbedding(nn.Module): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for param in self.parameters(): param.requires_grad = value - + def __len__(self): return len(self._word_vocab) - + @property def embed_size(self) -> int: return self._embed_size - + @property def embedding_dim(self) -> int: return self._embed_size - + @property def num_embedding(self) -> int: """ @@ -183,7 +187,7 @@ class TokenEmbedding(nn.Module): :return: """ return len(self._word_vocab) - + def get_word_vocab(self): """ 返回embedding的词典。 @@ -191,11 +195,11 @@ class TokenEmbedding(nn.Module): :return: Vocabulary """ return self._word_vocab - + @property def size(self): return torch.Size(self.num_embedding, self._embed_size) - + @abstractmethod def forward(self, words): raise NotImplementedError diff --git a/fastNLP/embeddings/stack_embedding.py b/fastNLP/embeddings/stack_embedding.py index d3ce462b..14781945 100644 --- a/fastNLP/embeddings/stack_embedding.py +++ b/fastNLP/embeddings/stack_embedding.py @@ -1,3 +1,12 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "StackEmbedding", +] + from typing import List import torch @@ -26,6 +35,7 @@ class StackEmbedding(TokenEmbedding): :param float dropout: 以多大的概率对embedding的表示进行Dropout。0.1即随机将10%的值置为0。 """ + def __init__(self, embeds: List[TokenEmbedding], word_dropout=0, dropout=0): vocabs = [] for embed in embeds: @@ -34,14 +44,14 @@ class StackEmbedding(TokenEmbedding): _vocab = vocabs[0] for vocab in vocabs[1:]: assert vocab == _vocab, "All embeddings in StackEmbedding should use the same word vocabulary." - + super(StackEmbedding, self).__init__(_vocab, word_dropout=word_dropout, dropout=dropout) assert isinstance(embeds, list) for embed in embeds: assert isinstance(embed, TokenEmbedding), "Only TokenEmbedding type is supported." 
self.embeds = nn.ModuleList(embeds) self._embed_size = sum([embed.embed_size for embed in self.embeds]) - + def append(self, embed: TokenEmbedding): """ 添加一个embedding到结尾。 @@ -50,18 +60,18 @@ class StackEmbedding(TokenEmbedding): """ assert isinstance(embed, TokenEmbedding) self.embeds.append(embed) - + def pop(self): """ 弹出最后一个embed :return: """ return self.embeds.pop() - + @property def embed_size(self): return self._embed_size - + @property def requires_grad(self): """ @@ -73,12 +83,12 @@ class StackEmbedding(TokenEmbedding): return requires_grads.pop() else: return None - + @requires_grad.setter def requires_grad(self, value): for embed in self.embeds(): embed.requires_grad = value - + def forward(self, words): """ 得到多个embedding的结果,并把结果按照顺序concat起来。 diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index a75ad18f..1c66e52b 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -1,4 +1,11 @@ +""" +.. todo:: + doc +""" +__all__ = [ + "StaticEmbedding" +] import os import torch @@ -13,6 +20,7 @@ from ..modules.utils import _get_file_name_base_on_postfix from copy import deepcopy from collections import defaultdict + class StaticEmbedding(TokenEmbedding): """ 别名::class:`fastNLP.embeddings.StaticEmbedding` :class:`fastNLP.embeddings.static_embedding.StaticEmbedding` @@ -55,15 +63,16 @@ class StaticEmbedding(TokenEmbedding): :param bool normalize: 是否对vector进行normalize,使得每个vector的norm为1。 :param int min_freq: Vocabulary词频数小于这个数量的word将被指向unk。 """ - def __init__(self, vocab: Vocabulary, model_dir_or_name: str='en', embedding_dim=-1, requires_grad: bool=True, + + def __init__(self, vocab: Vocabulary, model_dir_or_name: str = 'en', embedding_dim=-1, requires_grad: bool = True, init_method=None, lower=False, dropout=0, word_dropout=0, normalize=False, min_freq=1, **kwargs): super(StaticEmbedding, self).__init__(vocab, word_dropout=word_dropout, dropout=dropout) - if embedding_dim>0: + if embedding_dim > 0: model_dir_or_name = None - + # 得到cache_path if model_dir_or_name is None: - assert embedding_dim>=1, "The dimension of embedding should be larger than 1." + assert embedding_dim >= 1, "The dimension of embedding should be larger than 1." 
embedding_dim = int(embedding_dim) model_path = None elif model_dir_or_name.lower() in PRETRAIN_STATIC_FILES: @@ -76,9 +85,9 @@ class StaticEmbedding(TokenEmbedding): model_path = _get_file_name_base_on_postfix(os.path.abspath(os.path.expanduser(model_dir_or_name)), '.txt') else: raise ValueError(f"Cannot recognize {model_dir_or_name}.") - + # 根据min_freq缩小vocab - truncate_vocab = (vocab.min_freq is None and min_freq>1) or (vocab.min_freq and vocab.min_freq 1) or (vocab.min_freq and vocab.min_freq < min_freq) if truncate_vocab: truncated_vocab = deepcopy(vocab) truncated_vocab.min_freq = min_freq @@ -89,14 +98,14 @@ class StaticEmbedding(TokenEmbedding): lowered_word_count[word.lower()] += count for word in truncated_vocab.word_count.keys(): word_count = truncated_vocab.word_count[word] - if lowered_word_count[word.lower()]>=min_freq and word_count= min_freq and word_count < min_freq: + truncated_vocab.add_word_lst([word] * (min_freq - word_count), no_create_entry=truncated_vocab._is_word_no_create_entry(word)) - + # 只限制在train里面的词语使用min_freq筛选 if kwargs.get('only_train_min_freq', False) and model_dir_or_name is not None: for word in truncated_vocab.word_count.keys(): - if truncated_vocab._is_word_no_create_entry(word) and truncated_vocab.word_count[word] Date: Sun, 25 Aug 2019 16:57:47 +0800 Subject: [PATCH 03/19] delete predictor.py --- fastNLP/core/predictor.py | 79 ------------------------------------- test/core/test_predictor.py | 48 ---------------------- 2 files changed, 127 deletions(-) delete mode 100644 fastNLP/core/predictor.py delete mode 100644 test/core/test_predictor.py diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py deleted file mode 100644 index 2d6a7380..00000000 --- a/fastNLP/core/predictor.py +++ /dev/null @@ -1,79 +0,0 @@ -""" - ..todo:: - 检查这个类是否需要 -""" -from collections import defaultdict - -import torch - -from . import DataSetIter -from . import DataSet -from . import SequentialSampler -from .utils import _build_args, _move_dict_value_to_device, _get_model_device - - -class Predictor(object): - """ - 一个根据训练模型预测输出的预测器(Predictor) - - 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 - 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 - - :param torch.nn.Module network: 用来完成预测任务的模型 - """ - - def __init__(self, network): - if not isinstance(network, torch.nn.Module): - raise ValueError( - "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) - self.network = network - self.batch_size = 1 - self.batch_output = [] - - def predict(self, data: DataSet, seq_len_field_name=None): - """用已经训练好的模型进行inference. 
- - :param fastNLP.DataSet data: 待预测的数据集 - :param str seq_len_field_name: 表示序列长度信息的field名字 - :return: dict dict里面的内容为模型预测的结果 - """ - if not isinstance(data, DataSet): - raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) - if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: - raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) - - prev_training = self.network.training - self.network.eval() - network_device = _get_model_device(self.network) - batch_output = defaultdict(list) - data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) - - if hasattr(self.network, "predict"): - predict_func = self.network.predict - else: - predict_func = self.network.forward - - with torch.no_grad(): - for batch_x, _ in data_iterator: - _move_dict_value_to_device(batch_x, _, device=network_device) - refined_batch_x = _build_args(predict_func, **batch_x) - prediction = predict_func(**refined_batch_x) - - if seq_len_field_name is not None: - seq_lens = batch_x[seq_len_field_name].tolist() - - for key, value in prediction.items(): - value = value.cpu().numpy() - if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): - batch_output[key].extend(value.tolist()) - else: - if seq_len_field_name is not None: - tmp_batch = [] - for idx, seq_len in enumerate(seq_lens): - tmp_batch.append(value[idx, :seq_len]) - batch_output[key].extend(tmp_batch) - else: - batch_output[key].append(value) - - self.network.train(prev_training) - return batch_output diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py deleted file mode 100644 index 701353dc..00000000 --- a/test/core/test_predictor.py +++ /dev/null @@ -1,48 +0,0 @@ -import unittest -from collections import defaultdict - -import numpy as np -import torch - -from fastNLP.core.dataset import DataSet -from fastNLP.core.instance import Instance -from fastNLP.core.predictor import Predictor - - -def prepare_fake_dataset(): - mean = np.array([-3, -3]) - cov = np.array([[1, 0], [0, 1]]) - class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) - - mean = np.array([3, 3]) - cov = np.array([[1, 0], [0, 1]]) - class_B = np.random.multivariate_normal(mean, cov, size=(1000,)) - - data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + - [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) - return data_set - - -class LinearModel(torch.nn.Module): - def __init__(self): - super(LinearModel, self).__init__() - self.linear = torch.nn.Linear(2, 1) - - def forward(self, x): - return {"predict": self.linear(x)} - - -class TestPredictor(unittest.TestCase): - def test_simple(self): - model = LinearModel() - predictor = Predictor(model) - data = prepare_fake_dataset() - data.set_input("x") - ans = predictor.predict(data) - self.assertTrue(isinstance(ans, defaultdict)) - self.assertTrue("predict" in ans) - self.assertTrue(isinstance(ans["predict"], list)) - - def test_sequence(self): - # test sequence input/output - pass From 65a6fd3dc721508f40dab11aac1d0ffac9781eee Mon Sep 17 00:00:00 2001 From: xuyige Date: Sun, 25 Aug 2019 17:48:01 +0800 Subject: [PATCH 04/19] Revert "delete predictor.py" This reverts commit 8445bdbc793c69e998efd9381229820ae9a5ba9d. 
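For reference, a minimal usage sketch of the restored Predictor, modeled on the test case in the diff below; the LinearModel class and the "x"/"predict" field names are illustrative only, not part of this patch:

import torch
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.predictor import Predictor

class LinearModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(2, 1)

    def forward(self, x):
        # Predictor expects the model to return a dict of output tensors
        return {"predict": self.linear(x)}

data = DataSet([Instance(x=[0.5, -1.2]), Instance(x=[3.0, 2.5])])
data.set_input("x")                        # input fields are fed to forward()/predict()
ans = Predictor(LinearModel()).predict(data)
print(ans["predict"])                      # defaultdict(list) keyed by the model's output names
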
--- fastNLP/core/predictor.py | 79 +++++++++++++++++++++++++++++++++++++ test/core/test_predictor.py | 48 ++++++++++++++++++++++ 2 files changed, 127 insertions(+) create mode 100644 fastNLP/core/predictor.py create mode 100644 test/core/test_predictor.py diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py new file mode 100644 index 00000000..2d6a7380 --- /dev/null +++ b/fastNLP/core/predictor.py @@ -0,0 +1,79 @@ +""" + ..todo:: + 检查这个类是否需要 +""" +from collections import defaultdict + +import torch + +from . import DataSetIter +from . import DataSet +from . import SequentialSampler +from .utils import _build_args, _move_dict_value_to_device, _get_model_device + + +class Predictor(object): + """ + 一个根据训练模型预测输出的预测器(Predictor) + + 与测试器(Tester)不同的是,predictor不关心模型性能的评价指标,只做inference。 + 这是一个fastNLP调用的高级模型包装器。它与Trainer、Tester不共享任何操作。 + + :param torch.nn.Module network: 用来完成预测任务的模型 + """ + + def __init__(self, network): + if not isinstance(network, torch.nn.Module): + raise ValueError( + "Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network))) + self.network = network + self.batch_size = 1 + self.batch_output = [] + + def predict(self, data: DataSet, seq_len_field_name=None): + """用已经训练好的模型进行inference. + + :param fastNLP.DataSet data: 待预测的数据集 + :param str seq_len_field_name: 表示序列长度信息的field名字 + :return: dict dict里面的内容为模型预测的结果 + """ + if not isinstance(data, DataSet): + raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) + if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: + raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) + + prev_training = self.network.training + self.network.eval() + network_device = _get_model_device(self.network) + batch_output = defaultdict(list) + data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) + + if hasattr(self.network, "predict"): + predict_func = self.network.predict + else: + predict_func = self.network.forward + + with torch.no_grad(): + for batch_x, _ in data_iterator: + _move_dict_value_to_device(batch_x, _, device=network_device) + refined_batch_x = _build_args(predict_func, **batch_x) + prediction = predict_func(**refined_batch_x) + + if seq_len_field_name is not None: + seq_lens = batch_x[seq_len_field_name].tolist() + + for key, value in prediction.items(): + value = value.cpu().numpy() + if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): + batch_output[key].extend(value.tolist()) + else: + if seq_len_field_name is not None: + tmp_batch = [] + for idx, seq_len in enumerate(seq_lens): + tmp_batch.append(value[idx, :seq_len]) + batch_output[key].extend(tmp_batch) + else: + batch_output[key].append(value) + + self.network.train(prev_training) + return batch_output diff --git a/test/core/test_predictor.py b/test/core/test_predictor.py new file mode 100644 index 00000000..701353dc --- /dev/null +++ b/test/core/test_predictor.py @@ -0,0 +1,48 @@ +import unittest +from collections import defaultdict + +import numpy as np +import torch + +from fastNLP.core.dataset import DataSet +from fastNLP.core.instance import Instance +from fastNLP.core.predictor import Predictor + + +def prepare_fake_dataset(): + mean = np.array([-3, -3]) + cov = np.array([[1, 0], [0, 1]]) + class_A = np.random.multivariate_normal(mean, cov, size=(1000,)) + + mean = np.array([3, 3]) + cov = np.array([[1, 0], [0, 1]]) + class_B = np.random.multivariate_normal(mean, 
cov, size=(1000,)) + + data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] + + [Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B]) + return data_set + + +class LinearModel(torch.nn.Module): + def __init__(self): + super(LinearModel, self).__init__() + self.linear = torch.nn.Linear(2, 1) + + def forward(self, x): + return {"predict": self.linear(x)} + + +class TestPredictor(unittest.TestCase): + def test_simple(self): + model = LinearModel() + predictor = Predictor(model) + data = prepare_fake_dataset() + data.set_input("x") + ans = predictor.predict(data) + self.assertTrue(isinstance(ans, defaultdict)) + self.assertTrue("predict" in ans) + self.assertTrue(isinstance(ans["predict"], list)) + + def test_sequence(self): + # test sequence input/output + pass From 74934271dc77e53a3deb0e7efc85f401f5d1f349 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Sun, 25 Aug 2019 18:20:58 +0800 Subject: [PATCH 05/19] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0sequence=20labelling?= =?UTF-8?q?=E4=B8=ADbert=20ner;=202.=E5=B0=86print=E6=9B=BF=E6=8D=A2?= =?UTF-8?q?=E4=B8=BAlogger?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/batch.py | 4 +- fastNLP/core/callback.py | 18 +-- fastNLP/core/dataset.py | 11 +- fastNLP/core/dist_trainer.py | 1 - fastNLP/core/field.py | 19 +-- fastNLP/core/tester.py | 2 +- fastNLP/core/utils.py | 4 +- fastNLP/core/vocabulary.py | 7 +- fastNLP/embeddings/bert_embedding.py | 16 +- fastNLP/embeddings/char_embedding.py | 9 +- fastNLP/embeddings/contextual_embedding.py | 10 +- fastNLP/embeddings/elmo_embedding.py | 10 +- fastNLP/embeddings/embedding.py | 4 +- fastNLP/embeddings/static_embedding.py | 9 +- fastNLP/io/embed_loader.py | 8 +- fastNLP/io/file_reader.py | 9 +- fastNLP/io/file_utils.py | 13 +- fastNLP/io/pipe/classification.py | 2 +- fastNLP/io/utils.py | 6 +- fastNLP/modules/encoder/bert.py | 15 +- .../ner/data/Conll2003Loader.py | 93 ----------- .../ner/data/OntoNoteLoader.py | 152 ------------------ .../seqence_labelling/ner/data/utils.py | 49 ------ .../seqence_labelling/ner/model/bert_crf.py | 31 ++++ .../seqence_labelling/ner/test/__init__.py | 0 .../seqence_labelling/ner/test/test.py | 33 ---- .../seqence_labelling/ner/train_bert.py | 52 ++++++ .../seqence_labelling/ner/train_idcnn.py | 22 +-- 28 files changed, 182 insertions(+), 427 deletions(-) delete mode 100644 reproduction/seqence_labelling/ner/data/Conll2003Loader.py delete mode 100644 reproduction/seqence_labelling/ner/data/OntoNoteLoader.py delete mode 100644 reproduction/seqence_labelling/ner/data/utils.py create mode 100644 reproduction/seqence_labelling/ner/model/bert_crf.py delete mode 100644 reproduction/seqence_labelling/ner/test/__init__.py delete mode 100644 reproduction/seqence_labelling/ner/test/test.py create mode 100644 reproduction/seqence_labelling/ner/train_bert.py diff --git a/fastNLP/core/batch.py b/fastNLP/core/batch.py index 8d97783e..ff710b30 100644 --- a/fastNLP/core/batch.py +++ b/fastNLP/core/batch.py @@ -17,7 +17,7 @@ from numbers import Number from .sampler import SequentialSampler from .dataset import DataSet - +from ._logger import logger _python_is_exit = False @@ -75,7 +75,7 @@ class DataSetGetter: try: data, flag = _to_tensor(data, f.dtype) except TypeError as e: - print(f"Field {n} cannot be converted to torch.tensor.") + logger.error(f"Field {n} cannot be converted to torch.tensor.") raise e batch_dict[n] = data return batch_dict diff --git a/fastNLP/core/callback.py 
b/fastNLP/core/callback.py index 24b42b6e..2c130061 100644 --- a/fastNLP/core/callback.py +++ b/fastNLP/core/callback.py @@ -83,7 +83,6 @@ try: except: tensorboardX_flag = False -from ..io.model_io import ModelSaver, ModelLoader from .dataset import DataSet from .tester import Tester from ._logger import logger @@ -505,7 +504,7 @@ class EarlyStopCallback(Callback): def on_exception(self, exception): if isinstance(exception, EarlyStopError): - print("Early Stopping triggered in epoch {}!".format(self.epoch)) + logger.info("Early Stopping triggered in epoch {}!".format(self.epoch)) else: raise exception # 抛出陌生Error @@ -752,8 +751,7 @@ class LRFinder(Callback): self.smooth_value = SmoothValue(0.8) self.opt = None self.find = None - self.loader = ModelLoader() - + @property def lr_gen(self): scale = (self.end_lr - self.start_lr) / self.batch_per_epoch @@ -768,7 +766,7 @@ class LRFinder(Callback): self.opt = self.trainer.optimizer # pytorch optimizer self.opt.param_groups[0]["lr"] = self.start_lr # save model - ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True) + torch.save(self.model.state_dict(), 'tmp') self.find = True def on_backward_begin(self, loss): @@ -797,7 +795,9 @@ class LRFinder(Callback): self.opt.param_groups[0]["lr"] = self.best_lr self.find = False # reset model - ModelLoader().load_pytorch(self.trainer.model, "tmp") + states = torch.load('tmp') + self.model.load_state_dict(states) + os.remove('tmp') self.pbar.write("Model reset. \nFind best lr={}".format(self.best_lr)) @@ -988,14 +988,14 @@ class SaveModelCallback(Callback): try: _save_model(self.model, model_name=name, save_dir=self.save_dir, only_param=self.only_param) except Exception as e: - print(f"The following exception:{e} happens when save model to {self.save_dir}.") + logger.error(f"The following exception:{e} happens when save model to {self.save_dir}.") if delete_pair: try: delete_model_path = os.path.join(self.save_dir, delete_pair[1]) if os.path.exists(delete_model_path): os.remove(delete_model_path) except Exception as e: - print(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") + logger.error(f"Fail to delete model {name} at {self.save_dir} caused by exception:{e}.") def on_exception(self, exception): if self.save_on_exception: @@ -1032,7 +1032,7 @@ class EchoCallback(Callback): def __getattribute__(self, item): if item.startswith('on_'): - print('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), + logger.info('{}.{} has been called at pid: {}'.format(self.name, item, os.getpid()), file=self.out) return super(EchoCallback, self).__getattribute__(item) diff --git a/fastNLP/core/dataset.py b/fastNLP/core/dataset.py index 4c689842..51bcef43 100644 --- a/fastNLP/core/dataset.py +++ b/fastNLP/core/dataset.py @@ -300,6 +300,7 @@ from .utils import _get_func_signature from .field import AppendToTargetOrInputException from .field import SetInputOrTargetException from .const import Const +from ._logger import logger class DataSet(object): """ @@ -452,7 +453,7 @@ class DataSet(object): try: self.field_arrays[name].append(field) except AppendToTargetOrInputException as e: - print(f"Cannot append to field:{name}.") + logger.error(f"Cannot append to field:{name}.") raise e def add_fieldarray(self, field_name, fieldarray): @@ -609,7 +610,7 @@ class DataSet(object): self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_target = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} 
as target.") + logger.error(f"Cannot set field:{name} as target.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -633,7 +634,7 @@ class DataSet(object): self.field_arrays[name]._use_1st_ins_infer_dim_type = bool(use_1st_ins_infer_dim_type) self.field_arrays[name].is_input = flag except SetInputOrTargetException as e: - print(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") + logger.error(f"Cannot set field:{name} as input, exception happens at the {e.index} value.") raise e else: raise KeyError("{} is not a valid field name.".format(name)) @@ -728,7 +729,7 @@ class DataSet(object): results.append(func(ins[field_name])) except Exception as e: if idx != -1: - print("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) + logger.error("Exception happens at the `{}`th(from 1) instance.".format(idx+1)) raise e if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None raise ValueError("{} always return None.".format(_get_func_signature(func=func))) @@ -795,7 +796,7 @@ class DataSet(object): results.append(func(ins)) except BaseException as e: if idx != -1: - print("Exception happens at the `{}`th instance.".format(idx)) + logger.error("Exception happens at the `{}`th instance.".format(idx)) raise e # results = [func(ins) for ins in self._inner_iter()] diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 346539cd..7c64fee4 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -54,7 +54,6 @@ class DistTrainer(): num_workers=1, drop_last=False, dev_data=None, metrics=None, metric_key=None, update_every=1, print_every=10, validate_every=-1, - log_path=None, save_every=-1, save_path=None, device='auto', fp16='', backend=None, init_method=None): diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index 26d22ada..b3f024f8 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -12,6 +12,7 @@ from abc import abstractmethod from copy import deepcopy from collections import Counter from .utils import _is_iterable +from ._logger import logger class SetInputOrTargetException(Exception): @@ -39,7 +40,7 @@ class FieldArray: try: _content = list(_content) except BaseException as e: - print(f"Cannot convert content(of type:{type(content)}) into list.") + logger.error(f"Cannot convert content(of type:{type(content)}) into list.") raise e self.name = name self.content = _content @@ -263,7 +264,7 @@ class FieldArray: try: new_contents.append(cell.split(sep)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -283,8 +284,8 @@ class FieldArray: else: new_contents.append(int(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") - print(e) + logger.error(f"Exception happens when process value in index {index}.") + raise e return self._after_process(new_contents, inplace=inplace) def float(self, inplace=True): @@ -303,7 +304,7 @@ class FieldArray: else: new_contents.append(float(cell)) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -323,7 +324,7 @@ class FieldArray: else: new_contents.append(bool(cell)) except Exception as e: - 
print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -344,7 +345,7 @@ class FieldArray: else: new_contents.append(cell.lower()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -364,7 +365,7 @@ class FieldArray: else: new_contents.append(cell.upper()) except Exception as e: - print(f"Exception happens when process value in index {index}.") + logger.error(f"Exception happens when process value in index {index}.") raise e return self._after_process(new_contents, inplace=inplace) @@ -401,7 +402,7 @@ class FieldArray: self.is_input = self.is_input self.is_target = self.is_input except SetInputOrTargetException as e: - print("The newly generated field cannot be set as input or target.") + logger.error("The newly generated field cannot be set as input or target.") raise e return self else: diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py index b339f671..e549df81 100644 --- a/fastNLP/core/tester.py +++ b/fastNLP/core/tester.py @@ -192,7 +192,7 @@ class Tester(object): dataset=self.data, check_level=0) if self.verbose >= 1: - print("[tester] \n{}".format(self._format_eval_results(eval_results))) + logger.info("[tester] \n{}".format(self._format_eval_results(eval_results))) self._mode(network, is_test=False) return eval_results diff --git a/fastNLP/core/utils.py b/fastNLP/core/utils.py index a023c29e..fcb2a07b 100644 --- a/fastNLP/core/utils.py +++ b/fastNLP/core/utils.py @@ -145,7 +145,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): with open(cache_filepath, 'rb') as f: results = _pickle.load(f) if verbose == 1: - print("Read cache from {}.".format(cache_filepath)) + logger.info("Read cache from {}.".format(cache_filepath)) refresh_flag = False if refresh_flag: @@ -156,7 +156,7 @@ def cache_results(_cache_fp, _refresh=False, _verbose=1): _prepare_cache_filepath(cache_filepath) with open(cache_filepath, 'wb') as f: _pickle.dump(results, f) - print("Save cache to {}.".format(cache_filepath)) + logger.info("Save cache to {}.".format(cache_filepath)) return results diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 330d73dd..92f54f9a 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -10,6 +10,7 @@ from .utils import Option from functools import partial import numpy as np from .utils import _is_iterable +from ._logger import logger class VocabularyOption(Option): def __init__(self, @@ -49,7 +50,7 @@ def _check_build_status(func): if self.rebuild is False: self.rebuild = True if self.max_size is not None and len(self.word_count) >= self.max_size: - print("[Warning] Vocabulary has reached the max size {} when calling {} method. " + logger.warning("Vocabulary has reached the max size {} when calling {} method. " "Adding more words may cause unexpected behaviour of Vocabulary. ".format( self.max_size, func.__name__)) return func(self, *args, **kwargs) @@ -297,7 +298,7 @@ class Vocabulary(object): for f_n, n_f_n in zip(field_name, new_field_name): dataset.apply_field(index_instance, field_name=f_n, new_field_name=n_f_n) except Exception as e: - print("When processing the `{}` dataset, the following error occurred.".format(idx)) + logger.error("When processing the `{}` dataset, the following error occurred.".format(idx)) raise e else: raise RuntimeError("Only DataSet type is allowed.") @@ -353,7 +354,7 @@ class Vocabulary(object): try: dataset.apply(construct_vocab) except BaseException as e: - print("When processing the `{}` dataset, the following error occurred:".format(idx)) + logger.error("When processing the `{}` dataset, the following error occurred:".format(idx)) raise e else: raise TypeError("Only DataSet type is allowed.") diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index e8844aa1..4bd06ec3 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -21,6 +21,7 @@ from ..io.file_utils import _get_embedding_url, cached_path, PRETRAINED_BERT_MOD from ..modules.encoder.bert import _WordPieceBertModel, BertModel, BertTokenizer from .contextual_embedding import ContextualEmbedding import warnings +from ..core import logger class BertEmbedding(ContextualEmbedding): @@ -125,8 +126,10 @@ class BertEmbedding(ContextualEmbedding): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._word_sep_index) - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(0) + mask = pad_mask.__and__(mask) # pad的位置不为unk words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._word_sep_index) @@ -182,6 +185,7 @@ class BertWordPieceEncoder(nn.Module): self.model = _WordPieceBertModel(model_dir=model_dir, layers=layers, pooled_cls=pooled_cls) self._sep_index = self.model._sep_index + self._wordpiece_pad_index = self.model._wordpiece_pad_index self._wordpiece_unk_index = self.model._wordpiece_unknown_index self._embed_size = len(self.model.layers) * self.model.encoder.hidden_size self.requires_grad = requires_grad @@ -263,8 +267,10 @@ class BertWordPieceEncoder(nn.Module): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._wordpiece_unk_index) - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._wordpiece_pad_index) + mask = pad_mask.__and__(mask) # pad的位置不为unk words = words.masked_fill(mask, self._word_unk_index) if self._word_sep_index: words.masked_fill_(sep_mask, self._wordpiece_unk_index) @@ -297,7 +303,7 @@ class _WordBertModel(nn.Module): self.auto_truncate = auto_truncate # 将所有vocab中word的wordpiece计算出来, 需要额外考虑[CLS]和[SEP] - print("Start to generating word pieces for word.") + logger.info("Start to generate word pieces for words.") # 第一步统计出需要的word_piece, 然后创建新的embed和word_piece_vocab, 然后填入值 word_piece_dict = {'[CLS]': 1, '[SEP]': 1} # 用到的word_piece以及新增的 found_count = 0 @@ -356,10 +362,10 @@ class _WordBertModel(nn.Module): self._sep_index = self.tokenzier.vocab['[SEP]'] self._word_pad_index = vocab.padding_idx self._wordpiece_pad_index = self.tokenzier.vocab['[PAD]'] # 需要用于生成word_piece - 
print("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) + logger.info("Found(Or segment into word pieces) {} words out of {}.".format(found_count, len(vocab))) self.word_to_wordpieces = np.array(word_to_wordpieces) self.word_pieces_lengths = nn.Parameter(torch.LongTensor(word_pieces_lengths), requires_grad=False) - print("Successfully generate word pieces.") + logger.debug("Successfully generate word pieces.") def forward(self, words): """ diff --git a/fastNLP/embeddings/char_embedding.py b/fastNLP/embeddings/char_embedding.py index 24c84314..acffa054 100644 --- a/fastNLP/embeddings/char_embedding.py +++ b/fastNLP/embeddings/char_embedding.py @@ -19,6 +19,7 @@ from ..core.vocabulary import Vocabulary from .embedding import TokenEmbedding from .utils import _construct_char_vocab_from_vocab from .utils import get_embeddings +from ..core import logger class CNNCharEmbedding(TokenEmbedding): @@ -81,11 +82,11 @@ class CNNCharEmbedding(TokenEmbedding): raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - print("Start constructing character vocabulary.") + logger.info("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index max_word_len = max(map(lambda x: len(x[0]), vocab)) self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), max_word_len), @@ -236,11 +237,11 @@ class LSTMCharEmbedding(TokenEmbedding): raise Exception( "Undefined activation function: choose from: [relu, tanh, sigmoid, or a callable function]") - print("Start constructing character vocabulary.") + logger.info("Start constructing character vocabulary.") # 建立char的词表 self.char_vocab = _construct_char_vocab_from_vocab(vocab, min_freq=min_char_freq) self.char_pad_index = self.char_vocab.padding_idx - print(f"In total, there are {len(self.char_vocab)} distinct characters.") + logger.info(f"In total, there are {len(self.char_vocab)} distinct characters.") # 对vocab进行index self.max_word_len = max(map(lambda x: len(x[0]), vocab)) self.words_to_chars_embedding = nn.Parameter(torch.full((len(vocab), self.max_word_len), diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 2a1e2f82..2c304da7 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -16,7 +16,7 @@ from ..core.batch import DataSetIter from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device from .embedding import TokenEmbedding - +from ..core import logger class ContextualEmbedding(TokenEmbedding): def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): @@ -37,14 +37,14 @@ class ContextualEmbedding(TokenEmbedding): assert isinstance(dataset, DataSet), "Only fastNLP.DataSet object is allowed." assert 'words' in dataset.get_input_name(), "`words` field has to be set as input." 
except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e sent_embeds = {} _move_model_to_device(self, device=device) device = _get_model_device(self) pad_index = self._word_vocab.padding_idx - print("Start to calculate sentence representations.") + logger.info("Start to calculate sentence representations.") with torch.no_grad(): for index, dataset in enumerate(datasets): try: @@ -64,9 +64,9 @@ class ContextualEmbedding(TokenEmbedding): else: sent_embeds[tuple(words_list[b][:seq_len[b]])] = word_embeds[b, :-length] except Exception as e: - print(f"Exception happens at {index} dataset.") + logger.error(f"Exception happens at {index} dataset.") raise e - print("Finish calculating sentence representations.") + logger.info("Finish calculating sentence representations.") self.sent_embeds = sent_embeds if delete_weights: self._delete_model_weights() diff --git a/fastNLP/embeddings/elmo_embedding.py b/fastNLP/embeddings/elmo_embedding.py index fb5388fd..3df424a2 100644 --- a/fastNLP/embeddings/elmo_embedding.py +++ b/fastNLP/embeddings/elmo_embedding.py @@ -18,7 +18,7 @@ from ..core.vocabulary import Vocabulary from ..io.file_utils import cached_path, _get_embedding_url, PRETRAINED_ELMO_MODEL_DIR from ..modules.encoder._elmo import ElmobiLm, ConvTokenEmbedder from .contextual_embedding import ContextualEmbedding - +from ..core import logger class ElmoEmbedding(ContextualEmbedding): """ @@ -243,7 +243,7 @@ class _ElmoModel(nn.Module): index_in_pre = char_lexicon[OOV_TAG] char_emb_layer.weight.data[index] = char_embed_weights[index_in_pre] - print(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") + logger.info(f"{found_char_count} out of {len(char_vocab)} characters were found in pretrained elmo embedding.") # 生成words到chars的映射 max_chars = config['char_cnn']['max_characters_per_token'] @@ -281,7 +281,7 @@ class _ElmoModel(nn.Module): if cache_word_reprs: if config['char_cnn']['embedding']['dim'] > 0: # 只有在使用了chars的情况下有用 - print("Start to generate cache word representations.") + logger.info("Start to generate cache word representations.") batch_size = 320 # bos eos word_size = self.words_to_chars_embedding.size(0) @@ -299,10 +299,10 @@ class _ElmoModel(nn.Module): chars).detach() # batch_size x 1 x config['encoder']['projection_dim'] self.cached_word_embedding.weight.data[words] = word_reprs.squeeze(1) - print("Finish generating cached word representations. Going to delete the character encoder.") + logger.info("Finish generating cached word representations. 
Going to delete the character encoder.") del self.token_embedder, self.words_to_chars_embedding else: - print("There is no need to cache word representations, since no character information is used.") + logger.info("There is no need to cache word representations, since no character information is used.") def forward(self, words): """ diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index 7ac841ce..a94985c1 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -138,8 +138,10 @@ class TokenEmbedding(nn.Module): :return: """ if self.word_dropout > 0 and self.training: - mask = torch.ones_like(words).float() * self.word_dropout + mask = torch.full_like(words, fill_value=self.word_dropout) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 + pad_mask = words.ne(self._word_pad_index) + mask = mask.__and__(pad_mask) words = words.masked_fill(mask, self._word_unk_index) return words diff --git a/fastNLP/embeddings/static_embedding.py b/fastNLP/embeddings/static_embedding.py index 1c66e52b..98986565 100644 --- a/fastNLP/embeddings/static_embedding.py +++ b/fastNLP/embeddings/static_embedding.py @@ -19,6 +19,7 @@ from .embedding import TokenEmbedding from ..modules.utils import _get_file_name_base_on_postfix from copy import deepcopy from collections import defaultdict +from ..core import logger class StaticEmbedding(TokenEmbedding): @@ -112,7 +113,7 @@ class StaticEmbedding(TokenEmbedding): truncated_words_to_words = torch.arange(len(vocab)).long() for word, index in vocab: truncated_words_to_words[index] = truncated_vocab.to_index(word) - print(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") + logger.info(f"{len(vocab) - len(truncated_vocab)} out of {len(vocab)} words have frequency less than {min_freq}.") vocab = truncated_vocab self.only_norm_found_vector = kwargs.get('only_norm_found_vector', False) @@ -124,7 +125,7 @@ class StaticEmbedding(TokenEmbedding): lowered_vocab.add_word(word.lower(), no_create_entry=True) else: lowered_vocab.add_word(word.lower()) # 先加入需要创建entry的 - print(f"All word in the vocab have been lowered. There are {len(vocab)} words, {len(lowered_vocab)} " + logger.info(f"All word in the vocab have been lowered. 
There are {len(vocab)} words, {len(lowered_vocab)} " f"unique lowered words.") if model_path: embedding = self._load_with_vocab(model_path, vocab=lowered_vocab, init_method=init_method) @@ -265,9 +266,9 @@ class StaticEmbedding(TokenEmbedding): if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logger.error("Error occurred at the {} line.".format(idx)) raise e - print("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) + logger.info("Found {} out of {} words in the pre-training embedding.".format(found_count, len(vocab))) for word, index in vocab: if index not in matrix and not vocab._is_word_no_create_entry(word): if found_unknown: # 如果有unkonwn,用unknown初始化 diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index 48048983..c58385e1 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -11,7 +11,7 @@ import numpy as np from ..core.vocabulary import Vocabulary from .data_bundle import BaseLoader from ..core.utils import Option - +import logging class EmbeddingOption(Option): def __init__(self, @@ -91,10 +91,10 @@ class EmbedLoader(BaseLoader): if error == 'ignore': warnings.warn("Error occurred at the {} line.".format(idx)) else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e total_hits = sum(hit_flags) - print("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) + logging.info("Found {} out of {} words in the pre-training embedding.".format(total_hits, len(vocab))) if init_method is None: found_vectors = matrix[hit_flags] if len(found_vectors) != 0: @@ -157,7 +157,7 @@ class EmbedLoader(BaseLoader): warnings.warn("Error occurred at the {} line.".format(idx)) pass else: - print("Error occurred at the {} line.".format(idx)) + logging.error("Error occurred at the {} line.".format(idx)) raise e if dim == -1: raise RuntimeError("{} is an empty file.".format(embed_filepath)) diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 6aa89b80..0320572c 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -2,7 +2,8 @@ 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ import json -import warnings +from ..core import logger + def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): """ @@ -103,9 +104,9 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): yield line_idx, res except Exception as e: if dropna: - warnings.warn('Invalid instance ends at line: {} has been dropped.'.format(line_idx)) + logger.warn('Invalid instance which ends at line: {} has been dropped.'.format(line_idx)) continue - raise ValueError('Invalid instance ends at line: {}'.format(line_idx)) + raise ValueError('Invalid instance which ends at line: {}'.format(line_idx)) elif line.startswith('#'): continue else: @@ -117,5 +118,5 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): except Exception as e: if dropna: return - print('invalid instance ends at line: {}'.format(line_idx)) + logger.error('invalid instance ends at line: {}'.format(line_idx)) raise e diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 5af3c4ff..9dbb515d 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -7,6 +7,7 @@ import tempfile from tqdm import tqdm import shutil from requests import HTTPError +from ..core import logger PRETRAINED_BERT_MODEL_DIR = { 'en': 
'bert-base-cased.zip', @@ -336,7 +337,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: content_length = req.headers.get("Content-Length") total = int(content_length) if content_length is not None else None progress = tqdm(unit="B", total=total, unit_scale=1) - print("%s not found in cache, downloading to %s" % (url, temp_filename)) + logger.info("%s not found in cache, downloading to %s" % (url, temp_filename)) with open(temp_filename, "wb") as temp_file: for chunk in req.iter_content(chunk_size=1024 * 16): @@ -344,12 +345,12 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: progress.update(len(chunk)) temp_file.write(chunk) progress.close() - print(f"Finish download from {url}") + logger.info(f"Finish download from {url}") # 开始解压 if suffix in ('.zip', '.tar.gz', '.gz'): uncompress_temp_dir = tempfile.mkdtemp() - print(f"Start to uncompress file to {uncompress_temp_dir}") + logger.debug(f"Start to uncompress file to {uncompress_temp_dir}") if suffix == '.zip': unzip_file(Path(temp_filename), Path(uncompress_temp_dir)) elif suffix == '.gz': @@ -362,13 +363,13 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: uncompress_temp_dir = os.path.join(uncompress_temp_dir, filenames[0]) cache_path.mkdir(parents=True, exist_ok=True) - print("Finish un-compressing file.") + logger.debug("Finish un-compressing file.") else: uncompress_temp_dir = temp_filename cache_path = str(cache_path) + suffix # 复制到指定的位置 - print(f"Copy file to {cache_path}") + logger.info(f"Copy file to {cache_path}") if os.path.isdir(uncompress_temp_dir): for filename in os.listdir(uncompress_temp_dir): if os.path.isdir(os.path.join(uncompress_temp_dir, filename)): @@ -379,7 +380,7 @@ def get_from_cache(url: str, cache_dir: Path = None) -> Path: shutil.copyfile(uncompress_temp_dir, cache_path) success = True except Exception as e: - print(e) + logger.error(e) raise e finally: if not success: diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index daa17da9..f42d5400 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -11,7 +11,7 @@ from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_insta from .pipe import Pipe import re nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') -from ...core.utils import cache_results + class _CLSPipe(Pipe): """ diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index 76b32b0a..faec2a55 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -2,7 +2,7 @@ import os from typing import Union, Dict from pathlib import Path - +from ..core import logger def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: """ @@ -70,8 +70,8 @@ def get_tokenizer(): import spacy spacy.prefer_gpu() en = spacy.load('en') - print('use spacy tokenizer') + logger.info('use spacy tokenizer') return lambda x: [w.text for w in en.tokenizer(x)] except Exception as e: - print('use raw tokenizer') + logger.error('use raw tokenizer') return lambda x: x.split() diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index ffc43863..b74c4da0 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -17,8 +17,7 @@ import os import torch from torch import nn -import sys - +from ...core import logger from ..utils import _get_file_name_base_on_postfix CONFIG_FILE = 'bert_config.json' @@ -489,10 +488,10 @@ class BertModel(nn.Module): load(model, prefix='' if hasattr(model, 'bert') else 'bert.') if len(missing_keys) > 0: - print("Weights 
of {} not initialized from pretrained model: {}".format( + logger.warn("Weights of {} not initialized from pretrained model: {}".format( model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: - print("Weights from pretrained model not used in {}: {}".format( + logger.warn("Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) return model @@ -799,7 +798,7 @@ class BertTokenizer(object): for token in tokens: ids.append(self.vocab[token]) if len(ids) > self.max_len: - print( + logger.warn( "Token indices sequence length is longer than the specified maximum " " sequence length for this BERT model ({} > {}). Running this" " sequence through BERT will result in indexing errors".format(len(ids), self.max_len) @@ -823,7 +822,7 @@ class BertTokenizer(object): with open(vocab_file, "w", encoding="utf-8") as writer: for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): if index != token_index: - print("Saving vocabulary to {}: vocabulary indices are not consecutive." + logger.warn("Saving vocabulary to {}: vocabulary indices are not consecutive." " Please check that the vocabulary is not corrupted!".format(vocab_file)) index = token_index writer.write(token + u'\n') @@ -837,7 +836,7 @@ class BertTokenizer(object): """ pretrained_model_name_or_path = _get_file_name_base_on_postfix(model_dir, '.txt') - print("loading vocabulary file {}".format(pretrained_model_name_or_path)) + logger.info("loading vocabulary file {}".format(pretrained_model_name_or_path)) max_len = 512 kwargs['max_len'] = min(kwargs.get('max_position_embeddings', int(1e12)), max_len) # Instantiate tokenizer. @@ -901,7 +900,7 @@ class _WordPieceBertModel(nn.Module): is_input=True) dataset.set_pad_val('word_pieces', self._wordpiece_pad_index) except Exception as e: - print(f"Exception happens when processing the {index} dataset.") + logger.error(f"Exception happens when processing the {index} dataset.") raise e def forward(self, word_pieces, token_type_ids=None): diff --git a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py b/reproduction/seqence_labelling/ner/data/Conll2003Loader.py deleted file mode 100644 index 0af4681e..00000000 --- a/reproduction/seqence_labelling/ner/data/Conll2003Loader.py +++ /dev/null @@ -1,93 +0,0 @@ - -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - - -class Conll2003DataLoader(DataSetLoader): - def __init__(self, task:str='ner', encoding_type:str='bioes'): - """ - 加载Conll2003格式的英语语料,该数据集的信息可以在https://www.clips.uantwerpen.be/conll2003/ner/找到。当task为pos - 时,返回的DataSet中target取值于第2列; 当task为chunk时,返回的DataSet中target取值于第3列;当task为ner时,返回 - 的DataSet中target取值于第4列。所有"-DOCSTART- -X- O O"将被忽略,这会导致数据的数量少于很多文献报道的值,但 - 鉴于"-DOCSTART- -X- O O"只是用于文档分割的符号,并不应该作为预测对象,所以我们忽略了数据中的-DOCTSTART-开头的行 - ner与chunk任务读取后的数据的target将为encoding_type类型。pos任务读取后就是pos列的数据。 - - :param task: 指定需要标注任务。可选ner, pos, chunk - """ - assert task in ('ner', 'pos', 'chunk') - index = {'ner':3, 'pos':1, 'chunk':2}[task] - self._loader = ConllLoader(headers=['raw_words', 'target'], indexes=[0, index]) - self._tag_converters = [] - if task in ('ner', 'chunk'): - self._tag_converters = [iob2] - if encoding_type == 'bioes': - 
self._tag_converters.append(iob2bioes) - - def load(self, path: str): - dataset = self._loader.load(path) - def convert_tag_schema(tags): - for converter in self._tag_converters: - tags = converter(tags) - return tags - if self._tag_converters: - dataset.apply_field(convert_tag_schema, field_name=Const.TARGET, new_field_name=Const.TARGET) - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, lower:bool=False): - """ - 读取并处理数据。数据中的'-DOCSTART-'开头的行会被忽略 - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否将所有字母转为小写。 - :return: - """ - # 读取数据 - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(data.datasets['train'], field_name='raw_words', - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - -if __name__ == '__main__': - pass \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py b/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py deleted file mode 100644 index 25c6f29b..00000000 --- a/reproduction/seqence_labelling/ner/data/OntoNoteLoader.py +++ /dev/null @@ -1,152 +0,0 @@ -from fastNLP.core.vocabulary import VocabularyOption -from fastNLP.io.data_bundle import DataSetLoader, DataBundle -from typing import Union, Dict -from fastNLP import DataSet -from fastNLP import Vocabulary -from fastNLP import Const -from reproduction.utils import check_dataloader_paths - -from fastNLP.io import ConllLoader -from reproduction.seqence_labelling.ner.data.utils import iob2bioes, iob2 - -class OntoNoteNERDataLoader(DataSetLoader): - """ - 用于读取处理为Conll格式后的OntoNote数据。将OntoNote数据处理为conll格式的过程可以参考https://github.com/yhcc/OntoNotes-5.0-NER。 - - """ - def __init__(self, encoding_type:str='bioes'): - assert encoding_type in ('bioes', 'bio') - self.encoding_type = encoding_type - if encoding_type=='bioes': - self.encoding_method = iob2bioes - else: - self.encoding_method = iob2 - - def load(self, path:str)->DataSet: - """ - 
给定一个文件路径,读取数据。返回的DataSet包含以下的field - raw_words: List[str] - target: List[str] - - :param path: - :return: - """ - dataset = ConllLoader(headers=['raw_words', 'target'], indexes=[3, 10]).load(path) - def convert_to_bio(tags): - bio_tags = [] - flag = None - for tag in tags: - label = tag.strip("()*") - if '(' in tag: - bio_label = 'B-' + label - flag = label - elif flag: - bio_label = 'I-' + flag - else: - bio_label = 'O' - if ')' in tag: - flag = None - bio_tags.append(bio_label) - return self.encoding_method(bio_tags) - - def convert_word(words): - converted_words = [] - for word in words: - word = word.replace('/.', '.') # 有些结尾的.是/.形式的 - if not word.startswith('-'): - converted_words.append(word) - continue - # 以下是由于这些符号被转义了,再转回来 - tfrs = {'-LRB-':'(', - '-RRB-': ')', - '-LSB-': '[', - '-RSB-': ']', - '-LCB-': '{', - '-RCB-': '}' - } - if word in tfrs: - converted_words.append(tfrs[word]) - else: - converted_words.append(word) - return converted_words - - dataset.apply_field(convert_word, field_name='raw_words', new_field_name='raw_words') - dataset.apply_field(convert_to_bio, field_name='target', new_field_name='target') - - return dataset - - def process(self, paths: Union[str, Dict[str, str]], word_vocab_opt:VocabularyOption=None, - lower:bool=True)->DataBundle: - """ - 读取并处理数据。返回的DataInfo包含以下的内容 - vocabs: - word: Vocabulary - target: Vocabulary - datasets: - train: DataSet - words: List[int], 被设置为input - target: int. label,被同时设置为input和target - seq_len: int. 句子的长度,被同时设置为input和target - raw_words: List[str] - xxx(根据传入的paths可能有所变化) - - :param paths: - :param word_vocab_opt: vocabulary的初始化值 - :param lower: 是否使用小写 - :return: - """ - paths = check_dataloader_paths(paths) - data = DataBundle() - input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] - target_fields = [Const.TARGET, Const.INPUT_LEN] - for name, path in paths.items(): - dataset = self.load(path) - dataset.apply_field(lambda words: words, field_name='raw_words', new_field_name=Const.INPUT) - if lower: - dataset.words.lower() - data.datasets[name] = dataset - - # 对construct vocab - word_vocab = Vocabulary(min_freq=2) if word_vocab_opt is None else Vocabulary(**word_vocab_opt) - word_vocab.from_dataset(data.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name, dataset in data.datasets.items() if name!='train']) - word_vocab.index_dataset(*data.datasets.values(), field_name=Const.INPUT, new_field_name=Const.INPUT) - data.vocabs[Const.INPUT] = word_vocab - - # cap words - cap_word_vocab = Vocabulary() - cap_word_vocab.from_dataset(*data.datasets.values(), field_name='raw_words') - cap_word_vocab.index_dataset(*data.datasets.values(), field_name='raw_words', new_field_name='cap_words') - input_fields.append('cap_words') - data.vocabs['cap_words'] = cap_word_vocab - - # 对target建vocab - target_vocab = Vocabulary(unknown=None, padding=None) - target_vocab.from_dataset(*data.datasets.values(), field_name=Const.TARGET) - target_vocab.index_dataset(*data.datasets.values(), field_name=Const.TARGET) - data.vocabs[Const.TARGET] = target_vocab - - for name, dataset in data.datasets.items(): - dataset.add_seq_len(Const.INPUT, new_field_name=Const.INPUT_LEN) - dataset.set_input(*input_fields) - dataset.set_target(*target_fields) - - return data - - -if __name__ == '__main__': - loader = OntoNoteNERDataLoader() - dataset = loader.load('/hdd/fudanNLP/fastNLP/others/data/v4/english/test.txt') - print(dataset.target.value_count()) - print(dataset[:4]) - - -""" -train 115812 2200752 -development 15680 304684 
-test 12217 230111 - -train 92403 1901772 -valid 13606 279180 -test 10258 204135 -""" \ No newline at end of file diff --git a/reproduction/seqence_labelling/ner/data/utils.py b/reproduction/seqence_labelling/ner/data/utils.py deleted file mode 100644 index 8f7af792..00000000 --- a/reproduction/seqence_labelling/ner/data/utils.py +++ /dev/null @@ -1,49 +0,0 @@ -from typing import List - -def iob2(tags:List[str])->List[str]: - """ - 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。 - - :param tags: 需要转换的tags - """ - for i, tag in enumerate(tags): - if tag == "O": - continue - split = tag.split("-") - if len(split) != 2 or split[0] not in ["I", "B"]: - raise TypeError("The encoding schema is not a valid IOB type.") - if split[0] == "B": - continue - elif i == 0 or tags[i - 1] == "O": # conversion IOB1 to IOB2 - tags[i] = "B" + tag[1:] - elif tags[i - 1][1:] == tag[1:]: - continue - else: # conversion IOB1 to IOB2 - tags[i] = "B" + tag[1:] - return tags - -def iob2bioes(tags:List[str])->List[str]: - """ - 将iob的tag转换为bmeso编码 - :param tags: - :return: - """ - new_tags = [] - for i, tag in enumerate(tags): - if tag == 'O': - new_tags.append(tag) - else: - split = tag.split('-')[0] - if split == 'B': - if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': - new_tags.append(tag) - else: - new_tags.append(tag.replace('B-', 'S-')) - elif split == 'I': - if i + 1 Date: Sun, 25 Aug 2019 18:58:03 +0800 Subject: [PATCH 06/19] =?UTF-8?q?=E4=BF=AE=E5=A4=8Dword=20drop=20bug,=20?= =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=9B=B8=E5=BA=94=E6=B5=8B=E8=AF=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/embeddings/bert_embedding.py | 4 ++-- fastNLP/embeddings/embedding.py | 2 +- test/embeddings/test_bert_embedding.py | 9 ++++++++- test/embeddings/test_static_embedding.py | 11 +++++++++++ 4 files changed, 22 insertions(+), 4 deletions(-) diff --git a/fastNLP/embeddings/bert_embedding.py b/fastNLP/embeddings/bert_embedding.py index 4bd06ec3..047048d8 100644 --- a/fastNLP/embeddings/bert_embedding.py +++ b/fastNLP/embeddings/bert_embedding.py @@ -126,7 +126,7 @@ class BertEmbedding(ContextualEmbedding): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._word_sep_index) - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = words.ne(0) mask = pad_mask.__and__(mask) # pad的位置不为unk @@ -267,7 +267,7 @@ class BertWordPieceEncoder(nn.Module): with torch.no_grad(): if self._word_sep_index: # 不能drop sep sep_mask = words.eq(self._wordpiece_unk_index) - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = words.ne(self._wordpiece_pad_index) mask = pad_mask.__and__(mask) # pad的位置不为unk diff --git a/fastNLP/embeddings/embedding.py b/fastNLP/embeddings/embedding.py index a94985c1..5e7b9803 100644 --- a/fastNLP/embeddings/embedding.py +++ b/fastNLP/embeddings/embedding.py @@ -138,7 +138,7 @@ class TokenEmbedding(nn.Module): :return: """ if self.word_dropout > 0 and self.training: - mask = torch.full_like(words, fill_value=self.word_dropout) + mask = torch.full_like(words, fill_value=self.word_dropout, dtype=torch.float, device=words.device) mask = torch.bernoulli(mask).eq(1) # dropout_word越大,越多位置为1 pad_mask = 
words.ne(self._word_pad_index) mask = mask.__and__(pad_mask) diff --git a/test/embeddings/test_bert_embedding.py b/test/embeddings/test_bert_embedding.py index 760029a3..da81c8c9 100644 --- a/test/embeddings/test_bert_embedding.py +++ b/test/embeddings/test_bert_embedding.py @@ -10,5 +10,12 @@ class TestDownload(unittest.TestCase): # import os vocab = Vocabulary().add_word_lst("This is a test .".split()) embed = BertEmbedding(vocab, model_dir_or_name='en') - words = torch.LongTensor([[0, 1, 2]]) + words = torch.LongTensor([[2, 3, 4, 0]]) print(embed(words).size()) + + def test_word_drop(self): + vocab = Vocabulary().add_word_lst("This is a test .".split()) + embed = BertEmbedding(vocab, model_dir_or_name='en', dropout=0.1, word_dropout=0.2) + for i in range(10): + words = torch.LongTensor([[2, 3, 4, 0]]) + print(embed(words).size()) \ No newline at end of file diff --git a/test/embeddings/test_static_embedding.py b/test/embeddings/test_static_embedding.py index 83137345..c17daa0a 100644 --- a/test/embeddings/test_static_embedding.py +++ b/test/embeddings/test_static_embedding.py @@ -5,6 +5,7 @@ from fastNLP import Vocabulary import torch import os + class TestLoad(unittest.TestCase): def test_norm1(self): # 测试只对可以找到的norm @@ -22,6 +23,16 @@ class TestLoad(unittest.TestCase): self.assertEqual(round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4), 1) self.assertEqual(round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4), 1) + def test_dropword(self): + # 测试是否可以通过drop word + vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)]) + embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10, dropout=0.1, word_dropout=0.4) + for i in range(10): + length = torch.randint(1, 50, (1,)).item() + batch = torch.randint(1, 4, (1,)).item() + words = torch.randint(1, 200, (batch, length)).long() + embed(words) + class TestRandomSameEntry(unittest.TestCase): def test_same_vector(self): vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"]) From 584a92c64c62f7319bd2966070d4e138bdf39801 Mon Sep 17 00:00:00 2001 From: yh_cc Date: Mon, 26 Aug 2019 01:33:17 +0800 Subject: [PATCH 07/19] =?UTF-8?q?1.=E5=A2=9E=E5=8A=A0sequence=20labeling?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E7=9A=84=E6=95=B0=E6=8D=AE=E8=AF=B4=E6=98=8E?= =?UTF-8?q?;=202.=E5=A2=9E=E5=8A=A0=E5=AF=B9CWSPipe=E7=9A=84=E5=BC=95?= =?UTF-8?q?=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/__init__.py | 1 + fastNLP/io/pipe/__init__.py | 3 ++ .../seqence_labelling/chinese_ner/readme.md | 30 +++++++++++++++++ reproduction/seqence_labelling/cws/readme.md | 32 +++++++++++++++++++ .../seqence_labelling/cws/test/__init__.py | 0 .../cws/test/test_CWSDataLoader.py | 17 ---------- 6 files changed, 66 insertions(+), 17 deletions(-) create mode 100644 reproduction/seqence_labelling/chinese_ner/readme.md create mode 100644 reproduction/seqence_labelling/cws/readme.md delete mode 100644 reproduction/seqence_labelling/cws/test/__init__.py delete mode 100644 reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 01683628..a3ea0148 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -38,6 +38,7 @@ __all__ = [ 'JsonLoader', 'CWSLoader', + "CWSPipe", 'MNLILoader', "QuoraLoader", diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 1907af4a..048e4cfe 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -10,6 +10,8 @@ Pipe用于处理通过 
Loader 读取的数据,所有的 Pipe 都包含 ``proce __all__ = [ "Pipe", + "CWSPipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -43,3 +45,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe from .pipe import Pipe from .conll import Conll2003Pipe +from .cws import CWSPipe diff --git a/reproduction/seqence_labelling/chinese_ner/readme.md b/reproduction/seqence_labelling/chinese_ner/readme.md new file mode 100644 index 00000000..3a9d37d8 --- /dev/null +++ b/reproduction/seqence_labelling/chinese_ner/readme.md @@ -0,0 +1,30 @@ +使用以下中文NERPipe自动下载的统计数据 + +| MsraNERPipe | # of sents | # of tokens | +| ----------- | ---------- | ----------- | +| train | 41747 | 1954374 | +| dev | 4617 | 215505 | +| test | 4365 | 172601 | +| total | 50729 | 2342480 | +这里报道的统计数据,与[https://arxiv.org/pdf/1805.02023.pdf]()报道的一致 + + + +| WeiboNERPipe | # of sents | # of tokens | +| ------------ | ---------- | ----------- | +| train | 1350 | 73778 | +| dev | 270 | 14509 | +| test | 270 | 14842 | +| total | 1890 | 103129 | +这里报道的统计数据与[https://www.cs.cmu.edu/~ark/EMNLP-2015/proceedings/EMNLP/pdf/EMNLP064.pdf]()一致 + + + + +| PeopleDailyPipe | # of sents | # of tokens | +| --------------- | ---------- | ----------- | +| train | 50658 | 2169879 | +| dev | 4631 | 172601 | +| test | 68 | 2270 | +| total | 55357 | 2344750 | +这里使用的数据与[https://arxiv.org/pdf/1906.08101.pdf]()的数据是一致的 diff --git a/reproduction/seqence_labelling/cws/readme.md b/reproduction/seqence_labelling/cws/readme.md new file mode 100644 index 00000000..a25bb0ed --- /dev/null +++ b/reproduction/seqence_labelling/cws/readme.md @@ -0,0 +1,32 @@ +四个数据集的统计信息,最原始的数据可以从[http://sighan.cs.uchicago.edu/bakeoff2005/]()下载。 + +| pku | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 17173 | 1650222 | +| dev | 1881 | 176226 | +| test | 1944 | 172733 | +| total | 20998 | 1999181 | + + +| cityu | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 47696 | 2164907 | +| dev | 5323 | 238447 | +| test | 1492 | 67690 | +| total | 54511 | 2471044 | + + +| msra | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 78242 | 3644550 | +| dev | 8676 | 405919 | +| test | 3985 | 184355 | +| total | 90903 | 4234824 | + + +| as | # of sents | # of tokens | +| ----- | ---------- | ----------- | +| train | 638273 | 7536586 | +| dev | 70680 | 831464 | +| test | 14429 | 197681 | +| total | 723382 | 8565731 | diff --git a/reproduction/seqence_labelling/cws/test/__init__.py b/reproduction/seqence_labelling/cws/test/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py b/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py deleted file mode 100644 index f4260849..00000000 --- a/reproduction/seqence_labelling/cws/test/test_CWSDataLoader.py +++ /dev/null @@ -1,17 +0,0 @@ - - -import unittest -from ..data.CWSDataLoader import SigHanLoader -from fastNLP.core.vocabulary import VocabularyOption - - -class TestCWSDataLoader(unittest.TestCase): - def test_case1(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt') - print(data.datasets) - - def test_calse2(self): - cws_loader = SigHanLoader(target_type='bmes') - data = cws_loader.process('pku_demo.txt', bigram_vocab_opt=VocabularyOption()) - print(data.datasets) \ No newline at end of file From 78be840ab97b47acbf517962173b9781cc6fbebe Mon Sep 17 00:00:00 2001 From: 
xuyige Date: Mon, 26 Aug 2019 01:56:20 +0800 Subject: [PATCH 08/19] 1.update README 2. fix a filename-bug in pretrain_static_file; 3. add Pipe to documents; 4. update documents in some loaders; 5. update tutorial 2 & 3 to adapt version 0.5.0 --- README.md | 13 +- .../tutorials/tutorial_2_load_dataset.rst | 220 ++++++------------ .../source/tutorials/tutorial_3_embedding.rst | 89 ++----- docs/source/user/tutorials.rst | 2 +- fastNLP/io/__init__.py | 5 +- fastNLP/io/file_utils.py | 2 +- fastNLP/io/loader/__init__.py | 4 +- fastNLP/io/loader/classification.py | 1 - fastNLP/io/loader/conll.py | 3 +- fastNLP/io/loader/csv.py | 2 +- fastNLP/io/pipe/matching.py | 4 +- fastNLP/io/pipe/pipe.py | 3 + 12 files changed, 117 insertions(+), 231 deletions(-) diff --git a/README.md b/README.md index b35776dc..531fbc83 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,12 @@ ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) -fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它构建许多复杂的网络模型,进行科研。它具有如下的特性: +fastNLP 是一款轻量级的 NLP 工具包。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner)、POS-Tagging等)、中文分词、[文本分类](reproduction/text_classification)、[Matching](reproduction/matching)、[指代消解](reproduction/coreference_resolution)、[摘要](reproduction/Summarization)等任务; 也可以使用它快速构建许多复杂的网络模型,进行科研。它具有如下的特性: -- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码; +- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的Loader和Pipe,省去预处理代码; - 多种训练、测试组件,例如训练器Trainer;测试器Tester;以及各种评测metrics等等; - 各种方便的NLP工具,例如预处理embedding加载(包括ELMo和BERT); 中间数据cache等; +- 部分[数据集与预训练模型](https://docs.qq.com/sheet/DVnpkTnF6VW9UeXdh?c=A1A0A0)的自动下载 - 详尽的中文[文档](https://fastnlp.readthedocs.io/)、[教程](https://fastnlp.readthedocs.io/zh/latest/user/tutorials.html)以供查阅; - 提供诸多高级模块,例如Variational LSTM, Transformer, CRF等; - 在序列标注、中文分词、文本分类、Matching、指代消解、摘要等任务上封装了各种模型可供直接使用,详细内容见 [reproduction](reproduction) 部分; @@ -36,7 +37,7 @@ pip install fastNLP python -m spacy download en ``` -目前使用pip安装fastNLP的版本是0.4.1,有较多功能仍未更新,最新内容以master分支为准。 +目前使用pypi安装fastNLP的版本是0.4.1,有较多功能仍未更新,最新内容以master分支为准。 fastNLP0.5.0版本将在近期推出,请密切关注。 @@ -44,7 +45,7 @@ fastNLP0.5.0版本将在近期推出,请密切关注。 - [0. 快速入门](https://fastnlp.readthedocs.io/zh/latest/user/quickstart.html) - [1. 使用DataSet预处理文本](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_1_data_preprocess.html) -- [2. 使用DataSetLoader加载数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html) +- [2. 使用Loader和Pipe加载并处理数据集](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_2_load_dataset.html) - [3. 使用Embedding模块将文本转成向量](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_3_embedding.html) - [4. 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_4_loss_optimizer.html) - [5. 
动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程](https://fastnlp.readthedocs.io/zh/latest/tutorials/tutorial_5_datasetiter.html) @@ -91,7 +92,7 @@ fastNLP 在 embeddings 模块中内置了几种不同的embedding:静态embedd ## 项目结构 -![](./docs/source/figures/workflow.png) + fastNLP的大致工作流程如上图所示,而项目结构如下: @@ -118,7 +119,7 @@ fastNLP的大致工作流程如上图所示,而项目结构如下: fastNLP.io - 实现了读写功能,包括数据读入,模型读写等 + 实现了读写功能,包括数据读入与预处理,模型读写,自动下载等 diff --git a/docs/source/tutorials/tutorial_2_load_dataset.rst b/docs/source/tutorials/tutorial_2_load_dataset.rst index 4fa4a84d..17ad6baf 100644 --- a/docs/source/tutorials/tutorial_2_load_dataset.rst +++ b/docs/source/tutorials/tutorial_2_load_dataset.rst @@ -1,57 +1,53 @@ -================================= -使用DataSetLoader加载数据集 -================================= +======================================= +使用Loader和Pipe加载并处理数据集 +======================================= 这一部分是一个关于如何加载数据集的教程 教程目录: - - `Part I: 数据集容器`_ - - `Part II: 数据集的使用方式`_ - - `Part III: 不同数据类型的DataSetLoader`_ - - `Part IV: DataSetLoader举例`_ - - `Part V: fastNLP封装好的数据集加载器`_ + - `Part I: 数据集容器DataBundle`_ + - `Part II: 加载数据集的基类Loader`_ + - `Part III: 不同格式类型的基础Loader`_ + - `Part IV: 使用Pipe对数据集进行预处理`_ + - `Part V: fastNLP封装好的Loader和Pipe`_ ----------------------------- -Part I: 数据集容器 ----------------------------- +------------------------------------ +Part I: 数据集容器DataBundle +------------------------------------ -在fastNLP中,我们使用 :class:`~fastNLP.io.base_loader.DataBundle` 来存储数据集信息。 -:class:`~fastNLP.io.base_loader.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 +在fastNLP中,我们使用 :class:`~fastNLP.io.data_bundle.DataBundle` 来存储数据集信息。 +:class:`~fastNLP.io.data_bundle.DataBundle` 类包含了两个重要内容: `datasets` 和 `vocabs` 。 `datasets` 是一个 `key` 为数据集名称(如 `train` , `dev` ,和 `test` 等), `value` 为 :class:`~fastNLP.DataSet` 的字典。 `vocabs` 是一个 `key` 为词表名称(如 :attr:`fastNLP.Const.INPUT` 表示输入文本的词表名称, :attr:`fastNLP.Const.TARGET` 表示目标 的真实标签词表的名称,等等), `value` 为词表内容( :class:`~fastNLP.Vocabulary` )的字典。 ----------------------------- -Part II: 数据集的使用方式 ----------------------------- +------------------------------------- +Part II: 加载数据集的基类Loader +------------------------------------- -在fastNLP中,我们采用 :class:`~fastNLP.io.base_loader.DataSetLoader` 来作为加载数据集的基类。 -:class:`~fastNLP.io.base_loader.DataSetLoader` 定义了各种DataSetLoader所需的API接口,开发者应该继承它实现各种的DataSetLoader。 -在各种数据集的DataSetLoader当中,至少应该编写如下内容: +在fastNLP中,我们采用 :class:`~fastNLP.io.loader.Loader` 来作为加载数据集的基类。 +:class:`~fastNLP.io.loader.Loader` 定义了各种Loader所需的API接口,开发者应该继承它实现各种的Loader。 +在各种数据集的Loader当中,至少应该编写如下内容: - - _load 函数:从一个数据文件中读取数据到一个 :class:`~fastNLP.DataSet` - - load 函数(可以使用基类的方法):从一个或多个数据文件中读取数据到一个或多个 :class:`~fastNLP.DataSet` - - process 函数:一个或多个从数据文件中读取数据,并处理成可以训练的 :class:`~fastNLP.io.DataBundle` + - _load 函数:从一个数据文件中读取数据,返回一个 :class:`~fastNLP.DataSet` + - load 函数:从文件或者文件夹中读取数据并组装成 :class:`~fastNLP.io.data_bundle.DataBundle` - **\*process函数中可以调用load函数或_load函数** - -DataSetLoader的_load或者load函数返回的 :class:`~fastNLP.DataSet` 当中,内容为数据集的文本信息,process函数返回的 -:class:`~fastNLP.io.DataBundle` 当中, `datasets` 的内容为已经index好的、可以直接被 :class:`~fastNLP.Trainer` -接受的内容。 +Loader的load函数返回的 :class:`~fastNLP.io.data_bundle.DataBundle` 里面包含了数据集的原始数据。 -------------------------------------------------------- -Part III: 不同数据类型的DataSetLoader +Part III: 不同格式类型的基础Loader -------------------------------------------------------- -:class:`~fastNLP.io.dataset_loader.CSVLoader` +:class:`~fastNLP.io.loader.CSVLoader` 读取CSV类型的数据集文件。例子如下: .. 
code-block:: python + from fastNLP.io.loader import CSVLoader data_set_loader = CSVLoader( headers=('words', 'target'), sep='\t' ) @@ -67,17 +63,18 @@ Part III: 不同数据类型的DataSetLoader The performances are an absolute joy . 4 -:class:`~fastNLP.io.dataset_loader.JsonLoader` +:class:`~fastNLP.io.loader.JsonLoader` 读取Json类型的数据集文件,数据必须按行存储,每行是一个包含各类属性的Json对象。例子如下: .. code-block:: python - data_set_loader = JsonLoader( + from fastNLP.io.loader import JsonLoader + oader = JsonLoader( fields={'sentence1': 'words1', 'sentence2': 'words2', 'gold_label': 'target'} ) # 表示将Json对象中'sentence1'、'sentence2'和'gold_label'对应的值赋给'words1'、'words2'、'target'这三个fields - data_set = data_set_loader._load('path/to/your/file') + data_set = loader._load('path/to/your/file') 数据集内容样例如下 :: @@ -86,139 +83,68 @@ Part III: 不同数据类型的DataSetLoader {"annotator_labels": ["entailment"], "captionID": "3416050480.jpg#4", "gold_label": "entailment", "pairID": "3416050480.jpg#4r1e", "sentence1": "A person on a horse jumps over a broken down airplane.", "sentence1_binary_parse": "( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )", "sentence1_parse": "(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))", "sentence2": "A person is outdoors, on a horse.", "sentence2_binary_parse": "( ( A person ) ( ( ( ( is outdoors ) , ) ( on ( a horse ) ) ) . ) )", "sentence2_parse": "(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (ADVP (RB outdoors)) (, ,) (PP (IN on) (NP (DT a) (NN horse)))) (. .)))"} ------------------------------------------ -Part IV: DataSetLoader举例 +Part IV: 使用Pipe对数据集进行预处理 ------------------------------------------ -以Matching任务为例子: - - :class:`~fastNLP.io.data_loader.MatchingLoader` - 我们在fastNLP当中封装了一个Matching任务数据集的数据加载类: :class:`~fastNLP.io.data_loader.MatchingLoader` . - - 在MatchingLoader类当中我们封装了一个对数据集中的文本内容进行进一步的预处理的函数: - :meth:`~fastNLP.io.data_loader.MatchingLoader.process` - 这个函数具有各种预处理option,如: - - 是否将文本转成全小写 - - 是否需要序列长度信息,需要什么类型的序列长度信息 - - 是否需要用BertTokenizer来获取序列的WordPiece信息 - - 等等 +在fastNLP中,我们采用 :class:`~fastNLP.io.pipe.Pipe` 来作为加载数据集的基类。 +:class:`~fastNLP.io.pipe.Pipe` 定义了各种Pipe所需的API接口,开发者应该继承它实现各种的Pipe。 +在各种数据集的Pipe当中,至少应该编写如下内容: - 具体内容参见 :meth:`fastNLP.io.MatchingLoader.process` 。 + - process 函数:对输入的 :class:`~fastNLP.io.data_bundle.DataBundle` 进行处理(如构建词表、 + 将dataset的文本内容转成index等等),然后返回该 :class:`~fastNLP.io.data_bundle.DataBundle` + - process_from_file 函数:输入数据集所在文件夹,读取内容并组装成 :class:`~fastNLP.io.data_bundle.DataBundle` , + 然后调用相对应的process函数对数据进行预处理 - :class:`~fastNLP.io.data_loader.SNLILoader` - 一个关于SNLI数据集的DataSetLoader。SNLI数据集来自 - `SNLI Data Set `_ . +以SNLI数据集为例,写一个自定义Pipe的例子如下: - 在 :class:`~fastNLP.io.data_loader.SNLILoader` 的 :meth:`~fastNLP.io.data_loader.SNLILoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: +.. code-block:: python - .. code-block:: python + from fastNLP.io.loader import SNLILoader + from fastNLP.io.pipe import MatchingPipe - data = SNLILoader().process( - paths='path/to/snli/data', to_lower=False, seq_len_type='seq_len', - get_index=True, concat=False, - ) - print(data) + class MySNLIPipe(MatchingPipe): - 输出的内容是:: + def process(self, data_bundle): + data_bundle = super(MySNLIPipe, self).process(data_bundle) + # MatchingPipe类里封装了一个关于matching任务的process函数,可以直接继承使用 + # 如果有需要进行额外的预处理操作可以在这里加入您的代码 + return data_bundle - In total 3 datasets: - train has 549367 instances. - dev has 9842 instances. - test has 9824 instances. 
- In total 2 vocabs: - words has 43154 entries. - target has 3 entries. + def process_from_file(self, paths=None): + data_bundle = SNLILoader().load(paths) # 使用SNLILoader读取原始数据集 + # SNLILoader的load函数中,paths如果为None则会自动下载 + return self.process(data_bundle) # 调用相对应的process函数对data_bundle进行处理 +调用Pipe示例: - 这里的data是一个 :class:`~fastNLP.io.base_loader.DataBundle` ,取 ``datasets`` 字典里的内容即可直接传入 - :class:`~fastNLP.Trainer` 或者 :class:`~fastNLP.Tester` 进行训练或者测试。 +.. code-block:: python - :class:`~fastNLP.io.data_loader.IMDBLoader` - 以IMDB数据集为例,在 :class:`~fastNLP.io.data_loader.IMDBLoader` 的 :meth:`~fastNLP.io.data_loader.IMDBLoader._load` - 函数中,我们用以下代码将数据集内容从文本文件读入内存: + from fastNLP.io.pipe import SNLIBertPipe + data_bundle = SNLIBertPipe(lower=True, tokenizer=arg.tokenizer).process_from_file() + print(data_bundle) - .. code-block:: python +输出的内容是:: - data = IMDBLoader().process( - paths={'train': 'path/to/train/file', 'test': 'path/to/test/file'} - ) - print(data) + In total 3 datasets: + train has 549367 instances. + dev has 9842 instances. + test has 9824 instances. + In total 2 vocabs: + words has 34184 entries. + target has 3 entries. - 输出的内容是:: - - In total 3 datasets: - train has 22500 instances. - test has 25000 instances. - dev has 2500 instances. - In total 2 vocabs: - words has 82846 entries. - target has 2 entries. - - - 这里的将原来的train集按9:1的比例分成了训练集和验证集。 +这里表示一共有3个数据集和2个词表。其中: + - 3个数据集分别为train、dev、test数据集,分别有549367、9842、9824个instance + - 2个词表分别为words词表与target词表。其中words词表为句子文本所构建的词表,一共有34184个单词; + target词表为目标标签所构建的词表,一共有3种标签。(注:如果有多个输入,则句子文本所构建的词表将 + 会被命名为words1以对应相对应的列名) ------------------------------------------ -Part V: fastNLP封装好的数据集加载器 +Part V: fastNLP封装好的Loader和Pipe ------------------------------------------ -fastNLP封装好的数据集加载器可以适用于多种类型的任务: - - - `文本分类任务`_ - - `序列标注任务`_ - - `Matching任务`_ - - -文本分类任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -IMDb :class:`~fastNLP.io.data_loader.IMDBLoader` --------------------------- ------------------------------------------------------------------ -SST :class:`~fastNLP.io.data_loader.SSTLoader` --------------------------- ------------------------------------------------------------------ -SST-2 :class:`~fastNLP.io.data_loader.SST2Loader` --------------------------- ------------------------------------------------------------------ -Yelp Polarity :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -Yelp Full :class:`~fastNLP.io.data_loader.YelpLoader` --------------------------- ------------------------------------------------------------------ -MTL16 :class:`~fastNLP.io.data_loader.MTL16Loader` -========================== ================================================================== - - - -序列标注任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -Conll :class:`~fastNLP.io.data_loader.ConllLoader` --------------------------- ------------------------------------------------------------------ -Conll2003 :class:`~fastNLP.io.data_loader.Conll2003Loader` --------------------------- ------------------------------------------------------------------ -人民日报数据集 :class:`~fastNLP.io.data_loader.PeopleDailyCorpusLoader` 
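To show what is typically done with a DataBundle once one of these built-in Pipe classes has
produced it, here is a short sketch; ``SST2Pipe`` is just one example, and the automatic
download that kicks in when no path is given is assumed to be available for this dataset:

.. code-block:: python

    from fastNLP.io.pipe import SST2Pipe

    data_bundle = SST2Pipe().process_from_file()   # read (or download) SST-2 and preprocess it
    train_data = data_bundle.datasets['train']     # an already indexed fastNLP DataSet
    target_vocab = data_bundle.vocabs['target']    # the label Vocabulary
    print(train_data[0])                           # one Instance (fields such as words and target)
    print(len(target_vocab))                       # number of distinct labels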
-========================== ================================================================== - - - -Matching任务 -------------------- - -========================== ================================================================== -数据集名称 数据集加载器 --------------------------- ------------------------------------------------------------------ -SNLI :class:`~fastNLP.io.data_loader.SNLILoader` --------------------------- ------------------------------------------------------------------ -MultiNLI :class:`~fastNLP.io.data_loader.MNLILoader` --------------------------- ------------------------------------------------------------------ -QNLI :class:`~fastNLP.io.data_loader.QNLILoader` --------------------------- ------------------------------------------------------------------ -RTE :class:`~fastNLP.io.data_loader.RTELoader` --------------------------- ------------------------------------------------------------------ -Quora Pair Dataset :class:`~fastNLP.io.data_loader.QuoraLoader` -========================== ================================================================== +fastNLP封装了多种任务/数据集的Loader和Pipe并提供自动下载功能,具体参见文档 + +`fastNLP可加载的embedding与数据集 `_ diff --git a/docs/source/tutorials/tutorial_3_embedding.rst b/docs/source/tutorials/tutorial_3_embedding.rst index 489b43b4..07dc30bc 100644 --- a/docs/source/tutorials/tutorial_3_embedding.rst +++ b/docs/source/tutorials/tutorial_3_embedding.rst @@ -12,6 +12,7 @@ - `Part IV: 使用预训练的Contextual Embedding(ELMo & BERT)`_ - `Part V: 使用character-level的embedding`_ - `Part VI: 叠加使用多个embedding`_ + - `Part VII: fastNLP支持的预训练Embedding`_ @@ -35,12 +36,14 @@ Part II: 使用随机初始化的embedding .. code-block:: python + from fastNLP import Embedding embed = Embedding(10000, 50) 也可以传入一个初始化的参数矩阵: .. code-block:: python + from fastNLP import Embedding embed = Embedding(init_embed) 其中的init_embed可以是torch.FloatTensor、torch.nn.Embedding或者numpy.ndarray。 @@ -59,6 +62,7 @@ Embedding,例子如下: .. 
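The ``init_embed`` variant from Part II accepts an existing weight matrix directly; a small
sketch, where the random matrix only stands in for real pretrained weights:

.. code-block:: python

    import numpy as np
    from fastNLP import Embedding

    init_embed = np.random.randn(10000, 50).astype('float32')  # (vocab_size, embed_dim)
    embed = Embedding(init_embed)  # a torch.FloatTensor or nn.Embedding works the same way

Part III's StaticEmbedding example follows.

.. 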
code-block:: python + from fastNLP import StaticEmbedding embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) vocab为根据数据集构建的词表,model_dir_or_name可以是一个路径,也可以是embedding模型的名称: @@ -67,34 +71,13 @@ vocab为根据数据集构建的词表,model_dir_or_name可以是一个路径 和word2vec类型的权重文件都支持) 2 如果传入的是模型名称,那么fastNLP将会根据名称查找embedding模型,如果在cache目录下找到模型则会 - 自动加载;如果找不到则会自动下载。可以通过环境变量 ``FASTNLP_CACHE_DIR`` 来自定义cache目录,如:: + 自动加载;如果找不到则会自动下载到cache目录。默认的cache目录为 `~/.fastNLP` 文件夹。可以通过环境 + 变量 ``FASTNLP_CACHE_DIR`` 来自定义cache目录,如:: $ FASTNLP_CACHE_DIR=~/fastnlp_cache_dir python your_python_file.py 这个命令表示fastNLP将会在 `~/fastnlp_cache_dir` 这个目录下寻找模型,找不到则会自动将模型下载到这个目录 -目前支持的静态embedding模型有: - - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - en glove.840B.300d - -------------------------- -------------------------------- - en-glove-840d-300 glove.840B.300d - -------------------------- -------------------------------- - en-glove-6b-50 glove.6B.50d - -------------------------- -------------------------------- - en-word2vec-300 谷歌word2vec 300维 - -------------------------- -------------------------------- - en-fasttext 英文fasttext 300维 - -------------------------- -------------------------------- - cn 腾讯中文词向量 200维 - -------------------------- -------------------------------- - cn-fasttext 中文fasttext 300维 - ========================== ================================ - - - ----------------------------------------------------------- Part IV: 使用预训练的Contextual Embedding(ELMo & BERT) ----------------------------------------------------------- @@ -106,62 +89,20 @@ Part IV: 使用预训练的Contextual Embedding(ELMo & BERT) .. code-block:: python + from fastNLP import ElmoEmbedding embed = ElmoEmbedding(vocab, model_dir_or_name='small', requires_grad=False) -目前支持的ElmoEmbedding模型有: - - ========================== ================================ - 模型名称 模型 - -------------------------- -------------------------------- - small allennlp ELMo的small - -------------------------- -------------------------------- - medium allennlp ELMo的medium - -------------------------- -------------------------------- - original allennlp ELMo的original - -------------------------- -------------------------------- - 5.5b-original allennlp ELMo的5.5B original - ========================== ================================ - BERT-embedding的使用方法如下: .. 
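All of these embedding objects behave like modules that map word indices to vectors. A quick
shape check, assuming ``vocab`` and ``embed`` are the objects built above (``to_index`` and
``embedding_dim`` are the accessors this sketch relies on):

.. code-block:: python

    import torch

    words = torch.LongTensor([[vocab.to_index(w) for w in 'this is a test'.split()]])
    out = embed(words)   # works the same way for StaticEmbedding and ElmoEmbedding
    print(out.size())    # (1, 4, embed.embedding_dim)

The BertEmbedding example follows.

.. 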
code-block:: python + from fastNLP import BertEmbedding embed = BertEmbedding( vocab, model_dir_or_name='en-base-cased', requires_grad=False, layers='4,-2,-1' ) 其中layers变量表示需要取哪几层的encode结果。 -目前支持的BertEmbedding模型有: - - ========================== ==================================== - 模型名称 模型 - -------------------------- ------------------------------------ - en bert-base-cased - -------------------------- ------------------------------------ - en-base-uncased bert-base-uncased - -------------------------- ------------------------------------ - en-base-cased bert-base-cased - -------------------------- ------------------------------------ - en-large-uncased bert-large-uncased - -------------------------- ------------------------------------ - en-large-cased bert-large-cased - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - en-large-cased-wwm bert-large-cased-whole-word-mask - -------------------------- ------------------------------------ - en-large-uncased-wwm bert-large-uncased-whole-word-mask - -------------------------- ------------------------------------ - en-base-cased-mrpc bert-base-cased-finetuned-mrpc - -------------------------- ------------------------------------ - -------------------------- ------------------------------------ - multilingual bert-base-multilingual-cased - -------------------------- ------------------------------------ - multilingual-base-uncased bert-base-multilingual-uncased - -------------------------- ------------------------------------ - multilingual-base-cased bert-base-multilingual-cased - ========================== ==================================== - ----------------------------------------------------- Part V: 使用character-level的embedding ----------------------------------------------------- @@ -173,6 +114,7 @@ CNNCharEmbedding的使用例子如下: .. code-block:: python + from fastNLP import CNNCharEmbedding embed = CNNCharEmbedding(vocab, embed_size=100, char_emb_size=50) 这表示这个CNNCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 @@ -181,12 +123,12 @@ CNNCharEmbedding的使用例子如下: .. code-block:: python + from fastNLP import LSTMCharEmbedding embed = LSTMCharEmbedding(vocab, embed_size=100, char_emb_size=50) 这表示这个LSTMCharEmbedding当中character的embedding维度大小为50,返回的embedding结果维度大小为100。 - ----------------------------------------------------- Part VI: 叠加使用多个embedding ----------------------------------------------------- @@ -197,6 +139,7 @@ Part VI: 叠加使用多个embedding .. code-block:: python + from fastNLP import StaticEmbedding, StackEmbedding embed_1 = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) embed_2 = StaticEmbedding(vocab, model_dir_or_name='en-word2vec-300', requires_grad=True) @@ -208,7 +151,17 @@ StackEmbedding会把多个embedding的结果拼接起来,如上面例子的sta .. 
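To make the 350-dimension remark concrete, a short check on the two static embeddings above
(``embedding_dim`` as the attribute name is assumed from the other embedding classes):

.. code-block:: python

    from fastNLP import StackEmbedding

    stack_embed = StackEmbedding([embed_1, embed_2])
    print(stack_embed.embedding_dim)   # 350 = 50 (glove) + 300 (word2vec)

The example of stacking a contextual embedding with a static one follows.

.. 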
code-block:: python + from fastNLP import StaticEmbedding, StackEmbedding, ElmoEmbedding elmo_embedding = ElmoEmbedding(vocab, model_dir_or_name='medium', layers='0,1,2', requires_grad=False) glove_embedding = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50', requires_grad=True) stack_embed = StackEmbedding([elmo_embedding, glove_embedding]) + +------------------------------------------ +Part VII: fastNLP支持的预训练Embedding +------------------------------------------ + +fastNLP支持多种预训练Embedding并提供自动下载功能,具体参见文档 + +`fastNLP可加载的embedding与数据集 `_ + diff --git a/docs/source/user/tutorials.rst b/docs/source/user/tutorials.rst index 196f9c29..3e9e1b54 100644 --- a/docs/source/user/tutorials.rst +++ b/docs/source/user/tutorials.rst @@ -8,7 +8,7 @@ fastNLP 详细使用教程 :maxdepth: 1 使用DataSet预处理文本 - 使用DataSetLoader加载数据集 + 使用Loader和Pipe加载并处理数据集 使用Embedding模块将文本转成向量 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 动手实现一个文本分类器II-使用DataSetIter实现自定义训练过程 diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index a3ea0148..8ed1956a 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -38,7 +38,6 @@ __all__ = [ 'JsonLoader', 'CWSLoader', - "CWSPipe", 'MNLILoader', "QuoraLoader", @@ -46,6 +45,8 @@ __all__ = [ "QNLILoader", "RTELoader", + "Pipe", + "YelpFullPipe", "YelpPolarityPipe", "SSTPipe", @@ -59,6 +60,8 @@ __all__ = [ "PeopleDailyPipe", "WeiboNERPipe", + "CWSPipe", + "MatchingBertPipe", "RTEBertPipe", "SNLIBertPipe", diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 9dbb515d..bd02158e 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -59,7 +59,7 @@ PRETRAIN_STATIC_FILES = { 'en-fasttext-crawl': "crawl-300d-2M.vec.zip", 'cn': "tencent_cn.zip", - 'cn-tencent': "tencent_cn.txt.zip", + 'cn-tencent': "tencent_cn.zip", 'cn-fasttext': "cc.zh.300.vec.gz", 'cn-sgns-literature-word': 'sgns.literature.word.txt.zip', } diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py index 820c33be..6c23f213 100644 --- a/fastNLP/io/loader/__init__.py +++ b/fastNLP/io/loader/__init__.py @@ -62,8 +62,8 @@ __all__ = [ "PeopleDailyNERLoader", "WeiboNERLoader", - # 'CSVLoader', - # 'JsonLoader', + 'CSVLoader', + 'JsonLoader', 'CWSLoader', diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index 67e19773..f64a26e7 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -5,7 +5,6 @@ import warnings import os import random import shutil -import numpy as np import glob import time diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index 5dc4c6d7..b5241cff 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -11,9 +11,10 @@ import shutil import time import random + class ConllLoader(Loader): """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` + 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.loader.ConllLoader` ConllLoader支持读取的数据格式: 以空行隔开两个sample,除了分割行,每一行用空格或者制表符隔开不同的元素。如下例所示: diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py index 166f912b..5195cc8e 100644 --- a/fastNLP/io/loader/csv.py +++ b/fastNLP/io/loader/csv.py @@ -6,7 +6,7 @@ from .loader import Loader class CSVLoader(Loader): """ - 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` + 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.loader.CSVLoader` 读取CSV格式的数据集, 返回 ``DataSet`` 。 diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 0d1b4e82..ffa6375b 100644 --- 
a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -181,8 +181,8 @@ class MatchingPipe(Pipe): "This site includes a...", "The Government Executive...", "not_entailment" "...", "..." - :param data_bundle: - :return: + :param data_bundle: 通过loader读取得到的data_bundle,里面包含了数据集的原始数据内容 + :return: data_bundle """ data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index a2b74301..cc45dee4 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -2,6 +2,9 @@ from .. import DataBundle class Pipe: + """ + 别名::class:`fastNLP.io.Pipe` :class:`fastNLP.io.pipe.Pipe` + """ def process(self, data_bundle: DataBundle) -> DataBundle: """ 对输入的DataBundle进行处理,然后返回该DataBundle。 From 9e16791c538b856184efd4095ab0faed5ff4d2ce Mon Sep 17 00:00:00 2001 From: ChenXin Date: Sun, 25 Aug 2019 17:08:19 +0800 Subject: [PATCH 09/19] fix some importing bugs --- fastNLP/io/pipe/cws.py | 84 ++++++++++++++++++++++++------------------ 1 file changed, 49 insertions(+), 35 deletions(-) diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 6ea1ae0c..4ca0219c 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -1,10 +1,13 @@ +import re +from itertools import chain + from .pipe import Pipe +from .utils import _indexize from .. import DataBundle from ..loader import CWSLoader -from ... import Const -from itertools import chain -from .utils import _indexize -import re +from ...core.const import Const + + def _word_lens_to_bmes(word_lens): """ @@ -13,11 +16,11 @@ def _word_lens_to_bmes(word_lens): """ tags = [] for word_len in word_lens: - if word_len==1: + if word_len == 1: tags.append('S') else: tags.append('B') - tags.extend(['M']*(word_len-2)) + tags.extend(['M'] * (word_len - 2)) tags.append('E') return tags @@ -30,10 +33,10 @@ def _word_lens_to_segapp(word_lens): """ tags = [] for word_len in word_lens: - if word_len==1: + if word_len == 1: tags.append('SEG') else: - tags.extend(['APP']*(word_len-1)) + tags.extend(['APP'] * (word_len - 1)) tags.append('SEG') return tags @@ -97,13 +100,21 @@ def _digit_span_to_special_tag(span): else: return '' + def _find_and_replace_digit_spans(line): - # only consider words start with number, contains '.', characters. - # If ends with space, will be processed - # If ends with Chinese character, will be processed - # If ends with or contains english char, not handled. - # floats are replaced by - # otherwise unkdgt + """ + only consider words start with number, contains '.', characters. + + If ends with space, will be processed + + If ends with Chinese character, will be processed + + If ends with or contains english char, not handled. + + floats are replaced by + + otherwise unkdgt + """ new_line = '' pattern = '\d[\d\\.﹒·]*(?=[\u4e00-\u9fff ,%,。!<-“])' prev_end = 0 @@ -136,17 +147,18 @@ class CWSPipe(Pipe): :param bool bigrams: 是否增加一列bigram. bigram的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...] :param bool trigrams: 是否增加一列trigram. trigram的构成是 ['复', '旦', '大', '学', ...]->["复旦大", "旦大学", ...] 
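    For illustration, the helper functions above map a sequence of word lengths to the two
    supported tag encodings as follows::

        _word_lens_to_bmes([1, 3, 2])     # ['S', 'B', 'M', 'E', 'B', 'E']
        _word_lens_to_segapp([1, 3, 2])   # ['SEG', 'APP', 'APP', 'SEG', 'APP', 'SEG']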
""" + def __init__(self, dataset_name=None, encoding_type='bmes', replace_num_alpha=True, bigrams=False, trigrams=False): - if encoding_type=='bmes': + if encoding_type == 'bmes': self.word_lens_to_tags = _word_lens_to_bmes else: self.word_lens_to_tags = _word_lens_to_segapp - + self.dataset_name = dataset_name self.bigrams = bigrams self.trigrams = trigrams self.replace_num_alpha = replace_num_alpha - + def _tokenize(self, data_bundle): """ 将data_bundle中的'chars'列切分成一个一个的word. @@ -162,10 +174,10 @@ class CWSPipe(Pipe): char = [] subchar = [] for c in word: - if c=='<': + if c == '<': subchar.append(c) continue - if c=='>' and subchar[0]=='<': + if c == '>' and subchar[0] == '<': char.append(''.join(subchar)) subchar = [] if subchar: @@ -175,12 +187,12 @@ class CWSPipe(Pipe): char.extend(subchar) chars.append(char) return chars - + for name, dataset in data_bundle.datasets.items(): dataset.apply_field(split_word_into_chars, field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) return data_bundle - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 可以处理的DataSet需要包含raw_words列 @@ -196,42 +208,43 @@ class CWSPipe(Pipe): :return: """ data_bundle.copy_field(Const.RAW_WORD, Const.CHAR_INPUT) - + if self.replace_num_alpha: data_bundle.apply_field(_find_and_replace_alpha_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) data_bundle.apply_field(_find_and_replace_digit_spans, Const.CHAR_INPUT, Const.CHAR_INPUT) - + self._tokenize(data_bundle) - + for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars:self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, + dataset.apply_field(lambda chars: self.word_lens_to_tags(map(len, chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.TARGET) - dataset.apply_field(lambda chars:list(chain(*chars)), field_name=Const.CHAR_INPUT, + dataset.apply_field(lambda chars: list(chain(*chars)), field_name=Const.CHAR_INPUT, new_field_name=Const.CHAR_INPUT) input_field_names = [Const.CHAR_INPUT] if self.bigrams: for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars: [c1+c2 for c1, c2 in zip(chars, chars[1:]+[''])], + dataset.apply_field(lambda chars: [c1 + c2 for c1, c2 in zip(chars, chars[1:] + [''])], field_name=Const.CHAR_INPUT, new_field_name='bigrams') input_field_names.append('bigrams') if self.trigrams: for name, dataset in data_bundle.datasets.items(): - dataset.apply_field(lambda chars: [c1+c2+c3 for c1, c2, c3 in zip(chars, chars[1:]+[''], chars[2:]+['']*2)], + dataset.apply_field(lambda chars: [c1 + c2 + c3 for c1, c2, c3 in + zip(chars, chars[1:] + [''], chars[2:] + [''] * 2)], field_name=Const.CHAR_INPUT, new_field_name='trigrams') input_field_names.append('trigrams') - + _indexize(data_bundle, input_field_names, Const.TARGET) - + input_fields = [Const.TARGET, Const.INPUT_LEN] + input_field_names target_fields = [Const.TARGET, Const.INPUT_LEN] for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle - + def process_from_file(self, paths=None) -> DataBundle: """ @@ -239,8 +252,9 @@ class CWSPipe(Pipe): :return: """ if self.dataset_name is None and paths is None: - raise RuntimeError("You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") + raise RuntimeError( + "You have to set `paths` when calling process_from_file() or `dataset_name `when initialization.") if self.dataset_name is not None and 
paths is not None: raise RuntimeError("You cannot specify `paths` and `dataset_name` simultaneously") data_bundle = CWSLoader(self.dataset_name).load(paths) - return self.process(data_bundle) \ No newline at end of file + return self.process(data_bundle) From 34e17e97935f69aef54a9d75694713f0823c41fe Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:07:52 +0800 Subject: [PATCH 10/19] update the fastNLP.__init__ : use loader&pipe to replace data_loader --- fastNLP/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fastNLP/__init__.py b/fastNLP/__init__.py index 2720f292..19efac31 100644 --- a/fastNLP/__init__.py +++ b/fastNLP/__init__.py @@ -65,8 +65,8 @@ __all__ = [ ] __version__ = '0.4.5' -from .core import * +from . import embeddings from . import models from . import modules -from . import embeddings -from .io import data_loader +from .core import * +from .io import loader, pipe From 9535ec60b65e7a2bc70394f444b5067bcb161ad9 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:17:43 +0800 Subject: [PATCH 11/19] update the doc system: use customized tool to generate the rst files --- docs/count.py | 133 +++++++----------- docs/source/conf.py | 8 +- docs/source/fastNLP.core.batch.rst | 6 +- docs/source/fastNLP.core.callback.rst | 6 +- docs/source/fastNLP.core.const.rst | 6 +- docs/source/fastNLP.core.dataset.rst | 6 +- docs/source/fastNLP.core.field.rst | 6 +- docs/source/fastNLP.core.instance.rst | 6 +- docs/source/fastNLP.core.losses.rst | 6 +- docs/source/fastNLP.core.metrics.rst | 6 +- docs/source/fastNLP.core.optimizer.rst | 6 +- docs/source/fastNLP.core.rst | 9 +- docs/source/fastNLP.core.sampler.rst | 6 +- docs/source/fastNLP.core.tester.rst | 6 +- docs/source/fastNLP.core.trainer.rst | 6 +- docs/source/fastNLP.core.utils.rst | 6 +- docs/source/fastNLP.core.vocabulary.rst | 6 +- .../fastNLP.embeddings.bert_embedding.rst | 10 +- .../fastNLP.embeddings.char_embedding.rst | 10 +- ...astNLP.embeddings.contextual_embedding.rst | 7 + .../fastNLP.embeddings.elmo_embedding.rst | 10 +- docs/source/fastNLP.embeddings.embedding.rst | 6 +- docs/source/fastNLP.embeddings.rst | 10 +- .../fastNLP.embeddings.stack_embedding.rst | 10 +- .../fastNLP.embeddings.static_embedding.rst | 10 +- docs/source/fastNLP.embeddings.utils.rst | 6 +- docs/source/fastNLP.io.data_bundle.rst | 10 +- docs/source/fastNLP.io.data_loader.rst | 8 -- docs/source/fastNLP.io.dataset_loader.rst | 9 +- docs/source/fastNLP.io.embed_loader.rst | 10 +- docs/source/fastNLP.io.file_utils.rst | 10 +- docs/source/fastNLP.io.loader.rst | 5 +- docs/source/fastNLP.io.model_io.rst | 10 +- docs/source/fastNLP.io.pipe.rst | 5 +- docs/source/fastNLP.io.rst | 21 +-- docs/source/fastNLP.io.utils.rst | 6 +- .../source/fastNLP.models.biaffine_parser.rst | 10 +- ...fastNLP.models.cnn_text_classification.rst | 10 +- docs/source/fastNLP.models.rst | 9 +- .../fastNLP.models.sequence_labeling.rst | 10 +- docs/source/fastNLP.models.snli.rst | 6 +- .../fastNLP.models.star_transformer.rst | 10 +- docs/source/fastNLP.modules.decoder.rst | 5 +- docs/source/fastNLP.modules.encoder.rst | 5 +- docs/source/fastNLP.modules.rst | 15 +- docs/source/fastNLP.modules.utils.rst | 6 +- docs/source/fastNLP.rst | 9 +- 47 files changed, 223 insertions(+), 279 deletions(-) create mode 100644 docs/source/fastNLP.embeddings.contextual_embedding.rst delete mode 100644 docs/source/fastNLP.io.data_loader.rst diff --git a/docs/count.py b/docs/count.py index d906f4c0..e1aad115 100644 --- a/docs/count.py +++ 
b/docs/count.py @@ -1,98 +1,65 @@ import os +import sys -def find_all(path='../fastNLP'): - head_list = [] - alias_list = [] - for path, dirs, files in os.walk(path): +def find_all_modules(): + modules = {} + children = {} + to_doc = set() + root = '../fastNLP' + for path, dirs, files in os.walk(root): for file in files: if file.endswith('.py'): name = ".".join(path.split('/')[1:]) if file.split('.')[0] != "__init__": name = name + '.' + file.split('.')[0] - if len(name.split('.')) < 3 or name.startswith('fastNLP.core'): - heads, alias = find_one(path + '/' + file) - for h in heads: - head_list.append(name + "." + h) - for a in alias: - alias_list.append(a) - heads = {} - for h in head_list: - end = h.split('.')[-1] - file = h[:-len(end) - 1] - if end not in heads: - heads[end] = set() - heads[end].add(file) - alias = {} - for a in alias_list: - for each in a: - end = each.split('.')[-1] - file = each[:-len(end) - 1] - if end not in alias: - alias[end] = set() - alias[end].add(file) - print("IN alias NOT IN heads") - for item in alias: - if item not in heads: - print(item, alias[item]) - elif len(heads[item]) != 2: - print(item, alias[item], heads[item]) - - print("\n\nIN heads NOT IN alias") - for item in heads: - if item not in alias: - print(item, heads[item]) + __import__(name) + m = sys.modules[name] + modules[name] = m + try: + m.__all__ + except: + print(name, "__all__ missing") + continue + if m.__doc__ is None: + print(name, "__doc__ missing") + continue + if "undocumented" not in m.__doc__: + to_doc.add(name) + for module in to_doc: + t = ".".join(module.split('.')[:-1]) + if t in to_doc: + if t not in children: + children[t] = set() + children[t].add(module) + for m in children: + children[m] = sorted(children[m]) + return modules, to_doc, children -def find_class(path): - with open(path, 'r') as fin: - lines = fin.readlines() - pars = {} - for i, line in enumerate(lines): - if line.strip().startswith('class'): - line = line.strip()[len('class'):-1].strip() - if line[-1] == ')': - line = line[:-1].split('(') - name = line[0].strip() - parents = line[1].split(',') - for i in range(len(parents)): - parents[i] = parents[i].strip() - if len(parents) == 1: - pars[name] = parents[0] - else: - pars[name] = tuple(parents) - return pars +def create_rst_file(modules, name, children): + m = modules[name] + with open("./source/" + name + ".rst", "w") as fout: + t = "=" * len(name) + fout.write(name + "\n") + fout.write(t + "\n") + fout.write("\n") + fout.write(".. automodule:: " + name + "\n") + if len(m.__all__) > 0: + fout.write(" :members: " + ", ".join(m.__all__) + "\n") + fout.write(" :inherited-members:\n") + fout.write("\n") + if name in children: + fout.write("子模块\n------\n\n.. 
toctree::\n\n") + for module in children[name]: + fout.write(" " + module + "\n") -def find_one(path): - head_list = [] - alias = [] - with open(path, 'r') as fin: - lines = fin.readlines() - flag = False - for i, line in enumerate(lines): - if line.strip().startswith('__all__'): - line = line.strip()[len('__all__'):].strip() - if line[-1] == ']': - line = line[1:-1].strip()[1:].strip() - head_list.append(line.strip("\"").strip("\'").strip()) - else: - flag = True - elif line.strip() == ']': - flag = False - elif flag: - line = line.strip()[:-1].strip("\"").strip("\'").strip() - if len(line) == 0 or line[0] == '#': - continue - head_list.append(line) - if line.startswith('def') or line.startswith('class'): - if lines[i + 2].strip().startswith("别名:"): - names = lines[i + 2].strip()[len("别名:"):].split() - names[0] = names[0][len(":class:`"):-1] - names[1] = names[1][len(":class:`"):-1] - alias.append((names[0], names[1])) - return head_list, alias +def main(): + modules, to_doc, children = find_all_modules() + for name in to_doc: + create_rst_file(modules, name, children) if __name__ == "__main__": - find_all() # use to check __all__ + main() diff --git a/docs/source/conf.py b/docs/source/conf.py index 2e10bc89..83cb7185 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -48,12 +48,14 @@ extensions = [ autodoc_default_options = { 'member-order': 'bysource', 'special-members': '__init__', - 'undoc-members': True, + 'undoc-members': False, } +autoclass_content = "class" + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] - +# template_bridge # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # @@ -113,7 +115,7 @@ html_static_path = ['_static'] # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'fastNLPdoc' +htmlhelp_basename = 'fastNLP doc' # -- Options for LaTeX output ------------------------------------------------ diff --git a/docs/source/fastNLP.core.batch.rst b/docs/source/fastNLP.core.batch.rst index 03008b52..50ad6fed 100644 --- a/docs/source/fastNLP.core.batch.rst +++ b/docs/source/fastNLP.core.batch.rst @@ -2,6 +2,6 @@ fastNLP.core.batch ================== .. automodule:: fastNLP.core.batch - :members: - :undoc-members: - :show-inheritance: + :members: BatchIter, DataSetIter, TorchLoaderIter + :inherited-members: + diff --git a/docs/source/fastNLP.core.callback.rst b/docs/source/fastNLP.core.callback.rst index 74a7825d..d37ddb11 100644 --- a/docs/source/fastNLP.core.callback.rst +++ b/docs/source/fastNLP.core.callback.rst @@ -2,6 +2,6 @@ fastNLP.core.callback ===================== .. automodule:: fastNLP.core.callback - :members: - :undoc-members: - :show-inheritance: + :members: Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, TesterCallback, CallbackException, EarlyStopError + :inherited-members: + diff --git a/docs/source/fastNLP.core.const.rst b/docs/source/fastNLP.core.const.rst index 330a8883..82a1992e 100644 --- a/docs/source/fastNLP.core.const.rst +++ b/docs/source/fastNLP.core.const.rst @@ -2,6 +2,6 @@ fastNLP.core.const ================== .. 
automodule:: fastNLP.core.const - :members: - :undoc-members: - :show-inheritance: + :members: Const + :inherited-members: + diff --git a/docs/source/fastNLP.core.dataset.rst b/docs/source/fastNLP.core.dataset.rst index 1ad94bb6..e13d7f1c 100644 --- a/docs/source/fastNLP.core.dataset.rst +++ b/docs/source/fastNLP.core.dataset.rst @@ -2,6 +2,6 @@ fastNLP.core.dataset ==================== .. automodule:: fastNLP.core.dataset - :members: - :undoc-members: - :show-inheritance: + :members: DataSet + :inherited-members: + diff --git a/docs/source/fastNLP.core.field.rst b/docs/source/fastNLP.core.field.rst index 7fc099c9..73dad8af 100644 --- a/docs/source/fastNLP.core.field.rst +++ b/docs/source/fastNLP.core.field.rst @@ -2,6 +2,6 @@ fastNLP.core.field ================== .. automodule:: fastNLP.core.field - :members: - :undoc-members: - :show-inheritance: + :members: Padder, AutoPadder, EngChar2DPadder + :inherited-members: + diff --git a/docs/source/fastNLP.core.instance.rst b/docs/source/fastNLP.core.instance.rst index 6e496ac1..010567b9 100644 --- a/docs/source/fastNLP.core.instance.rst +++ b/docs/source/fastNLP.core.instance.rst @@ -2,6 +2,6 @@ fastNLP.core.instance ===================== .. automodule:: fastNLP.core.instance - :members: - :undoc-members: - :show-inheritance: + :members: Instance + :inherited-members: + diff --git a/docs/source/fastNLP.core.losses.rst b/docs/source/fastNLP.core.losses.rst index 8e63dfa1..daf246f8 100644 --- a/docs/source/fastNLP.core.losses.rst +++ b/docs/source/fastNLP.core.losses.rst @@ -2,6 +2,6 @@ fastNLP.core.losses =================== .. automodule:: fastNLP.core.losses - :members: - :undoc-members: - :show-inheritance: + :members: LossBase, LossFunc, LossInForward, CrossEntropyLoss, BCELoss, L1Loss, NLLLoss + :inherited-members: + diff --git a/docs/source/fastNLP.core.metrics.rst b/docs/source/fastNLP.core.metrics.rst index d3b87bb8..96748a78 100644 --- a/docs/source/fastNLP.core.metrics.rst +++ b/docs/source/fastNLP.core.metrics.rst @@ -2,6 +2,6 @@ fastNLP.core.metrics ==================== .. automodule:: fastNLP.core.metrics - :members: - :undoc-members: - :show-inheritance: + :members: MetricBase, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric + :inherited-members: + diff --git a/docs/source/fastNLP.core.optimizer.rst b/docs/source/fastNLP.core.optimizer.rst index c80be53f..44e45c4f 100644 --- a/docs/source/fastNLP.core.optimizer.rst +++ b/docs/source/fastNLP.core.optimizer.rst @@ -2,6 +2,6 @@ fastNLP.core.optimizer ====================== .. automodule:: fastNLP.core.optimizer - :members: - :undoc-members: - :show-inheritance: + :members: Optimizer, SGD, Adam, AdamW + :inherited-members: + diff --git a/docs/source/fastNLP.core.rst b/docs/source/fastNLP.core.rst index 08d161b7..56de46e9 100644 --- a/docs/source/fastNLP.core.rst +++ b/docs/source/fastNLP.core.rst @@ -2,12 +2,11 @@ fastNLP.core ============ .. 
automodule:: fastNLP.core - :members: - :undoc-members: - :show-inheritance: + :members: DataSet, Instance, FieldArray, Padder, AutoPadder, EngChar2DPadder, Vocabulary, DataSetIter, BatchIter, TorchLoaderIter, Const, Tester, Trainer, cache_results, seq_len_to_mask, get_seq_len, logger, Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, TesterCallback, CallbackException, EarlyStopError, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, SequentialSampler, BucketSampler, RandomSampler, Sampler + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: diff --git a/docs/source/fastNLP.core.sampler.rst b/docs/source/fastNLP.core.sampler.rst index 0110f0c0..56291894 100644 --- a/docs/source/fastNLP.core.sampler.rst +++ b/docs/source/fastNLP.core.sampler.rst @@ -2,6 +2,6 @@ fastNLP.core.sampler ==================== .. automodule:: fastNLP.core.sampler - :members: - :undoc-members: - :show-inheritance: + :members: Sampler, BucketSampler, SequentialSampler, RandomSampler + :inherited-members: + diff --git a/docs/source/fastNLP.core.tester.rst b/docs/source/fastNLP.core.tester.rst index 4d71a27b..90ec2a88 100644 --- a/docs/source/fastNLP.core.tester.rst +++ b/docs/source/fastNLP.core.tester.rst @@ -2,6 +2,6 @@ fastNLP.core.tester =================== .. automodule:: fastNLP.core.tester - :members: - :undoc-members: - :show-inheritance: + :members: Tester + :inherited-members: + diff --git a/docs/source/fastNLP.core.trainer.rst b/docs/source/fastNLP.core.trainer.rst index 60bf2d5b..92c08718 100644 --- a/docs/source/fastNLP.core.trainer.rst +++ b/docs/source/fastNLP.core.trainer.rst @@ -2,6 +2,6 @@ fastNLP.core.trainer ==================== .. automodule:: fastNLP.core.trainer - :members: - :undoc-members: - :show-inheritance: + :members: Trainer + :inherited-members: + diff --git a/docs/source/fastNLP.core.utils.rst b/docs/source/fastNLP.core.utils.rst index 3f80b4e8..027a43e9 100644 --- a/docs/source/fastNLP.core.utils.rst +++ b/docs/source/fastNLP.core.utils.rst @@ -2,6 +2,6 @@ fastNLP.core.utils ================== .. automodule:: fastNLP.core.utils - :members: - :undoc-members: - :show-inheritance: + :members: cache_results, seq_len_to_mask, get_seq_len + :inherited-members: + diff --git a/docs/source/fastNLP.core.vocabulary.rst b/docs/source/fastNLP.core.vocabulary.rst index ba9598b9..ac07a8c6 100644 --- a/docs/source/fastNLP.core.vocabulary.rst +++ b/docs/source/fastNLP.core.vocabulary.rst @@ -2,6 +2,6 @@ fastNLP.core.vocabulary ======================= .. automodule:: fastNLP.core.vocabulary - :members: - :undoc-members: - :show-inheritance: + :members: Vocabulary, VocabularyOption + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.bert_embedding.rst b/docs/source/fastNLP.embeddings.bert_embedding.rst index 24ceff1c..51828cb0 100644 --- a/docs/source/fastNLP.embeddings.bert_embedding.rst +++ b/docs/source/fastNLP.embeddings.bert_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.bert\_embedding -================================== +fastNLP.embeddings.bert_embedding +================================= .. 
automodule:: fastNLP.embeddings.bert_embedding - :members: - :undoc-members: - :show-inheritance: + :members: BertEmbedding, BertWordPieceEncoder + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.char_embedding.rst b/docs/source/fastNLP.embeddings.char_embedding.rst index 501089d8..a9b129d8 100644 --- a/docs/source/fastNLP.embeddings.char_embedding.rst +++ b/docs/source/fastNLP.embeddings.char_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.char\_embedding -================================== +fastNLP.embeddings.char_embedding +================================= .. automodule:: fastNLP.embeddings.char_embedding - :members: - :undoc-members: - :show-inheritance: + :members: CNNCharEmbedding, LSTMCharEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.contextual_embedding.rst b/docs/source/fastNLP.embeddings.contextual_embedding.rst new file mode 100644 index 00000000..ee64c7a0 --- /dev/null +++ b/docs/source/fastNLP.embeddings.contextual_embedding.rst @@ -0,0 +1,7 @@ +fastNLP.embeddings.contextual_embedding +======================================= + +.. automodule:: fastNLP.embeddings.contextual_embedding + :members: ContextualEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.elmo_embedding.rst b/docs/source/fastNLP.embeddings.elmo_embedding.rst index 76669ee3..06cc13af 100644 --- a/docs/source/fastNLP.embeddings.elmo_embedding.rst +++ b/docs/source/fastNLP.embeddings.elmo_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.elmo\_embedding -================================== +fastNLP.embeddings.elmo_embedding +================================= .. automodule:: fastNLP.embeddings.elmo_embedding - :members: - :undoc-members: - :show-inheritance: + :members: ElmoEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.embedding.rst b/docs/source/fastNLP.embeddings.embedding.rst index 5960d2cd..4d5fcf46 100644 --- a/docs/source/fastNLP.embeddings.embedding.rst +++ b/docs/source/fastNLP.embeddings.embedding.rst @@ -2,6 +2,6 @@ fastNLP.embeddings.embedding ============================ .. automodule:: fastNLP.embeddings.embedding - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.rst b/docs/source/fastNLP.embeddings.rst index 6872e91d..8376408c 100644 --- a/docs/source/fastNLP.embeddings.rst +++ b/docs/source/fastNLP.embeddings.rst @@ -2,17 +2,17 @@ fastNLP.embeddings ================== .. automodule:: fastNLP.embeddings - :members: - :undoc-members: - :show-inheritance: + :members: Embedding, TokenEmbedding, StaticEmbedding, ElmoEmbedding, BertEmbedding, BertWordPieceEncoder, StackEmbedding, LSTMCharEmbedding, CNNCharEmbedding, get_embeddings + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: fastNLP.embeddings.bert_embedding fastNLP.embeddings.char_embedding + fastNLP.embeddings.contextual_embedding fastNLP.embeddings.elmo_embedding fastNLP.embeddings.embedding fastNLP.embeddings.stack_embedding diff --git a/docs/source/fastNLP.embeddings.stack_embedding.rst b/docs/source/fastNLP.embeddings.stack_embedding.rst index 4d2115f7..6af91623 100644 --- a/docs/source/fastNLP.embeddings.stack_embedding.rst +++ b/docs/source/fastNLP.embeddings.stack_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.stack\_embedding -=================================== +fastNLP.embeddings.stack_embedding +================================== .. 
automodule:: fastNLP.embeddings.stack_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StackEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.static_embedding.rst b/docs/source/fastNLP.embeddings.static_embedding.rst index e46de81a..2df1c329 100644 --- a/docs/source/fastNLP.embeddings.static_embedding.rst +++ b/docs/source/fastNLP.embeddings.static_embedding.rst @@ -1,7 +1,7 @@ -fastNLP.embeddings.static\_embedding -==================================== +fastNLP.embeddings.static_embedding +=================================== .. automodule:: fastNLP.embeddings.static_embedding - :members: - :undoc-members: - :show-inheritance: + :members: StaticEmbedding + :inherited-members: + diff --git a/docs/source/fastNLP.embeddings.utils.rst b/docs/source/fastNLP.embeddings.utils.rst index 263bfbd6..13e5936b 100644 --- a/docs/source/fastNLP.embeddings.utils.rst +++ b/docs/source/fastNLP.embeddings.utils.rst @@ -2,6 +2,6 @@ fastNLP.embeddings.utils ======================== .. automodule:: fastNLP.embeddings.utils - :members: - :undoc-members: - :show-inheritance: + :members: get_embeddings + :inherited-members: + diff --git a/docs/source/fastNLP.io.data_bundle.rst b/docs/source/fastNLP.io.data_bundle.rst index a6273956..71a921f1 100644 --- a/docs/source/fastNLP.io.data_bundle.rst +++ b/docs/source/fastNLP.io.data_bundle.rst @@ -1,7 +1,7 @@ -fastNLP.io.data\_bundle -======================= +fastNLP.io.data_bundle +====================== .. automodule:: fastNLP.io.data_bundle - :members: - :undoc-members: - :show-inheritance: + :members: DataBundle + :inherited-members: + diff --git a/docs/source/fastNLP.io.data_loader.rst b/docs/source/fastNLP.io.data_loader.rst deleted file mode 100644 index 0b4f5d0b..00000000 --- a/docs/source/fastNLP.io.data_loader.rst +++ /dev/null @@ -1,8 +0,0 @@ -fastNLP.io.data\_loader -======================= - -.. automodule:: fastNLP.io.data_loader - :members: - :undoc-members: - :show-inheritance: - diff --git a/docs/source/fastNLP.io.dataset_loader.rst b/docs/source/fastNLP.io.dataset_loader.rst index e7990714..c211ecf9 100644 --- a/docs/source/fastNLP.io.dataset_loader.rst +++ b/docs/source/fastNLP.io.dataset_loader.rst @@ -1,7 +1,6 @@ -fastNLP.io.dataset\_loader -========================== +fastNLP.io.dataset_loader +========================= .. automodule:: fastNLP.io.dataset_loader - :members: - :undoc-members: - :show-inheritance: + :members: CSVLoader, JsonLoader + diff --git a/docs/source/fastNLP.io.embed_loader.rst b/docs/source/fastNLP.io.embed_loader.rst index 69e1f7ff..581f5c1b 100644 --- a/docs/source/fastNLP.io.embed_loader.rst +++ b/docs/source/fastNLP.io.embed_loader.rst @@ -1,7 +1,7 @@ -fastNLP.io.embed\_loader -======================== +fastNLP.io.embed_loader +======================= .. automodule:: fastNLP.io.embed_loader - :members: - :undoc-members: - :show-inheritance: + :members: EmbedLoader, EmbeddingOption + :inherited-members: + diff --git a/docs/source/fastNLP.io.file_utils.rst b/docs/source/fastNLP.io.file_utils.rst index 944550d7..0815e068 100644 --- a/docs/source/fastNLP.io.file_utils.rst +++ b/docs/source/fastNLP.io.file_utils.rst @@ -1,7 +1,7 @@ -fastNLP.io.file\_utils -====================== +fastNLP.io.file_utils +===================== .. 
automodule:: fastNLP.io.file_utils - :members: - :undoc-members: - :show-inheritance: + :members: cached_path, get_filepath, get_cache_path, split_filename_suffix, get_from_cache + :inherited-members: + diff --git a/docs/source/fastNLP.io.loader.rst b/docs/source/fastNLP.io.loader.rst index bbdc1d7a..060b5450 100644 --- a/docs/source/fastNLP.io.loader.rst +++ b/docs/source/fastNLP.io.loader.rst @@ -2,7 +2,6 @@ fastNLP.io.loader ================= .. automodule:: fastNLP.io.loader - :members: - :undoc-members: - :show-inheritance: + :members: Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader + :inherited-members: diff --git a/docs/source/fastNLP.io.model_io.rst b/docs/source/fastNLP.io.model_io.rst index 537ce752..183122b1 100644 --- a/docs/source/fastNLP.io.model_io.rst +++ b/docs/source/fastNLP.io.model_io.rst @@ -1,7 +1,7 @@ -fastNLP.io.model\_io -==================== +fastNLP.io.model_io +=================== .. automodule:: fastNLP.io.model_io - :members: - :undoc-members: - :show-inheritance: + :members: ModelLoader, ModelSaver + :inherited-members: + diff --git a/docs/source/fastNLP.io.pipe.rst b/docs/source/fastNLP.io.pipe.rst index bf126585..d35d2ddc 100644 --- a/docs/source/fastNLP.io.pipe.rst +++ b/docs/source/fastNLP.io.pipe.rst @@ -2,7 +2,6 @@ fastNLP.io.pipe =============== .. automodule:: fastNLP.io.pipe - :members: - :undoc-members: - :show-inheritance: + :members: Pipe, CWSPipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe, Conll2003Pipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe + :inherited-members: diff --git a/docs/source/fastNLP.io.rst b/docs/source/fastNLP.io.rst index 0cd5d3f2..2aacb883 100644 --- a/docs/source/fastNLP.io.rst +++ b/docs/source/fastNLP.io.rst @@ -2,27 +2,18 @@ fastNLP.io ========== .. automodule:: fastNLP.io - :members: - :undoc-members: - :show-inheritance: + :members: DataBundle, EmbedLoader, Loader, YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader, ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader, MsraNERLoader, WeiboNERLoader, PeopleDailyNERLoader, CSVLoader, JsonLoader, CWSLoader, MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader, Pipe, YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, Conll2003Pipe, Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, PeopleDailyPipe, WeiboNERPipe, CWSPipe, MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe, ModelLoader, ModelSaver + :inherited-members: -Subpackages ------------ - -.. toctree:: - - fastNLP.io.data_loader - fastNLP.io.loader - fastNLP.io.pipe - -Submodules ----------- +子模块 +------ .. 
toctree:: fastNLP.io.data_bundle - fastNLP.io.dataset_loader fastNLP.io.embed_loader fastNLP.io.file_utils + fastNLP.io.loader fastNLP.io.model_io + fastNLP.io.pipe fastNLP.io.utils diff --git a/docs/source/fastNLP.io.utils.rst b/docs/source/fastNLP.io.utils.rst index 0b3f3938..3bff3c45 100644 --- a/docs/source/fastNLP.io.utils.rst +++ b/docs/source/fastNLP.io.utils.rst @@ -2,6 +2,6 @@ fastNLP.io.utils ================ .. automodule:: fastNLP.io.utils - :members: - :undoc-members: - :show-inheritance: + :members: check_loader_paths + :inherited-members: + diff --git a/docs/source/fastNLP.models.biaffine_parser.rst b/docs/source/fastNLP.models.biaffine_parser.rst index f19504e8..c3dbb0a5 100644 --- a/docs/source/fastNLP.models.biaffine_parser.rst +++ b/docs/source/fastNLP.models.biaffine_parser.rst @@ -1,7 +1,7 @@ -fastNLP.models.biaffine\_parser -=============================== +fastNLP.models.biaffine_parser +============================== .. automodule:: fastNLP.models.biaffine_parser - :members: - :undoc-members: - :show-inheritance: + :members: BiaffineParser, GraphParser + :inherited-members: + diff --git a/docs/source/fastNLP.models.cnn_text_classification.rst b/docs/source/fastNLP.models.cnn_text_classification.rst index eacf6916..fe4bb157 100644 --- a/docs/source/fastNLP.models.cnn_text_classification.rst +++ b/docs/source/fastNLP.models.cnn_text_classification.rst @@ -1,7 +1,7 @@ -fastNLP.models.cnn\_text\_classification -======================================== +fastNLP.models.cnn_text_classification +====================================== .. automodule:: fastNLP.models.cnn_text_classification - :members: - :undoc-members: - :show-inheritance: + :members: CNNText + :inherited-members: + diff --git a/docs/source/fastNLP.models.rst b/docs/source/fastNLP.models.rst index 36875b85..88854a79 100644 --- a/docs/source/fastNLP.models.rst +++ b/docs/source/fastNLP.models.rst @@ -2,12 +2,11 @@ fastNLP.models ============== .. automodule:: fastNLP.models - :members: - :undoc-members: - :show-inheritance: + :members: CNNText, SeqLabeling, AdvSeqLabel, ESIM, StarTransEnc, STSeqLabel, STNLICls, STSeqCls, BiaffineParser, GraphParser + :inherited-members: -Submodules ----------- +子模块 +------ .. toctree:: diff --git a/docs/source/fastNLP.models.sequence_labeling.rst b/docs/source/fastNLP.models.sequence_labeling.rst index 85e28f06..b66e637e 100644 --- a/docs/source/fastNLP.models.sequence_labeling.rst +++ b/docs/source/fastNLP.models.sequence_labeling.rst @@ -1,7 +1,7 @@ -fastNLP.models.sequence\_labeling -================================= +fastNLP.models.sequence_labeling +================================ .. automodule:: fastNLP.models.sequence_labeling - :members: - :undoc-members: - :show-inheritance: + :members: SeqLabeling, AdvSeqLabel + :inherited-members: + diff --git a/docs/source/fastNLP.models.snli.rst b/docs/source/fastNLP.models.snli.rst index 3b9b555c..8551051a 100644 --- a/docs/source/fastNLP.models.snli.rst +++ b/docs/source/fastNLP.models.snli.rst @@ -2,6 +2,6 @@ fastNLP.models.snli =================== .. 
automodule:: fastNLP.models.snli - :members: - :undoc-members: - :show-inheritance: + :members: ESIM + :inherited-members: + diff --git a/docs/source/fastNLP.models.star_transformer.rst b/docs/source/fastNLP.models.star_transformer.rst index 69d5c5b2..f4b5989e 100644 --- a/docs/source/fastNLP.models.star_transformer.rst +++ b/docs/source/fastNLP.models.star_transformer.rst @@ -1,7 +1,7 @@ -fastNLP.models.star\_transformer -================================ +fastNLP.models.star_transformer +=============================== .. automodule:: fastNLP.models.star_transformer - :members: - :undoc-members: - :show-inheritance: + :members: StarTransEnc, STNLICls, STSeqCls, STSeqLabel + :inherited-members: + diff --git a/docs/source/fastNLP.modules.decoder.rst b/docs/source/fastNLP.modules.decoder.rst index ecc2adbd..b121f9e9 100644 --- a/docs/source/fastNLP.modules.decoder.rst +++ b/docs/source/fastNLP.modules.decoder.rst @@ -2,7 +2,6 @@ fastNLP.modules.decoder ======================= .. automodule:: fastNLP.modules.decoder - :members: - :undoc-members: - :show-inheritance: + :members: MLP, ConditionalRandomField, viterbi_decode, allowed_transitions + :inherited-members: diff --git a/docs/source/fastNLP.modules.encoder.rst b/docs/source/fastNLP.modules.encoder.rst index e60f9fa4..6b44a192 100644 --- a/docs/source/fastNLP.modules.encoder.rst +++ b/docs/source/fastNLP.modules.encoder.rst @@ -2,7 +2,6 @@ fastNLP.modules.encoder ======================= .. automodule:: fastNLP.modules.encoder - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention + :inherited-members: diff --git a/docs/source/fastNLP.modules.rst b/docs/source/fastNLP.modules.rst index 06494b53..6134d0dd 100644 --- a/docs/source/fastNLP.modules.rst +++ b/docs/source/fastNLP.modules.rst @@ -2,21 +2,14 @@ fastNLP.modules =============== .. automodule:: fastNLP.modules - :members: - :undoc-members: - :show-inheritance: + :members: ConvolutionCharEncoder, LSTMCharEncoder, ConvMaxpool, LSTM, StarTransformer, TransformerEncoder, VarRNN, VarLSTM, VarGRU, MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask, MultiHeadAttention, MLP, ConditionalRandomField, viterbi_decode, allowed_transitions, TimestepDropout + :inherited-members: -Subpackages ------------ +子模块 +------ .. toctree:: fastNLP.modules.decoder fastNLP.modules.encoder - -Submodules ----------- - -.. toctree:: - fastNLP.modules.utils diff --git a/docs/source/fastNLP.modules.utils.rst b/docs/source/fastNLP.modules.utils.rst index c0219435..e28ca35a 100644 --- a/docs/source/fastNLP.modules.utils.rst +++ b/docs/source/fastNLP.modules.utils.rst @@ -2,6 +2,6 @@ fastNLP.modules.utils ===================== .. automodule:: fastNLP.modules.utils - :members: - :undoc-members: - :show-inheritance: + :members: initial_parameter, summary + :inherited-members: + diff --git a/docs/source/fastNLP.rst b/docs/source/fastNLP.rst index e3ba429d..f22ea936 100644 --- a/docs/source/fastNLP.rst +++ b/docs/source/fastNLP.rst @@ -2,12 +2,11 @@ fastNLP ======= .. 
automodule:: fastNLP - :members: - :undoc-members: - :show-inheritance: + :members: Instance, FieldArray, DataSetIter, BatchIter, TorchLoaderIter, Vocabulary, DataSet, Const, Trainer, Tester, Callback, GradientClipCallback, EarlyStopCallback, TensorboardCallback, LRScheduler, ControlC, LRFinder, Padder, AutoPadder, EngChar2DPadder, AccuracyMetric, SpanFPreRecMetric, ExtractiveQAMetric, Optimizer, SGD, Adam, AdamW, Sampler, SequentialSampler, BucketSampler, RandomSampler, LossFunc, CrossEntropyLoss, L1Loss, BCELoss, NLLLoss, LossInForward, cache_results, logger + :inherited-members: -Subpackages ------------ +子模块 +------ .. toctree:: From efe88263bb2fb7bebacb8022eb86c390e266ec36 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:21:10 +0800 Subject: [PATCH 12/19] add __all__ and __doc__ for all files in module 'core', using 'undocumented' tags --- fastNLP/core/__init__.py | 67 +++++++++++++++++++++- fastNLP/core/_logger.py | 38 ++++++------ fastNLP/core/_parallel_utils.py | 21 ++++--- fastNLP/core/const.py | 26 ++++++--- fastNLP/core/dist_trainer.py | 22 +++---- fastNLP/core/field.py | 19 ++++-- fastNLP/core/predictor.py | 28 ++++----- fastNLP/core/vocabulary.py | 28 +++++---- fastNLP/embeddings/contextual_embedding.py | 10 ++-- 9 files changed, 178 insertions(+), 81 deletions(-) diff --git a/fastNLP/core/__init__.py b/fastNLP/core/__init__.py index 1feaf3fb..efee08b5 100644 --- a/fastNLP/core/__init__.py +++ b/fastNLP/core/__init__.py @@ -10,8 +10,72 @@ core 模块里实现了 fastNLP 的核心框架,常用的功能都可以从 fa 对于常用的功能,你只需要在 :doc:`fastNLP` 中查看即可。如果想了解各个子模块的具体作用,您可以在下面找到每个子模块的具体文档。 - """ +__all__ = [ + "DataSet", + + "Instance", + + "FieldArray", + "Padder", + "AutoPadder", + "EngChar2DPadder", + + "Vocabulary", + + "DataSetIter", + "BatchIter", + "TorchLoaderIter", + + "Const", + + "Tester", + "Trainer", + + "cache_results", + "seq_len_to_mask", + "get_seq_len", + "logger", + + "Callback", + "GradientClipCallback", + "EarlyStopCallback", + "FitlogCallback", + "EvaluateCallback", + "LRScheduler", + "ControlC", + "LRFinder", + "TensorboardCallback", + "WarmupCallback", + 'SaveModelCallback', + "EchoCallback", + "TesterCallback", + "CallbackException", + "EarlyStopError", + + "LossFunc", + "CrossEntropyLoss", + "L1Loss", + "BCELoss", + "NLLLoss", + "LossInForward", + + "AccuracyMetric", + "SpanFPreRecMetric", + "ExtractiveQAMetric", + + "Optimizer", + "SGD", + "Adam", + "AdamW", + + "SequentialSampler", + "BucketSampler", + "RandomSampler", + "Sampler", +] + +from ._logger import logger from .batch import DataSetIter, BatchIter, TorchLoaderIter from .callback import Callback, GradientClipCallback, EarlyStopCallback, FitlogCallback, EvaluateCallback, \ LRScheduler, ControlC, LRFinder, TensorboardCallback, WarmupCallback, SaveModelCallback, EchoCallback, \ @@ -28,4 +92,3 @@ from .tester import Tester from .trainer import Trainer from .utils import cache_results, seq_len_to_mask, get_seq_len from .vocabulary import Vocabulary -from ._logger import logger diff --git a/fastNLP/core/_logger.py b/fastNLP/core/_logger.py index 50266d7a..7198cfbd 100644 --- a/fastNLP/core/_logger.py +++ b/fastNLP/core/_logger.py @@ -1,15 +1,15 @@ +"""undocumented""" + +__all__ = [ + 'logger', +] + import logging import logging.config -import torch -import _pickle as pickle import os import sys import warnings -__all__ = [ - 'logger', -] - ROOT_NAME = 'fastNLP' try: @@ -25,7 +25,7 @@ if tqdm is not None: class TqdmLoggingHandler(logging.Handler): def __init__(self, level=logging.INFO): super().__init__(level) - + def 
emit(self, record): try: msg = self.format(record) @@ -59,14 +59,14 @@ def _add_file_handler(logger, path, level='INFO'): if os.path.abspath(path) == h.baseFilename: # file path already added return - + # File Handler if os.path.exists(path): assert os.path.isfile(path) warnings.warn('log already exists in {}'.format(path)) dirname = os.path.abspath(os.path.dirname(path)) os.makedirs(dirname, exist_ok=True) - + file_handler = logging.FileHandler(path, mode='a') file_handler.setLevel(_get_level(level)) file_formatter = logging.Formatter(fmt='%(asctime)s - %(module)s - [%(levelname)s] - %(message)s', @@ -87,7 +87,7 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): break if stream_handler is not None: logger.removeHandler(stream_handler) - + # Stream Handler if stdout == 'plain': stream_handler = logging.StreamHandler(sys.stdout) @@ -95,7 +95,7 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): stream_handler = TqdmLoggingHandler(level) else: stream_handler = None - + if stream_handler is not None: stream_formatter = logging.Formatter('%(message)s') stream_handler.setLevel(level) @@ -103,38 +103,40 @@ def _set_stdout_handler(logger, stdout='tqdm', level='INFO'): logger.addHandler(stream_handler) - class FastNLPLogger(logging.getLoggerClass()): def __init__(self, name): super().__init__(name) - + def add_file(self, path='./log.txt', level='INFO'): """add log output file and level""" _add_file_handler(self, path, level) - + def set_stdout(self, stdout='tqdm', level='INFO'): """set stdout format and level""" _set_stdout_handler(self, stdout, level) + logging.setLoggerClass(FastNLPLogger) + + # print(logging.getLoggerClass()) # print(logging.getLogger()) def _init_logger(path=None, stdout='tqdm', level='INFO'): """initialize logger""" level = _get_level(level) - + # logger = logging.getLogger() logger = logging.getLogger(ROOT_NAME) logger.propagate = False logger.setLevel(level) - + _set_stdout_handler(logger, stdout, level) - + # File Handler if path is not None: _add_file_handler(logger, path, level) - + return logger diff --git a/fastNLP/core/_parallel_utils.py b/fastNLP/core/_parallel_utils.py index 6b24d9f9..ce745820 100644 --- a/fastNLP/core/_parallel_utils.py +++ b/fastNLP/core/_parallel_utils.py @@ -1,11 +1,14 @@ +"""undocumented""" + +__all__ = [] import threading + import torch from torch import nn from torch.nn.parallel.parallel_apply import get_a_var - -from torch.nn.parallel.scatter_gather import scatter_kwargs, gather from torch.nn.parallel.replicate import replicate +from torch.nn.parallel.scatter_gather import scatter_kwargs, gather def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): @@ -27,11 +30,11 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): assert len(modules) == len(devices) else: devices = [None] * len(modules) - + lock = threading.Lock() results = {} grad_enabled = torch.is_grad_enabled() - + def _worker(i, module, input, kwargs, device=None): torch.set_grad_enabled(grad_enabled) if device is None: @@ -47,20 +50,20 @@ def parallel_apply(modules, func_name, inputs, kwargs_tup=None, devices=None): except Exception as e: with lock: results[i] = e - + if len(modules) > 1: threads = [threading.Thread(target=_worker, args=(i, module, input, kwargs, device)) for i, (module, input, kwargs, device) in enumerate(zip(modules, inputs, kwargs_tup, devices))] - + for thread in threads: thread.start() for thread in threads: thread.join() else: _worker(0, modules[0], inputs[0], kwargs_tup[0], 
devices[0]) - + outputs = [] for i in range(len(inputs)): output = results[i] @@ -79,6 +82,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): :param output_device: nn.DataParallel中的output_device :return: """ + def wrapper(network, *inputs, **kwargs): inputs, kwargs = scatter_kwargs(inputs, kwargs, device_ids, dim=0) if len(device_ids) == 1: @@ -86,6 +90,7 @@ def _data_parallel_wrapper(func_name, device_ids, output_device): replicas = replicate(network, device_ids[:len(inputs)]) outputs = parallel_apply(replicas, func_name, inputs, kwargs, device_ids[:len(replicas)]) return gather(outputs, output_device) + return wrapper @@ -99,4 +104,4 @@ def _model_contains_inner_module(model): if isinstance(model, nn.Module): if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)): return True - return False \ No newline at end of file + return False diff --git a/fastNLP/core/const.py b/fastNLP/core/const.py index 27e8d1cb..ad5d1f1e 100644 --- a/fastNLP/core/const.py +++ b/fastNLP/core/const.py @@ -1,3 +1,13 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "Const" +] + + class Const: """ fastNLP中field命名常量。 @@ -25,47 +35,47 @@ class Const: LOSS = 'loss' RAW_WORD = 'raw_words' RAW_CHAR = 'raw_chars' - + @staticmethod def INPUTS(i): """得到第 i 个 ``INPUT`` 的命名""" i = int(i) + 1 return Const.INPUT + str(i) - + @staticmethod def CHAR_INPUTS(i): """得到第 i 个 ``CHAR_INPUT`` 的命名""" i = int(i) + 1 return Const.CHAR_INPUT + str(i) - + @staticmethod def RAW_WORDS(i): i = int(i) + 1 return Const.RAW_WORD + str(i) - + @staticmethod def RAW_CHARS(i): i = int(i) + 1 return Const.RAW_CHAR + str(i) - + @staticmethod def INPUT_LENS(i): """得到第 i 个 ``INPUT_LEN`` 的命名""" i = int(i) + 1 return Const.INPUT_LEN + str(i) - + @staticmethod def OUTPUTS(i): """得到第 i 个 ``OUTPUT`` 的命名""" i = int(i) + 1 return Const.OUTPUT + str(i) - + @staticmethod def TARGETS(i): """得到第 i 个 ``TARGET`` 的命名""" i = int(i) + 1 return Const.TARGET + str(i) - + @staticmethod def LOSSES(i): """得到第 i 个 ``LOSS`` 的命名""" diff --git a/fastNLP/core/dist_trainer.py b/fastNLP/core/dist_trainer.py index 7c64fee4..3a293447 100644 --- a/fastNLP/core/dist_trainer.py +++ b/fastNLP/core/dist_trainer.py @@ -1,29 +1,29 @@ -""" +"""undocumented 正在开发中的分布式训练代码 """ +import logging +import os +import time +from datetime import datetime + import torch import torch.cuda -import torch.optim import torch.distributed as dist -from torch.utils.data.distributed import DistributedSampler +import torch.optim +from pkg_resources import parse_version from torch.nn.parallel import DistributedDataParallel as DDP -import os +from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm -import time -from datetime import datetime, timedelta -from functools import partial +from ._logger import logger from .batch import DataSetIter, BatchIter from .callback import DistCallbackManager, CallbackException, TesterCallback from .dataset import DataSet from .losses import _prepare_losser from .optimizer import Optimizer from .utils import _build_args -from .utils import _move_dict_value_to_device from .utils import _get_func_signature -from ._logger import logger -import logging -from pkg_resources import parse_version +from .utils import _move_dict_value_to_device __all__ = [ 'get_local_rank', diff --git a/fastNLP/core/field.py b/fastNLP/core/field.py index b3f024f8..05f987c2 100644 --- a/fastNLP/core/field.py +++ b/fastNLP/core/field.py @@ -1,18 +1,25 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ "Padder", "AutoPadder", "EngChar2DPadder", ] -from numbers import Number -import torch -import numpy as np -from typing import Any from abc import abstractmethod -from copy import deepcopy from collections import Counter -from .utils import _is_iterable +from copy import deepcopy +from numbers import Number +from typing import Any + +import numpy as np +import torch + from ._logger import logger +from .utils import _is_iterable class SetInputOrTargetException(Exception): diff --git a/fastNLP/core/predictor.py b/fastNLP/core/predictor.py index 2d6a7380..c6b8fc90 100644 --- a/fastNLP/core/predictor.py +++ b/fastNLP/core/predictor.py @@ -1,13 +1,15 @@ -""" - ..todo:: - 检查这个类是否需要 -""" +"""undocumented""" + +__all__ = [ + "Predictor" +] + from collections import defaultdict import torch -from . import DataSetIter from . import DataSet +from . import DataSetIter from . import SequentialSampler from .utils import _build_args, _move_dict_value_to_device, _get_model_device @@ -21,7 +23,7 @@ class Predictor(object): :param torch.nn.Module network: 用来完成预测任务的模型 """ - + def __init__(self, network): if not isinstance(network, torch.nn.Module): raise ValueError( @@ -29,7 +31,7 @@ class Predictor(object): self.network = network self.batch_size = 1 self.batch_output = [] - + def predict(self, data: DataSet, seq_len_field_name=None): """用已经训练好的模型进行inference. @@ -41,27 +43,27 @@ class Predictor(object): raise ValueError("Only Dataset class is allowed, not {}.".format(type(data))) if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays: raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data)) - + prev_training = self.network.training self.network.eval() network_device = _get_model_device(self.network) batch_output = defaultdict(list) data_iterator = DataSetIter(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) - + if hasattr(self.network, "predict"): predict_func = self.network.predict else: predict_func = self.network.forward - + with torch.no_grad(): for batch_x, _ in data_iterator: _move_dict_value_to_device(batch_x, _, device=network_device) refined_batch_x = _build_args(predict_func, **batch_x) prediction = predict_func(**refined_batch_x) - + if seq_len_field_name is not None: seq_lens = batch_x[seq_len_field_name].tolist() - + for key, value in prediction.items(): value = value.cpu().numpy() if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1): @@ -74,6 +76,6 @@ class Predictor(object): batch_output[key].extend(tmp_batch) else: batch_output[key].append(value) - + self.network.train(prev_training) return batch_output diff --git a/fastNLP/core/vocabulary.py b/fastNLP/core/vocabulary.py index 92f54f9a..52d33a5a 100644 --- a/fastNLP/core/vocabulary.py +++ b/fastNLP/core/vocabulary.py @@ -1,16 +1,22 @@ +""" +.. todo:: + doc +""" + __all__ = [ "Vocabulary", "VocabularyOption", ] -from functools import wraps from collections import Counter +from functools import partial +from functools import wraps + +from ._logger import logger from .dataset import DataSet from .utils import Option -from functools import partial -import numpy as np from .utils import _is_iterable -from ._logger import logger + class VocabularyOption(Option): def __init__(self, @@ -51,7 +57,7 @@ def _check_build_status(func): self.rebuild = True if self.max_size is not None and len(self.word_count) >= self.max_size: logger.info("[Warning] Vocabulary has reached the max size {} when calling {} method. 
" - "Adding more words may cause unexpected behaviour of Vocabulary. ".format( + "Adding more words may cause unexpected behaviour of Vocabulary. ".format( self.max_size, func.__name__)) return func(self, *args, **kwargs) @@ -199,7 +205,7 @@ class Vocabulary(object): self.build_reverse_vocab() self.rebuild = False return self - + def build_reverse_vocab(self): """ 基于 `word to index` dict, 构建 `index to word` dict. @@ -279,19 +285,19 @@ class Vocabulary(object): if not isinstance(field[0][0], str) and _is_iterable(field[0][0]): raise RuntimeError("Only support field with 2 dimensions.") return [[self.to_index(c) for c in w] for w in field] - + new_field_name = new_field_name or field_name - + if type(new_field_name) == type(field_name): if isinstance(new_field_name, list): assert len(new_field_name) == len(field_name), "new_field_name should have same number elements with " \ - "field_name." + "field_name." elif isinstance(new_field_name, str): field_name = [field_name] new_field_name = [new_field_name] else: raise TypeError("field_name and new_field_name can only be str or List[str].") - + for idx, dataset in enumerate(datasets): if isinstance(dataset, DataSet): try: @@ -377,7 +383,7 @@ class Vocabulary(object): :return: bool """ return word in self._no_create_word - + def to_index(self, w): """ 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出``ValueError``:: diff --git a/fastNLP/embeddings/contextual_embedding.py b/fastNLP/embeddings/contextual_embedding.py index 2c304da7..9910a44b 100644 --- a/fastNLP/embeddings/contextual_embedding.py +++ b/fastNLP/embeddings/contextual_embedding.py @@ -8,15 +8,17 @@ __all__ = [ ] from abc import abstractmethod + import torch -from ..core.vocabulary import Vocabulary -from ..core.dataset import DataSet +from .embedding import TokenEmbedding +from ..core import logger from ..core.batch import DataSetIter +from ..core.dataset import DataSet from ..core.sampler import SequentialSampler from ..core.utils import _move_model_to_device, _get_model_device -from .embedding import TokenEmbedding -from ..core import logger +from ..core.vocabulary import Vocabulary + class ContextualEmbedding(TokenEmbedding): def __init__(self, vocab: Vocabulary, word_dropout: float = 0.0, dropout: float = 0.0): From 0d5f43b451473fe25703cb1f9798fcf03eb64c76 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:25:01 +0800 Subject: [PATCH 13/19] add __all__ and __doc__ for all files in module 'io', using 'undocumented' tags --- fastNLP/io/data_bundle.py | 7 +- fastNLP/io/dataset_loader.py | 6 +- fastNLP/io/embed_loader.py | 9 +- fastNLP/io/file_reader.py | 16 ++- fastNLP/io/file_utils.py | 23 +++- fastNLP/io/loader/classification.py | 26 +++-- fastNLP/io/loader/conll.py | 84 +++++++++------ fastNLP/io/loader/csv.py | 10 +- fastNLP/io/loader/cws.py | 17 ++- fastNLP/io/loader/json.py | 10 +- fastNLP/io/loader/loader.py | 13 ++- fastNLP/io/loader/matching.py | 82 ++++++++------ fastNLP/io/pipe/classification.py | 161 +++++++++++++++------------- fastNLP/io/pipe/conll.py | 79 ++++++++------ fastNLP/io/pipe/cws.py | 6 ++ fastNLP/io/pipe/matching.py | 75 ++++++++----- fastNLP/io/pipe/pipe.py | 6 ++ fastNLP/io/pipe/utils.py | 38 ++++--- fastNLP/io/utils.py | 25 +++-- 19 files changed, 439 insertions(+), 254 deletions(-) diff --git a/fastNLP/io/data_bundle.py b/fastNLP/io/data_bundle.py index 1e663f1e..db60a86f 100644 --- a/fastNLP/io/data_bundle.py +++ b/fastNLP/io/data_bundle.py @@ -1,10 +1,15 @@ +""" +.. 
todo:: + doc +""" __all__ = [ 'DataBundle', ] import _pickle as pickle -from typing import Union, Dict import os +from typing import Union, Dict + from ..core.dataset import DataSet from ..core.vocabulary import Vocabulary diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 82e96597..fca0de69 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -1,4 +1,4 @@ -""" +"""undocumented .. warning:: 本模块将在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 @@ -23,10 +23,10 @@ __all__ = [ ] +from .data_bundle import DataSetLoader +from .file_reader import _read_csv, _read_json from ..core.dataset import DataSet from ..core.instance import Instance -from .file_reader import _read_csv, _read_json -from .data_bundle import DataSetLoader class JsonLoader(DataSetLoader): diff --git a/fastNLP/io/embed_loader.py b/fastNLP/io/embed_loader.py index c58385e1..780d91e4 100644 --- a/fastNLP/io/embed_loader.py +++ b/fastNLP/io/embed_loader.py @@ -1,17 +1,22 @@ +""" +.. todo:: + doc +""" __all__ = [ "EmbedLoader", "EmbeddingOption", ] +import logging import os import warnings import numpy as np -from ..core.vocabulary import Vocabulary from .data_bundle import BaseLoader from ..core.utils import Option -import logging +from ..core.vocabulary import Vocabulary + class EmbeddingOption(Option): def __init__(self, diff --git a/fastNLP/io/file_reader.py b/fastNLP/io/file_reader.py index 0320572c..7a953098 100644 --- a/fastNLP/io/file_reader.py +++ b/fastNLP/io/file_reader.py @@ -1,7 +1,11 @@ -""" +"""undocumented 此模块用于给其它模块提供读取文件的函数,没有为用户提供 API """ + +__all__ = [] + import json + from ..core import logger @@ -24,8 +28,8 @@ def _read_csv(path, encoding='utf-8', headers=None, sep=',', dropna=True): headers = headers.split(sep) start_idx += 1 elif not isinstance(headers, (list, tuple)): - raise TypeError("headers should be list or tuple, not {}." \ - .format(type(headers))) + raise TypeError("headers should be list or tuple, not {}." \ + .format(type(headers))) for line_idx, line in enumerate(f, start_idx): contents = line.rstrip('\r\n').split(sep) if len(contents) != len(headers): @@ -82,6 +86,7 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): :if False, raise ValueError when reading invalid data. default: True :return: generator, every time yield (line number, conll item) """ + def parse_conll(sample): sample = list(map(list, zip(*sample))) sample = [sample[i] for i in indexes] @@ -89,14 +94,15 @@ def _read_conll(path, encoding='utf-8', indexes=None, dropna=True): if len(f) <= 0: raise ValueError('empty field') return sample + with open(path, 'r', encoding=encoding) as f: sample = [] start = next(f).strip() - if start!='': + if start != '': sample.append(start.split()) for line_idx, line in enumerate(f, 1): line = line.strip() - if line=='': + if line == '': if len(sample): try: res = parse_conll(sample) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index bd02158e..8ecdff25 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -1,12 +1,27 @@ +""" +.. 
todo:: + doc +""" + +__all__ = [ + "cached_path", + "get_filepath", + "get_cache_path", + "split_filename_suffix", + "get_from_cache", +] + import os +import re +import shutil +import tempfile from pathlib import Path from urllib.parse import urlparse -import re + import requests -import tempfile -from tqdm import tqdm -import shutil from requests import HTTPError +from tqdm import tqdm + from ..core import logger PRETRAINED_BERT_MODEL_DIR = { diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index f64a26e7..ec00d2b4 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -1,12 +1,24 @@ -from ...core.dataset import DataSet -from ...core.instance import Instance -from .loader import Loader -import warnings +"""undocumented""" + +__all__ = [ + "YelpLoader", + "YelpFullLoader", + "YelpPolarityLoader", + "IMDBLoader", + "SSTLoader", + "SST2Loader", +] + +import glob import os import random import shutil -import glob import time +import warnings + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance import Instance class YelpLoader(Loader): @@ -58,7 +70,7 @@ class YelpLoader(Loader): class YelpFullLoader(YelpLoader): - def download(self, dev_ratio: float = 0.1, re_download:bool=False): + def download(self, dev_ratio: float = 0.1, re_download: bool = False): """ 自动下载数据集,如果你使用了这个数据集,请引用以下的文章 @@ -127,7 +139,7 @@ class YelpPolarityLoader(YelpLoader): if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.csv')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." 
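The classification loaders above all follow the same two-step pattern: download() fetches the raw files into the fastNLP cache (for the Yelp loaders it can also carve a dev split out of train.csv via dev_ratio), and load() parses them into a DataBundle. A minimal usage sketch of that pattern, assuming download() returns the cache directory as the NER loaders later in this patch do; the dataset names printed depend on which splits exist on disk:

    from fastNLP.io.loader.classification import YelpFullLoader

    loader = YelpFullLoader()
    data_dir = loader.download(dev_ratio=0.1)   # cached copy is reused unless re_download=True
    data_bundle = loader.load(data_dir)         # DataBundle holding the train/dev/test DataSets
    for name, dataset in data_bundle.datasets.items():
        print(name, len(dataset))

The DataBundle produced here is the raw input that the pipes later in this patch (e.g. YelpFullPipe.process_from_file) tokenize and index before training.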
diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index b5241cff..1bd1b448 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -1,15 +1,28 @@ -from typing import Dict, Union +"""undocumented""" + +__all__ = [ + "ConllLoader", + "Conll2003Loader", + "Conll2003NERLoader", + "OntoNotesNERLoader", + "CTBLoader", + "CNNERLoader", + "MsraNERLoader", + "WeiboNERLoader", + "PeopleDailyNERLoader" +] -from .loader import Loader -from ...core.dataset import DataSet -from ..file_reader import _read_conll -from ...core.instance import Instance -from ...core.const import Const import glob import os +import random import shutil import time -import random + +from .loader import Loader +from ..file_reader import _read_conll +from ...core.const import Const +from ...core.dataset import DataSet +from ...core.instance import Instance class ConllLoader(Loader): @@ -47,6 +60,7 @@ class ConllLoader(Loader): :param bool dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``True`` """ + def __init__(self, headers, indexes=None, dropna=True): super(ConllLoader, self).__init__() if not isinstance(headers, (list, tuple)): @@ -60,7 +74,7 @@ class ConllLoader(Loader): if len(indexes) != len(headers): raise ValueError self.indexes = indexes - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -101,12 +115,13 @@ class Conll2003Loader(ConllLoader): "[...]", "[...]", "[...]", "[...]" """ + def __init__(self): headers = [ 'raw_words', 'pos', 'chunk', 'ner', ] super(Conll2003Loader, self).__init__(headers=headers) - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -127,7 +142,7 @@ class Conll2003Loader(ConllLoader): ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds - + def download(self, output_dir=None): raise RuntimeError("conll2003 cannot be downloaded automatically.") @@ -158,12 +173,13 @@ class Conll2003NERLoader(ConllLoader): "[...]", "[...]" """ + def __init__(self): headers = [ 'raw_words', 'target', ] super().__init__(headers=headers, indexes=[0, 3]) - + def _load(self, path): """ 传入的一个文件路径,将该文件读入DataSet中,field由ConllLoader初始化时指定的headers决定。 @@ -184,7 +200,7 @@ class Conll2003NERLoader(ConllLoader): ins = {h: data[i] for i, h in enumerate(self.headers)} ds.append(Instance(**ins)) return ds - + def download(self): raise RuntimeError("conll2003 cannot be downloaded automatically.") @@ -204,13 +220,13 @@ class OntoNotesNERLoader(ConllLoader): "[...]", "[...]" """ - + def __init__(self): super().__init__(headers=[Const.RAW_WORD, Const.TARGET], indexes=[3, 10]) - - def _load(self, path:str): + + def _load(self, path: str): dataset = super()._load(path) - + def convert_to_bio(tags): bio_tags = [] flag = None @@ -227,7 +243,7 @@ class OntoNotesNERLoader(ConllLoader): flag = None bio_tags.append(bio_label) return bio_tags - + def convert_word(words): converted_words = [] for word in words: @@ -236,7 +252,7 @@ class OntoNotesNERLoader(ConllLoader): converted_words.append(word) continue # 以下是由于这些符号被转义了,再转回来 - tfrs = {'-LRB-':'(', + tfrs = {'-LRB-': '(', '-RRB-': ')', '-LSB-': '[', '-RSB-': ']', @@ -248,12 +264,12 @@ class OntoNotesNERLoader(ConllLoader): else: converted_words.append(word) return converted_words - + dataset.apply_field(convert_word, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) dataset.apply_field(convert_to_bio, field_name=Const.TARGET, new_field_name=Const.TARGET) - + return dataset - + def download(self): raise 
RuntimeError("Ontonotes cannot be downloaded automatically, you can refer " "https://github.com/yhcc/OntoNotes-5.0-NER to download and preprocess.") @@ -262,13 +278,13 @@ class OntoNotesNERLoader(ConllLoader): class CTBLoader(Loader): def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): pass class CNNERLoader(Loader): - def _load(self, path:str): + def _load(self, path: str): """ 支持加载形如以下格式的内容,一行两列,以空格隔开两个sample @@ -331,10 +347,11 @@ class MsraNERLoader(CNNERLoader): "[...]", "[...]" """ + def __init__(self): super().__init__() - - def download(self, dev_ratio:float=0.1, re_download:bool=False)->str: + + def download(self, dev_ratio: float = 0.1, re_download: bool = False) -> str: """ 自动下载MSAR-NER的数据,如果你使用该数据,请引用 Gina-Anne Levow, 2006, The Third International Chinese Language Processing Bakeoff: Word Segmentation and Named Entity Recognition. @@ -356,7 +373,7 @@ class MsraNERLoader(CNNERLoader): if time.time() - modify_time > 1 and re_download: # 通过这种比较丑陋的方式判断一下文件是否是才下载的 shutil.rmtree(data_dir) data_dir = self._get_dataset_path(dataset_name=dataset_name) - + if not os.path.exists(os.path.join(data_dir, 'dev.conll')): if dev_ratio > 0: assert 0 < dev_ratio < 1, "dev_ratio should be in range (0,1)." @@ -380,15 +397,15 @@ class MsraNERLoader(CNNERLoader): finally: if os.path.exists(os.path.join(data_dir, 'middle_file.conll')): os.remove(os.path.join(data_dir, 'middle_file.conll')) - + return data_dir class WeiboNERLoader(CNNERLoader): def __init__(self): super().__init__() - - def download(self)->str: + + def download(self) -> str: """ 自动下载Weibo-NER的数据,如果你使用了该数据,请引用 Nanyun Peng and Mark Dredze, 2015, Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings. @@ -397,7 +414,7 @@ class WeiboNERLoader(CNNERLoader): """ dataset_name = 'weibo-ner' data_dir = self._get_dataset_path(dataset_name=dataset_name) - + return data_dir @@ -427,11 +444,12 @@ class PeopleDailyNERLoader(CNNERLoader): "[...]", "[...]" """ + def __init__(self): super().__init__() - + def download(self) -> str: dataset_name = 'peopledaily' data_dir = self._get_dataset_path(dataset_name=dataset_name) - + return data_dir diff --git a/fastNLP/io/loader/csv.py b/fastNLP/io/loader/csv.py index 5195cc8e..0d6e35fa 100644 --- a/fastNLP/io/loader/csv.py +++ b/fastNLP/io/loader/csv.py @@ -1,7 +1,13 @@ +"""undocumented""" + +__all__ = [ + "CSVLoader", +] + +from .loader import Loader +from ..file_reader import _read_csv from ...core.dataset import DataSet from ...core.instance import Instance -from ..file_reader import _read_csv -from .loader import Loader class CSVLoader(Loader): diff --git a/fastNLP/io/loader/cws.py b/fastNLP/io/loader/cws.py index fab7639c..2fbb1091 100644 --- a/fastNLP/io/loader/cws.py +++ b/fastNLP/io/loader/cws.py @@ -1,11 +1,18 @@ -from .loader import Loader -from ...core.dataset import DataSet -from ...core.instance import Instance +"""undocumented""" + +__all__ = [ + "CWSLoader" +] + import glob import os -import time -import shutil import random +import shutil +import time + +from .loader import Loader +from ...core.dataset import DataSet +from ...core.instance import Instance class CWSLoader(Loader): diff --git a/fastNLP/io/loader/json.py b/fastNLP/io/loader/json.py index 8856b73a..012dee5a 100644 --- a/fastNLP/io/loader/json.py +++ b/fastNLP/io/loader/json.py @@ -1,7 +1,13 @@ +"""undocumented""" + +__all__ = [ + "JsonLoader" +] + +from .loader import Loader +from ..file_reader import _read_json from ...core.dataset import DataSet from 
...core.instance import Instance -from ..file_reader import _read_json -from .loader import Loader class JsonLoader(Loader): diff --git a/fastNLP/io/loader/loader.py b/fastNLP/io/loader/loader.py index e7b419ac..22636a27 100644 --- a/fastNLP/io/loader/loader.py +++ b/fastNLP/io/loader/loader.py @@ -1,8 +1,15 @@ -from ...core.dataset import DataSet -from .. import DataBundle -from ..utils import check_loader_paths +"""undocumented""" + +__all__ = [ + "Loader" +] + from typing import Union, Dict + +from .. import DataBundle from ..file_utils import _get_dataset_url, get_cache_path, cached_path +from ..utils import check_loader_paths +from ...core.dataset import DataSet class Loader: diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index 26455914..7f03ca3e 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -1,10 +1,21 @@ +"""undocumented""" + +__all__ = [ + "MNLILoader", + "SNLILoader", + "QNLILoader", + "RTELoader", + "QuoraLoader", +] + +import os import warnings -from .loader import Loader +from typing import Union, Dict + from .json import JsonLoader -from ...core.const import Const +from .loader import Loader from .. import DataBundle -import os -from typing import Union, Dict +from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance @@ -22,10 +33,11 @@ class MNLILoader(Loader): "...", "...","." """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header @@ -50,8 +62,8 @@ class MNLILoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - - def load(self, paths:str=None): + + def load(self, paths: str = None): """ :param str paths: 传入数据所在目录,会在该目录下寻找dev_matched.tsv, dev_mismatched.tsv, test_matched.tsv, @@ -64,13 +76,13 @@ class MNLILoader(Loader): paths = self.download() if not os.path.isdir(paths): raise NotADirectoryError(f"{paths} is not a valid directory.") - - files = {'dev_matched':"dev_matched.tsv", - "dev_mismatched":"dev_mismatched.tsv", - "test_matched":"test_matched.tsv", - "test_mismatched":"test_mismatched.tsv", - "train":'train.tsv'} - + + files = {'dev_matched': "dev_matched.tsv", + "dev_mismatched": "dev_mismatched.tsv", + "test_matched": "test_matched.tsv", + "test_mismatched": "test_mismatched.tsv", + "train": 'train.tsv'} + datasets = {} for name, filename in files.items(): filepath = os.path.join(paths, filename) @@ -78,11 +90,11 @@ class MNLILoader(Loader): if 'test' not in name: raise FileNotFoundError(f"{name} not found in directory {filepath}.") datasets[name] = self._load(filepath) - + data_bundle = DataBundle(datasets=datasets) - + return data_bundle - + def download(self): """ 如果你使用了这个数据,请引用 @@ -106,14 +118,15 @@ class SNLILoader(JsonLoader): "...", "...", "." 
""" + def __init__(self): super().__init__(fields={ 'sentence1': Const.RAW_WORDS(0), 'sentence2': Const.RAW_WORDS(1), 'gold_label': Const.TARGET, }) - - def load(self, paths: Union[str, Dict[str, str]]=None) -> DataBundle: + + def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle: """ 从指定一个或多个路径中的文件中读取数据,返回:class:`~fastNLP.io.DataBundle` 。 @@ -138,11 +151,11 @@ class SNLILoader(JsonLoader): paths = _paths else: raise NotADirectoryError(f"{paths} is not a valid directory.") - + datasets = {name: self._load(path) for name, path in paths.items()} data_bundle = DataBundle(datasets=datasets) return data_bundle - + def download(self): """ 如果您的文章使用了这份数据,请引用 @@ -169,12 +182,13 @@ class QNLILoader(JsonLoader): test数据集没有target列 """ + def __init__(self): super().__init__() - + def _load(self, path): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header if path.endswith("test.tsv"): @@ -198,7 +212,7 @@ class QNLILoader(JsonLoader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): """ 如果您的实验使用到了该数据,请引用 @@ -225,12 +239,13 @@ class RTELoader(Loader): test数据集没有target列 """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: f.readline() # 跳过header if path.endswith("test.tsv"): @@ -254,7 +269,7 @@ class RTELoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): return self._get_dataset_path('rte') @@ -281,12 +296,13 @@ class QuoraLoader(Loader): "...","." """ + def __init__(self): super().__init__() - - def _load(self, path:str): + + def _load(self, path: str): ds = DataSet() - + with open(path, 'r', encoding='utf-8') as f: for line in f: line = line.strip() @@ -298,6 +314,6 @@ class QuoraLoader(Loader): if raw_words1 and raw_words2 and target: ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target)) return ds - + def download(self): raise RuntimeError("Quora cannot be downloaded automatically.") diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index f42d5400..30c591a4 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -1,26 +1,39 @@ +"""undocumented""" + +__all__ = [ + "YelpFullPipe", + "YelpPolarityPipe", + "SSTPipe", + "SST2Pipe", + 'IMDBPipe' +] + +import re + from nltk import Tree +from .pipe import Pipe +from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance from ..data_bundle import DataBundle -from ...core.vocabulary import Vocabulary -from ...core.const import Const from ..loader.classification import IMDBLoader, YelpFullLoader, SSTLoader, SST2Loader, YelpPolarityLoader +from ...core.const import Const from ...core.dataset import DataSet from ...core.instance import Instance +from ...core.vocabulary import Vocabulary -from .utils import get_tokenizer, _indexize, _add_words_field, _drop_empty_instance -from .pipe import Pipe -import re nonalpnum = re.compile('[^0-9a-zA-Z?!\']+') + class _CLSPipe(Pipe): """ 分类问题的基类,负责对classification的数据进行tokenize操作。默认是对raw_words列操作,然后生成words列 """ - def __init__(self, tokenizer:str='spacy', lang='en'): + + def __init__(self, tokenizer: str = 'spacy', lang='en'): self.tokenizer = get_tokenizer(tokenizer, lang=lang) - + def _tokenize(self, data_bundle, 
field_name=Const.INPUT, new_field_name=None): """ 将DataBundle中的数据进行tokenize @@ -33,9 +46,9 @@ class _CLSPipe(Pipe): new_field_name = new_field_name or field_name for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) - + return data_bundle - + def _granularize(self, data_bundle, tag_map): """ 该函数对data_bundle中'target'列中的内容进行转换。 @@ -47,9 +60,9 @@ class _CLSPipe(Pipe): """ for name in list(data_bundle.datasets.keys()): dataset = data_bundle.get_dataset(name) - dataset.apply_field(lambda target:tag_map.get(target, -100), field_name=Const.TARGET, + dataset.apply_field(lambda target: tag_map.get(target, -100), field_name=Const.TARGET, new_field_name=Const.TARGET) - dataset.drop(lambda ins:ins[Const.TARGET] == -100) + dataset.drop(lambda ins: ins[Const.TARGET] == -100) data_bundle.set_dataset(dataset, name) return data_bundle @@ -69,7 +82,7 @@ def _clean_str(words): t = ''.join(tt) if t != '': words_collection.append(t) - + return words_collection @@ -89,19 +102,20 @@ class YelpFullPipe(_CLSPipe): 1、2归为1类,3归为1类,4、5归为1类;若为5, 则有5分类问题。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - def __init__(self, lower:bool=False, granularity=5, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, granularity=5, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower assert granularity in (2, 3, 5), "granularity can only be 2,3,5." self.granularity = granularity - - if granularity==2: + + if granularity == 2: self.tag_map = {"1": 0, "2": 0, "4": 1, "5": 1} - elif granularity==3: - self.tag_map = {"1": 0, "2": 0, "3":1, "4": 2, "5": 2} + elif granularity == 3: + self.tag_map = {"1": 0, "2": 0, "3": 1, "4": 2, "5": 2} else: self.tag_map = {"1": 0, "2": 1, "3": 2, "4": 3, "5": 4} - + def _tokenize(self, data_bundle, field_name=Const.INPUT, new_field_name=None): """ 将DataBundle中的数据进行tokenize @@ -116,7 +130,7 @@ class YelpFullPipe(_CLSPipe): dataset.apply_field(self.tokenizer, field_name=field_name, new_field_name=new_field_name) dataset.apply_field(_clean_str, field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): """ 传入的DataSet应该具备如下的结构 @@ -131,30 +145,30 @@ class YelpFullPipe(_CLSPipe): :param data_bundle: :return: """ - + # 复制一列words data_bundle = _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) - + # 根据granularity设置tag data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) - + # 删除空行 data_bundle = _drop_empty_instance(data_bundle, field_name=Const.INPUT) - + # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -179,27 +193,28 @@ class YelpPolarityPipe(_CLSPipe): :param bool lower: 是否对输入进行小写化。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - def __init__(self, lower:bool=False, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - + def process(self, data_bundle): # 复制一列words data_bundle = _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = 
self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -230,7 +245,7 @@ class SSTPipe(_CLSPipe): 0、1归为1类,2归为1类,3、4归为1类;若为5, 则有5分类问题。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ - + def __init__(self, subtree=False, train_subtree=True, lower=False, granularity=5, tokenizer='spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.subtree = subtree @@ -238,15 +253,15 @@ class SSTPipe(_CLSPipe): self.lower = lower assert granularity in (2, 3, 5), "granularity can only be 2,3,5." self.granularity = granularity - - if granularity==2: + + if granularity == 2: self.tag_map = {"0": 0, "1": 0, "3": 1, "4": 1} - elif granularity==3: - self.tag_map = {"0": 0, "1": 0, "2":1, "3": 2, "4": 2} + elif granularity == 3: + self.tag_map = {"0": 0, "1": 0, "2": 1, "3": 2, "4": 2} else: self.tag_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4} - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 @@ -277,26 +292,26 @@ class SSTPipe(_CLSPipe): instance = Instance(raw_words=' '.join(tree.leaves()), target=tree.label()) ds.append(instance) data_bundle.set_dataset(ds, name) - + _add_words_field(data_bundle, lower=self.lower) - + # 进行tokenize data_bundle = self._tokenize(data_bundle=data_bundle, field_name=Const.INPUT) - + # 根据granularity设置tag data_bundle = self._granularize(data_bundle, tag_map=self.tag_map) - + # index data_bundle = _indexize(data_bundle=data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): data_bundle = SSTLoader().load(paths) return self.process(data_bundle=data_bundle) @@ -316,11 +331,12 @@ class SST2Pipe(_CLSPipe): :param bool lower: 是否对输入进行小写化。 :param str tokenizer: 使用哪种tokenize方式将数据切成单词。支持'spacy'和'raw'。raw使用空格作为切分。 """ + def __init__(self, lower=False, tokenizer='spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 可以处理的DataSet应该具备如下的结构 @@ -335,15 +351,15 @@ class SST2Pipe(_CLSPipe): :return: """ _add_words_field(data_bundle, self.lower) - + data_bundle = self._tokenize(data_bundle=data_bundle) - + src_vocab = Vocabulary() src_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.INPUT, - no_create_entry_dataset=[dataset for name,dataset in data_bundle.datasets.items() if + no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if name != 'train']) src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) - + tgt_vocab = Vocabulary(unknown=None, padding=None) tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) datasets = [] @@ -351,18 +367,18 @@ class SST2Pipe(_CLSPipe): if dataset.has_field(Const.TARGET): datasets.append(dataset) tgt_vocab.index_dataset(*datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(src_vocab, Const.INPUT) data_bundle.set_vocab(tgt_vocab, Const.TARGET) - + for name, dataset in 
data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(Const.INPUT, Const.INPUT_LEN) data_bundle.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -390,11 +406,12 @@ class IMDBPipe(_CLSPipe): :param bool lower: 是否将words列的数据小写。 :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 """ - def __init__(self, lower:bool=False, tokenizer:str='spacy'): + + def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): super().__init__(tokenizer=tokenizer, lang='en') self.lower = lower - - def process(self, data_bundle:DataBundle): + + def process(self, data_bundle: DataBundle): """ 期待的DataBunlde中输入的DataSet应该类似于如下,有两个field,raw_words和target,且均为str类型 @@ -409,25 +426,26 @@ class IMDBPipe(_CLSPipe): target列应该为str。 :return: DataBundle """ + # 替换
def replace_br(raw_words): raw_words = raw_words.replace("<br />
", ' ') return raw_words - + for name, dataset in data_bundle.datasets.items(): dataset.apply_field(replace_br, field_name=Const.RAW_WORD, new_field_name=Const.RAW_WORD) - + _add_words_field(data_bundle, lower=self.lower) self._tokenize(data_bundle, field_name=Const.INPUT, new_field_name=Const.INPUT) _indexize(data_bundle) - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) dataset.set_input(Const.INPUT, Const.INPUT_LEN) dataset.set_target(Const.TARGET) - + return data_bundle - + def process_from_file(self, paths=None): """ @@ -437,8 +455,5 @@ class IMDBPipe(_CLSPipe): # 读取数据 data_bundle = IMDBLoader().load(paths) data_bundle = self.process(data_bundle) - + return data_bundle - - - diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 617d1236..2efec8e0 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -1,13 +1,25 @@ +"""undocumented""" + +__all__ = [ + "Conll2003NERPipe", + "Conll2003Pipe", + "OntoNotesNERPipe", + "MsraNERPipe", + "PeopleDailyPipe", + "WeiboNERPipe" +] + from .pipe import Pipe -from .. import DataBundle +from .utils import _add_chars_field +from .utils import _indexize, _add_words_field from .utils import iob2, iob2bioes -from ...core.const import Const +from .. import DataBundle from ..loader.conll import Conll2003NERLoader, OntoNotesNERLoader -from .utils import _indexize, _add_words_field -from .utils import _add_chars_field from ..loader.conll import PeopleDailyNERLoader, WeiboNERLoader, MsraNERLoader, ConllLoader +from ...core.const import Const from ...core.vocabulary import Vocabulary + class _NERPipe(Pipe): """ NER任务的处理Pipe, 该Pipe会(1)复制raw_words列,并命名为words; (2)在words, target列建立词表 @@ -20,14 +32,14 @@ class _NERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def __init__(self, encoding_type: str = 'bio', lower: bool = False): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) self.lower = lower - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -46,21 +58,21 @@ class _NERPipe(Pipe): # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) - + _add_words_field(data_bundle, lower=self.lower) - + # index _indexize(data_bundle) - + input_fields = [Const.TARGET, Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle @@ -84,7 +96,7 @@ class Conll2003NERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def process_from_file(self, paths) -> DataBundle: """ @@ -94,7 +106,7 @@ class Conll2003NERPipe(_NERPipe): # 读取数据 data_bundle = Conll2003NERLoader().load(paths) data_bundle = self.process(data_bundle) - + return data_bundle @@ -125,8 +137,8 @@ class Conll2003Pipe(Pipe): else: self.ner_convert_tag = lambda tags: iob2bioes(iob2(tags)) self.lower = lower - - def process(self, data_bundle)->DataBundle: + + def process(self, data_bundle) -> DataBundle: """ 输入的DataSet应该类似于如下的形式 @@ -145,9 +157,9 @@ class Conll2003Pipe(Pipe): dataset.drop(lambda x: "-DOCSTART-" in x[Const.RAW_WORD]) 
dataset.apply_field(self.chunk_convert_tag, field_name='chunk', new_field_name='chunk') dataset.apply_field(self.ner_convert_tag, field_name='ner', new_field_name='ner') - + _add_words_field(data_bundle, lower=self.lower) - + # index _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=['pos', 'ner']) # chunk中存在一些tag只在dev中出现,没在train中 @@ -155,18 +167,18 @@ class Conll2003Pipe(Pipe): tgt_vocab.from_dataset(*data_bundle.datasets.values(), field_name='chunk') tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name='chunk') data_bundle.set_vocab(tgt_vocab, 'chunk') - + input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = ['pos', 'ner', 'chunk', Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle - + def process_from_file(self, paths): """ @@ -194,7 +206,7 @@ class OntoNotesNERPipe(_NERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 :param bool lower: 是否将words小写化后再建立词表,绝大多数情况都不需要设置为True。 """ - + def process_from_file(self, paths): data_bundle = OntoNotesNERLoader().load(paths) return self.process(data_bundle) @@ -211,13 +223,13 @@ class _CNNERPipe(Pipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 """ - + def __init__(self, encoding_type: str = 'bio'): if encoding_type == 'bio': self.convert_tag = iob2 else: self.convert_tag = lambda words: iob2bioes(iob2(words)) - + def process(self, data_bundle: DataBundle) -> DataBundle: """ 支持的DataSet的field为 @@ -239,21 +251,21 @@ class _CNNERPipe(Pipe): # 转换tag for name, dataset in data_bundle.datasets.items(): dataset.apply_field(self.convert_tag, field_name=Const.TARGET, new_field_name=Const.TARGET) - + _add_chars_field(data_bundle, lower=False) - + # index _indexize(data_bundle, input_field_names=Const.CHAR_INPUT, target_field_names=Const.TARGET) - + input_fields = [Const.TARGET, Const.CHAR_INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET, Const.INPUT_LEN] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.CHAR_INPUT) - + data_bundle.set_input(*input_fields) data_bundle.set_target(*target_fields) - + return data_bundle @@ -272,6 +284,7 @@ class MsraNERPipe(_CNNERPipe): target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = MsraNERLoader().load(paths) return self.process(data_bundle) @@ -291,6 +304,7 @@ class PeopleDailyPipe(_CNNERPipe): raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = PeopleDailyNERLoader().load(paths) return self.process(data_bundle) @@ -312,6 +326,7 @@ class WeiboNERPipe(_CNNERPipe): :param: str encoding_type: target列使用什么类型的encoding方式,支持bioes, bio两种。 """ + def process_from_file(self, paths=None) -> DataBundle: data_bundle = WeiboNERLoader().load(paths) return self.process(data_bundle) diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 4ca0219c..748cf10a 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -1,3 +1,9 @@ +"""undocumented""" + +__all__ = [ + "CWSPipe" +] + import re from itertools import chain diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index ffa6375b..699438c8 100644 --- a/fastNLP/io/pipe/matching.py +++ 
b/fastNLP/io/pipe/matching.py @@ -1,9 +1,25 @@ +"""undocumented""" + +__all__ = [ + "MatchingBertPipe", + "RTEBertPipe", + "SNLIBertPipe", + "QuoraBertPipe", + "QNLIBertPipe", + "MNLIBertPipe", + "MatchingPipe", + "RTEPipe", + "SNLIPipe", + "QuoraPipe", + "QNLIPipe", + "MNLIPipe", +] from .pipe import Pipe from .utils import get_tokenizer +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader from ...core.const import Const from ...core.vocabulary import Vocabulary -from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader class MatchingBertPipe(Pipe): @@ -24,12 +40,13 @@ class MatchingBertPipe(Pipe): :param bool lower: 是否将word小写化。 :param str tokenizer: 使用什么tokenizer来将句子切分为words. 支持spacy, raw两种。raw即使用空格拆分。 """ - def __init__(self, lower=False, tokenizer: str='raw'): + + def __init__(self, lower=False, tokenizer: str = 'raw'): super().__init__() - + self.lower = bool(lower) self.tokenizer = get_tokenizer(tokenizer=tokenizer) - + def _tokenize(self, data_bundle, field_names, new_field_names): """ @@ -43,62 +60,62 @@ class MatchingBertPipe(Pipe): dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): for dataset in data_bundle.datasets.values(): if dataset.has_field(Const.TARGET): dataset.drop(lambda x: x[Const.TARGET] == '-') - + for name, dataset in data_bundle.datasets.items(): dataset.copy_field(Const.RAW_WORDS(0), Const.INPUTS(0), ) dataset.copy_field(Const.RAW_WORDS(1), Const.INPUTS(1), ) - + if self.lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - + data_bundle = self._tokenize(data_bundle, [Const.INPUTS(0), Const.INPUTS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) - + # concat两个words def concat(ins): words0 = ins[Const.INPUTS(0)] words1 = ins[Const.INPUTS(1)] words = words0 + ['[SEP]'] + words1 return words - + for name, dataset in data_bundle.datasets.items(): dataset.apply(concat, new_field_name=Const.INPUT) dataset.delete_field(Const.INPUTS(0)) dataset.delete_field(Const.INPUTS(1)) - + word_vocab = Vocabulary() word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], field_name=Const.INPUT, no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=Const.INPUT) - + target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(word_vocab, Const.INPUT) data_bundle.set_vocab(target_vocab, Const.TARGET) - + input_fields = [Const.INPUT, Const.INPUT_LEN] target_fields = [Const.TARGET] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUT) dataset.set_input(*input_fields, flag=True) for fields in target_fields: if dataset.has_field(fields): dataset.set_target(fields, flag=True) - + return data_bundle @@ -150,12 +167,13 @@ class MatchingPipe(Pipe): :param bool lower: 是否将所有raw_words转为小写。 :param str tokenizer: 将原始数据tokenize的方式。支持spacy, raw. 
spacy是使用spacy切分,raw就是用空格切分。 """ - def __init__(self, lower=False, tokenizer: str='raw'): + + def __init__(self, lower=False, tokenizer: str = 'raw'): super().__init__() - + self.lower = bool(lower) self.tokenizer = get_tokenizer(tokenizer=tokenizer) - + def _tokenize(self, data_bundle, field_names, new_field_names): """ @@ -169,7 +187,7 @@ class MatchingPipe(Pipe): dataset.apply_field(lambda words: self.tokenizer(words), field_name=field_name, new_field_name=new_field_name) return data_bundle - + def process(self, data_bundle): """ 接受的DataBundle中的DataSet应该具有以下的field, target列可以没有 @@ -186,35 +204,35 @@ class MatchingPipe(Pipe): """ data_bundle = self._tokenize(data_bundle, [Const.RAW_WORDS(0), Const.RAW_WORDS(1)], [Const.INPUTS(0), Const.INPUTS(1)]) - + for dataset in data_bundle.datasets.values(): if dataset.has_field(Const.TARGET): dataset.drop(lambda x: x[Const.TARGET] == '-') - + if self.lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUTS(0)].lower() dataset[Const.INPUTS(1)].lower() - + word_vocab = Vocabulary() word_vocab.from_dataset(*[dataset for name, dataset in data_bundle.datasets.items() if 'train' in name], field_name=[Const.INPUTS(0), Const.INPUTS(1)], no_create_entry_dataset=[dataset for name, dataset in data_bundle.datasets.items() if 'train' not in name]) word_vocab.index_dataset(*data_bundle.datasets.values(), field_name=[Const.INPUTS(0), Const.INPUTS(1)]) - + target_vocab = Vocabulary(padding=None, unknown=None) target_vocab.from_dataset(data_bundle.datasets['train'], field_name=Const.TARGET) has_target_datasets = [dataset for name, dataset in data_bundle.datasets.items() if dataset.has_field(Const.TARGET)] target_vocab.index_dataset(*has_target_datasets, field_name=Const.TARGET) - + data_bundle.set_vocab(word_vocab, Const.INPUTS(0)) data_bundle.set_vocab(target_vocab, Const.TARGET) - + input_fields = [Const.INPUTS(0), Const.INPUTS(1), Const.INPUT_LENS(0), Const.INPUT_LENS(1)] target_fields = [Const.TARGET] - + for name, dataset in data_bundle.datasets.items(): dataset.add_seq_len(Const.INPUTS(0), Const.INPUT_LENS(0)) dataset.add_seq_len(Const.INPUTS(1), Const.INPUT_LENS(1)) @@ -222,7 +240,7 @@ class MatchingPipe(Pipe): for fields in target_fields: if dataset.has_field(fields): dataset.set_target(fields, flag=True) - + return data_bundle @@ -254,4 +272,3 @@ class MNLIPipe(MatchingPipe): def process_from_file(self, paths=None): data_bundle = MNLILoader().load(paths) return self.process(data_bundle) - diff --git a/fastNLP/io/pipe/pipe.py b/fastNLP/io/pipe/pipe.py index cc45dee4..a1435fd3 100644 --- a/fastNLP/io/pipe/pipe.py +++ b/fastNLP/io/pipe/pipe.py @@ -1,3 +1,9 @@ +"""undocumented""" + +__all__ = [ + "Pipe", +] + from .. 
import DataBundle diff --git a/fastNLP/io/pipe/utils.py b/fastNLP/io/pipe/utils.py index 8facd8d9..f32f58b7 100644 --- a/fastNLP/io/pipe/utils.py +++ b/fastNLP/io/pipe/utils.py @@ -1,8 +1,18 @@ +"""undocumented""" + +__all__ = [ + "iob2", + "iob2bioes", + "get_tokenizer", +] + from typing import List -from ...core.vocabulary import Vocabulary + from ...core.const import Const +from ...core.vocabulary import Vocabulary + -def iob2(tags:List[str])->List[str]: +def iob2(tags: List[str]) -> List[str]: """ 检查数据是否是合法的IOB数据,如果是IOB1会被自动转换为IOB2。两种格式的区别见 https://datascience.stackexchange.com/questions/37824/difference-between-iob-and-iob2-format @@ -25,7 +35,8 @@ def iob2(tags:List[str])->List[str]: tags[i] = "B" + tag[1:] return tags -def iob2bioes(tags:List[str])->List[str]: + +def iob2bioes(tags: List[str]) -> List[str]: """ 将iob的tag转换为bioes编码 :param tags: @@ -38,12 +49,12 @@ def iob2bioes(tags:List[str])->List[str]: else: split = tag.split('-')[0] if split == 'B': - if i+1!=len(tags) and tags[i+1].split('-')[0] == 'I': + if i + 1 != len(tags) and tags[i + 1].split('-')[0] == 'I': new_tags.append(tag) else: new_tags.append(tag.replace('B-', 'S-')) elif split == 'I': - if i + 1List[str]: return new_tags -def get_tokenizer(tokenizer:str, lang='en'): +def get_tokenizer(tokenizer: str, lang='en'): """ :param str tokenizer: 获取tokenzier方法 @@ -97,13 +108,13 @@ def _indexize(data_bundle, input_field_names=Const.INPUT, target_field_names=Con name != 'train']) src_vocab.index_dataset(*data_bundle.datasets.values(), field_name=input_field_name) data_bundle.set_vocab(src_vocab, input_field_name) - + for target_field_name in target_field_names: tgt_vocab = Vocabulary(unknown=None, padding=None) tgt_vocab.from_dataset(data_bundle.datasets['train'], field_name=target_field_name) tgt_vocab.index_dataset(*data_bundle.datasets.values(), field_name=target_field_name) data_bundle.set_vocab(tgt_vocab, target_field_name) - + return data_bundle @@ -116,7 +127,7 @@ def _add_words_field(data_bundle, lower=False): :return: 传入的DataBundle """ data_bundle.copy_field(field_name=Const.RAW_WORD, new_field_name=Const.INPUT, ignore_miss_dataset=True) - + if lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.INPUT].lower() @@ -132,7 +143,7 @@ def _add_chars_field(data_bundle, lower=False): :return: 传入的DataBundle """ data_bundle.copy_field(field_name=Const.RAW_CHAR, new_field_name=Const.CHAR_INPUT, ignore_miss_dataset=True) - + if lower: for name, dataset in data_bundle.datasets.items(): dataset[Const.CHAR_INPUT].lower() @@ -147,6 +158,7 @@ def _drop_empty_instance(data_bundle, field_name): :param str field_name: 对哪个field进行检查,如果为None,则任意field为空都会删掉 :return: 传入的DataBundle """ + def empty_instance(ins): if field_name: field_value = ins[field_name] @@ -157,10 +169,8 @@ def _drop_empty_instance(data_bundle, field_name): if field_value in ((), {}, [], ''): return True return False - + for name, dataset in data_bundle.datasets.items(): dataset.drop(empty_instance) - + return data_bundle - - diff --git a/fastNLP/io/utils.py b/fastNLP/io/utils.py index faec2a55..e1de2ae7 100644 --- a/fastNLP/io/utils.py +++ b/fastNLP/io/utils.py @@ -1,10 +1,20 @@ -import os +""" +.. 
todo:: + doc +""" -from typing import Union, Dict +__all__ = [ + "check_loader_paths" +] + +import os from pathlib import Path +from typing import Union, Dict + from ..core import logger -def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: + +def check_loader_paths(paths: Union[str, Dict[str, str]]) -> Dict[str, str]: """ 检查传入dataloader的文件的合法性。如果为合法路径,将返回至少包含'train'这个key的dict。类似于下面的结果:: @@ -33,11 +43,13 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: path_pair = ('train', filename) if 'dev' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `dev`.".format(filename, paths, path_pair[0])) path_pair = ('dev', filename) if 'test' in filename: if path_pair: - raise Exception("File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) + raise Exception( + "File:{} in {} contains bot `{}` and `test`.".format(filename, paths, path_pair[0])) path_pair = ('test', filename) if path_pair: files[path_pair[0]] = os.path.join(paths, path_pair[1]) @@ -46,7 +58,7 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: return files else: raise FileNotFoundError(f"{paths} is not a valid file path.") - + elif isinstance(paths, dict): if paths: if 'train' not in paths: @@ -65,6 +77,7 @@ def check_loader_paths(paths:Union[str, Dict[str, str]])->Dict[str, str]: else: raise TypeError(f"paths only supports str and dict. not {type(paths)}.") + def get_tokenizer(): try: import spacy From efa9496d09d139658683eec0b4a6ae44b93dd88c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:25:51 +0800 Subject: [PATCH 14/19] add __all__ and __doc__ for all files in module 'models', using 'undocumented' tags --- fastNLP/models/base_model.py | 4 ++++ fastNLP/models/bert.py | 8 ++++++-- fastNLP/models/cnn_text_classification.py | 7 ++++++- fastNLP/models/enas_controller.py | 9 +++++++-- fastNLP/models/enas_model.py | 5 ++++- fastNLP/models/enas_trainer.py | 14 +++++++++----- fastNLP/models/enas_utils.py | 8 ++++++-- fastNLP/models/sequence_labeling.py | 12 ++++++------ fastNLP/models/snli.py | 7 +++++-- 9 files changed, 53 insertions(+), 21 deletions(-) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 2646d580..61edb91f 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -1,3 +1,7 @@ +"""undocumented""" + +__all__ = [] + import torch from ..modules.decoder.mlp import MLP diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 3afccc14..0a89b765 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -1,16 +1,20 @@ -""" +"""undocumented bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0. """ + +__all__ = [] + import os + import torch from torch import nn from .base_model import BaseModel from ..core.const import Const +from ..core.utils import seq_len_to_mask from ..modules.encoder import BertModel from ..modules.encoder.bert import BertConfig, CONFIG_FILE -from ..core.utils import seq_len_to_mask class BertForSequenceClassification(BaseModel): diff --git a/fastNLP/models/cnn_text_classification.py b/fastNLP/models/cnn_text_classification.py index e00a0697..37a60c35 100644 --- a/fastNLP/models/cnn_text_classification.py +++ b/fastNLP/models/cnn_text_classification.py @@ -1,3 +1,8 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ "CNNText" ] @@ -7,8 +12,8 @@ import torch.nn as nn from ..core.const import Const as C from ..core.utils import seq_len_to_mask -from ..modules import encoder from ..embeddings import embedding +from ..modules import encoder class CNNText(torch.nn.Module): diff --git a/fastNLP/models/enas_controller.py b/fastNLP/models/enas_controller.py index e83c6b51..eec820e4 100644 --- a/fastNLP/models/enas_controller.py +++ b/fastNLP/models/enas_controller.py @@ -1,5 +1,10 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch -"""A module with NAS controller-related code.""" +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +A module with NAS controller-related code. +""" + +__all__ = [] + import collections import os diff --git a/fastNLP/models/enas_model.py b/fastNLP/models/enas_model.py index b6b683c0..2e8ca713 100644 --- a/fastNLP/models/enas_model.py +++ b/fastNLP/models/enas_model.py @@ -1,7 +1,10 @@ -""" +"""undocumented Module containing the shared RNN model. Code Modified from https://github.com/carpedm20/ENAS-pytorch """ + +__all__ = [] + import collections import numpy as np diff --git a/fastNLP/models/enas_trainer.py b/fastNLP/models/enas_trainer.py index 7abcc45f..98d778cd 100644 --- a/fastNLP/models/enas_trainer.py +++ b/fastNLP/models/enas_trainer.py @@ -1,11 +1,15 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" + +__all__ = [] + import math -import numpy as np import time -import torch - from datetime import datetime, timedelta +import numpy as np +import torch from torch.optim import Adam try: @@ -15,7 +19,7 @@ except: from ..core.trainer import Trainer from ..core.batch import DataSetIter -from ..core.callback import CallbackManager, CallbackException +from ..core.callback import CallbackException from ..core.dataset import DataSet from ..core.utils import _move_dict_value_to_device from . 
import enas_utils as utils diff --git a/fastNLP/models/enas_utils.py b/fastNLP/models/enas_utils.py index 4e402a9a..cd6c2503 100644 --- a/fastNLP/models/enas_utils.py +++ b/fastNLP/models/enas_utils.py @@ -1,7 +1,11 @@ -# Code Modified from https://github.com/carpedm20/ENAS-pytorch +"""undocumented +Code Modified from https://github.com/carpedm20/ENAS-pytorch +""" + +__all__ = [] -from collections import defaultdict import collections +from collections import defaultdict import numpy as np import torch diff --git a/fastNLP/models/sequence_labeling.py b/fastNLP/models/sequence_labeling.py index 4bf3f95f..0dff21f0 100644 --- a/fastNLP/models/sequence_labeling.py +++ b/fastNLP/models/sequence_labeling.py @@ -1,5 +1,5 @@ """ - 本模块实现了几种序列标注模型 +本模块实现了几种序列标注模型 """ __all__ = [ "SeqLabeling", @@ -12,14 +12,14 @@ import torch.nn as nn import torch.nn.functional as F from .base_model import BaseModel -from ..embeddings import embedding -from ..modules import decoder, encoder -from ..modules.decoder.crf import allowed_transitions -from ..core.utils import seq_len_to_mask from ..core.const import Const as C -from ..modules import LSTM +from ..core.utils import seq_len_to_mask +from ..embeddings import embedding from ..embeddings import get_embeddings from ..modules import ConditionalRandomField +from ..modules import LSTM +from ..modules import decoder, encoder +from ..modules.decoder.crf import allowed_transitions class BiLSTMCRF(BaseModel): diff --git a/fastNLP/models/snli.py b/fastNLP/models/snli.py index 3be942e8..5ca4052d 100644 --- a/fastNLP/models/snli.py +++ b/fastNLP/models/snli.py @@ -1,3 +1,7 @@ +""" +.. todo:: + doc +""" __all__ = [ "ESIM" ] @@ -5,13 +9,12 @@ __all__ = [ import torch import torch.nn as nn import torch.nn.functional as F - from torch.nn import CrossEntropyLoss from .base_model import BaseModel -from ..embeddings.embedding import TokenEmbedding, Embedding from ..core.const import Const from ..core.utils import seq_len_to_mask +from ..embeddings.embedding import TokenEmbedding, Embedding class ESIM(BaseModel): From 2cf9c0ebb1722aae734ceb971b889c43198729a2 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 10:26:55 +0800 Subject: [PATCH 15/19] add __all__ and __doc__ for all files in module 'modules', using 'undocumented' tags --- fastNLP/modules/decoder/__init__.py | 6 +++- fastNLP/modules/decoder/crf.py | 5 +++- fastNLP/modules/decoder/mlp.py | 2 ++ fastNLP/modules/decoder/utils.py | 2 ++ fastNLP/modules/dropout.py | 6 +++- fastNLP/modules/encoder/__init__.py | 10 +++++-- fastNLP/modules/encoder/_elmo.py | 4 ++- fastNLP/modules/encoder/attention.py | 2 ++ fastNLP/modules/encoder/bert.py | 8 +++--- fastNLP/modules/encoder/char_encoder.py | 2 ++ fastNLP/modules/encoder/conv_maxpool.py | 2 ++ fastNLP/modules/encoder/lstm.py | 3 +- fastNLP/modules/encoder/pooling.py | 2 ++ fastNLP/modules/encoder/star_transformer.py | 3 +- fastNLP/modules/encoder/transformer.py | 2 ++ fastNLP/modules/encoder/variational_rnn.py | 3 +- fastNLP/modules/utils.py | 32 ++++++++++++++------- 17 files changed, 69 insertions(+), 25 deletions(-) diff --git a/fastNLP/modules/decoder/__init__.py b/fastNLP/modules/decoder/__init__.py index 664618b2..57acb172 100644 --- a/fastNLP/modules/decoder/__init__.py +++ b/fastNLP/modules/decoder/__init__.py @@ -1,3 +1,7 @@ +""" +.. 
todo:: + doc +""" __all__ = [ "MLP", "ConditionalRandomField", @@ -6,6 +10,6 @@ __all__ = [ ] from .crf import ConditionalRandomField +from .crf import allowed_transitions from .mlp import MLP from .utils import viterbi_decode -from .crf import allowed_transitions diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index 9f19afef..b47d0162 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConditionalRandomField", "allowed_transitions" @@ -9,13 +11,14 @@ from torch import nn from ..utils import initial_parameter from ...core import Vocabulary + def allowed_transitions(id2target, encoding_type='bio', include_start_end=False): """ 别名::class:`fastNLP.modules.allowed_transitions` :class:`fastNLP.modules.decoder.allowed_transitions` 给定一个id到label的映射表,返回所有可以跳转的(from_tag_id, to_tag_id)列表。 - :param dict,Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 + :param dict, ~fastNLP.Vocabulary id2target: key是label的indices,value是str类型的tag或tag-label。value可以是只有tag的, 比如"B", "M"; 也可以是 "B-NN", "M-NN", tag和label之间一定要用"-"隔开。一般可以通过Vocabulary.idx2word得到id2label。 :param str encoding_type: 支持"bio", "bmes", "bmeso", "bioes"。 :param bool include_start_end: 是否包含开始与结尾的转换。比如在bio中,b/o可以在开头,但是i不能在开头; diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index 9d9d80f2..f6e687a7 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MLP" ] diff --git a/fastNLP/modules/decoder/utils.py b/fastNLP/modules/decoder/utils.py index 3d5ac3f8..118b1414 100644 --- a/fastNLP/modules/decoder/utils.py +++ b/fastNLP/modules/decoder/utils.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "viterbi_decode" ] diff --git a/fastNLP/modules/dropout.py b/fastNLP/modules/dropout.py index 0ea2a2d9..24c20cc6 100644 --- a/fastNLP/modules/dropout.py +++ b/fastNLP/modules/dropout.py @@ -1,4 +1,8 @@ -__all__ = [] +"""undocumented""" + +__all__ = [ + "TimestepDropout" +] import torch diff --git a/fastNLP/modules/encoder/__init__.py b/fastNLP/modules/encoder/__init__.py index 1e99a0fd..0dfc18de 100644 --- a/fastNLP/modules/encoder/__init__.py +++ b/fastNLP/modules/encoder/__init__.py @@ -1,3 +1,8 @@ +""" +.. 
todo:: + doc +""" + __all__ = [ # "BertModel", @@ -24,13 +29,12 @@ __all__ = [ "MultiHeadAttention", ] +from .attention import MultiHeadAttention from .bert import BertModel from .char_encoder import ConvolutionCharEncoder, LSTMCharEncoder from .conv_maxpool import ConvMaxpool from .lstm import LSTM +from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask from .star_transformer import StarTransformer from .transformer import TransformerEncoder from .variational_rnn import VarRNN, VarLSTM, VarGRU - -from .pooling import MaxPool, MaxPoolWithMask, AvgPool, AvgPoolWithMask -from .attention import MultiHeadAttention diff --git a/fastNLP/modules/encoder/_elmo.py b/fastNLP/modules/encoder/_elmo.py index befae8bc..554cf8a9 100644 --- a/fastNLP/modules/encoder/_elmo.py +++ b/fastNLP/modules/encoder/_elmo.py @@ -1,7 +1,9 @@ -""" +"""undocumented 这个页面的代码大量参考了 allenNLP """ +__all__ = [] + from typing import Optional, Tuple, List, Callable import torch diff --git a/fastNLP/modules/encoder/attention.py b/fastNLP/modules/encoder/attention.py index fe3f7fd8..02bd078a 100644 --- a/fastNLP/modules/encoder/attention.py +++ b/fastNLP/modules/encoder/attention.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MultiHeadAttention" ] diff --git a/fastNLP/modules/encoder/bert.py b/fastNLP/modules/encoder/bert.py index b74c4da0..5026f48a 100644 --- a/fastNLP/modules/encoder/bert.py +++ b/fastNLP/modules/encoder/bert.py @@ -1,4 +1,4 @@ -""" +"""undocumented 这个页面的代码很大程度上参考(复制粘贴)了https://github.com/huggingface/pytorch-pretrained-BERT的代码, 如果你发现该代码对你 有用,也请引用一下他们。 """ @@ -8,17 +8,17 @@ __all__ = [ ] import collections - -import unicodedata import copy import json import math import os +import unicodedata import torch from torch import nn -from ...core import logger + from ..utils import _get_file_name_base_on_postfix +from ...core import logger CONFIG_FILE = 'bert_config.json' VOCAB_NAME = 'vocab.txt' diff --git a/fastNLP/modules/encoder/char_encoder.py b/fastNLP/modules/encoder/char_encoder.py index 6a6e1470..e40bd0dd 100644 --- a/fastNLP/modules/encoder/char_encoder.py +++ b/fastNLP/modules/encoder/char_encoder.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvolutionCharEncoder", "LSTMCharEncoder" diff --git a/fastNLP/modules/encoder/conv_maxpool.py b/fastNLP/modules/encoder/conv_maxpool.py index 8ce6b163..68415189 100644 --- a/fastNLP/modules/encoder/conv_maxpool.py +++ b/fastNLP/modules/encoder/conv_maxpool.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "ConvMaxpool" ] diff --git a/fastNLP/modules/encoder/lstm.py b/fastNLP/modules/encoder/lstm.py index e2358132..1f3eae6d 100644 --- a/fastNLP/modules/encoder/lstm.py +++ b/fastNLP/modules/encoder/lstm.py @@ -1,7 +1,8 @@ -""" +"""undocumented 轻量封装的 Pytorch LSTM 模块. 可在 forward 时传入序列的长度, 自动对padding做合适的处理. 
""" + __all__ = [ "LSTM" ] diff --git a/fastNLP/modules/encoder/pooling.py b/fastNLP/modules/encoder/pooling.py index d8aa54ad..b1272284 100644 --- a/fastNLP/modules/encoder/pooling.py +++ b/fastNLP/modules/encoder/pooling.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "MaxPool", "MaxPoolWithMask", diff --git a/fastNLP/modules/encoder/star_transformer.py b/fastNLP/modules/encoder/star_transformer.py index 3927a494..02d7a6a0 100644 --- a/fastNLP/modules/encoder/star_transformer.py +++ b/fastNLP/modules/encoder/star_transformer.py @@ -1,6 +1,7 @@ -""" +"""undocumented Star-Transformer 的encoder部分的 Pytorch 实现 """ + __all__ = [ "StarTransformer" ] diff --git a/fastNLP/modules/encoder/transformer.py b/fastNLP/modules/encoder/transformer.py index bc488e54..ce9172d5 100644 --- a/fastNLP/modules/encoder/transformer.py +++ b/fastNLP/modules/encoder/transformer.py @@ -1,3 +1,5 @@ +"""undocumented""" + __all__ = [ "TransformerEncoder" ] diff --git a/fastNLP/modules/encoder/variational_rnn.py b/fastNLP/modules/encoder/variational_rnn.py index 8e5e804b..933555c8 100644 --- a/fastNLP/modules/encoder/variational_rnn.py +++ b/fastNLP/modules/encoder/variational_rnn.py @@ -1,6 +1,7 @@ -""" +"""undocumented Variational RNN 的 Pytorch 实现 """ + __all__ = [ "VarRNN", "VarLSTM", diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index ead75711..09574782 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -1,10 +1,20 @@ +""" +.. todo:: + doc +""" + +__all__ = [ + "initial_parameter", + "summary" +] + +import os from functools import reduce import torch import torch.nn as nn import torch.nn.init as init -import glob -import os + def initial_parameter(net, initial_method=None): """A method used to initialize the weights of PyTorch models. 
@@ -40,7 +50,7 @@ def initial_parameter(net, initial_method=None): init_method = init.uniform_ else: init_method = init.xavier_normal_ - + def weights_init(m): # classname = m.__class__.__name__ if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d) or isinstance(m, nn.Conv3d): # for all the cnn @@ -66,7 +76,7 @@ def initial_parameter(net, initial_method=None): else: init.normal_(w.data) # bias # print("init else") - + net.apply(weights_init) @@ -79,11 +89,11 @@ def summary(model: nn.Module): """ train = [] nontrain = [] - + def layer_summary(module: nn.Module): def count_size(sizes): - return reduce(lambda x, y: x*y, sizes) - + return reduce(lambda x, y: x * y, sizes) + for p in module.parameters(recurse=False): if p.requires_grad: train.append(count_size(p.shape)) @@ -91,7 +101,7 @@ def summary(model: nn.Module): nontrain.append(count_size(p.shape)) for subm in module.children(): layer_summary(subm) - + layer_summary(model) total_train = sum(train) total_nontrain = sum(nontrain) @@ -101,7 +111,7 @@ def summary(model: nn.Module): strings.append('Trainable params: {:,}'.format(total_train)) strings.append('Non-trainable params: {:,}'.format(total_nontrain)) max_len = len(max(strings, key=len)) - bar = '-'*(max_len + 3) + bar = '-' * (max_len + 3) strings = [bar] + strings + [bar] print('\n'.join(strings)) return total, total_train, total_nontrain @@ -128,9 +138,9 @@ def _get_file_name_base_on_postfix(dir_path, postfix): :param postfix: 形如".bin", ".json"等 :return: str,文件的路径 """ - files = list(filter(lambda filename:filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) + files = list(filter(lambda filename: filename.endswith(postfix), os.listdir(os.path.join(dir_path)))) if len(files) == 0: raise FileNotFoundError(f"There is no file endswith *{postfix} file in {dir_path}") elif len(files) > 1: raise FileExistsError(f"There are multiple *{postfix} files in {dir_path}") - return os.path.join(dir_path, files[0]) \ No newline at end of file + return os.path.join(dir_path, files[0]) From e1f234841cf763839c767ebf4d6e750c5391adb4 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 11:00:45 +0800 Subject: [PATCH 16/19] mark the dataloader.__init__ as undocumented --- fastNLP/io/data_loader/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py index b3ca9021..8a9dd60b 100644 --- a/fastNLP/io/data_loader/__init__.py +++ b/fastNLP/io/data_loader/__init__.py @@ -1,4 +1,4 @@ -""" +"""undocumented .. 
warning:: 本模块在 `0.5.0版本` 中被废弃,由 :mod:`~fastNLP.io.loader` 和 :mod:`~fastNLP.io.pipe` 模块替代。 From ffd5fd813559cee2930f5d0d0274357fb151cc4c Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 11:58:20 +0800 Subject: [PATCH 17/19] delete the old doc-tool --- docs/format.py | 68 -------------------------------------------------- 1 file changed, 68 deletions(-) delete mode 100644 docs/format.py diff --git a/docs/format.py b/docs/format.py deleted file mode 100644 index 67671ae7..00000000 --- a/docs/format.py +++ /dev/null @@ -1,68 +0,0 @@ -import os - - -def shorten(file, to_delete, cut=False): - if file.endswith("index.rst") or file.endswith("conf.py"): - return - res = [] - with open(file, "r") as fin: - lines = fin.readlines() - for line in lines: - if cut and line.rstrip() == "Submodules": - break - else: - res.append(line.rstrip()) - for i, line in enumerate(res): - if line.endswith(" package"): - res[i] = res[i][:-len(" package")] - res[i + 1] = res[i + 1][:-len(" package")] - elif line.endswith(" module"): - res[i] = res[i][:-len(" module")] - res[i + 1] = res[i + 1][:-len(" module")] - else: - for name in to_delete: - if line.endswith(name): - res[i] = "del" - - with open(file, "w") as fout: - for line in res: - if line != "del": - print(line, file=fout) - - -def clear(path='./source/'): - files = os.listdir(path) - to_delete = [ - "fastNLP.core.dist_trainer", - "fastNLP.core.predictor", - - "fastNLP.io.file_reader", - "fastNLP.io.config_io", - - "fastNLP.embeddings.contextual_embedding", - - "fastNLP.modules.dropout", - "fastNLP.models.base_model", - "fastNLP.models.bert", - "fastNLP.models.enas_utils", - "fastNLP.models.enas_controller", - "fastNLP.models.enas_model", - "fastNLP.models.enas_trainer", - ] - for file in files: - if not os.path.isdir(path + file): - res = file.split('.') - if len(res) > 4: - to_delete.append(file[:-4]) - elif len(res) == 4: - shorten(path + file, to_delete, True) - else: - shorten(path + file, to_delete) - for file in to_delete: - try: - os.remove(path + file + ".rst") - except: - pass - - -clear() From 78af3491a432cb10b36d9cf17b75c12e40146026 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Mon, 26 Aug 2019 14:03:40 +0800 Subject: [PATCH 18/19] =?UTF-8?q?=E4=BF=AE=E6=94=B9tutorial?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/tutorials/tutorial_4_loss_optimizer.rst | 7 +++++-- docs/source/tutorials/tutorial_5_datasetiter.rst | 5 ++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/source/tutorials/tutorial_4_loss_optimizer.rst b/docs/source/tutorials/tutorial_4_loss_optimizer.rst index f863a7a8..a53ef89b 100644 --- a/docs/source/tutorials/tutorial_4_loss_optimizer.rst +++ b/docs/source/tutorials/tutorial_4_loss_optimizer.rst @@ -1,4 +1,4 @@ -============================================================================== +============================================================================== 动手实现一个文本分类器I-使用Trainer和Tester快速训练和测试 ============================================================================== @@ -19,7 +19,9 @@ loader = SSTLoader() #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + #loader.load(path)会首先判断path是否为none,若是则自动从网站下载数据,若不是则读入数据并返回databundle + databundle_ = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + dataset = databundle_.datasets['train'] print(dataset[0]) 输出数据如下:: @@ -31,6 +33,7 @@ 数据处理 + 可以使用事先定义的 :class:`~fastNLP.io.SSTPipe` 
类对数据进行基本预处理,这里我们手动进行处理。 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 .. code-block:: python diff --git a/docs/source/tutorials/tutorial_5_datasetiter.rst b/docs/source/tutorials/tutorial_5_datasetiter.rst index e81b18dd..2ec753c3 100644 --- a/docs/source/tutorials/tutorial_5_datasetiter.rst +++ b/docs/source/tutorials/tutorial_5_datasetiter.rst @@ -20,7 +20,9 @@ loader = SSTLoader() #这里的all.txt是下载好数据后train.txt、dev.txt、test.txt的组合 - dataset = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + #loader.load(path)会首先判断path是否为none,若是则自动从网站下载数据,若不是则读入数据并返回databundle + databundle_ = loader.load("./trainDevTestTrees_PTB/trees/all.txt") + dataset = databundle_.datasets['train'] print(dataset[0]) 输出数据如下:: @@ -32,6 +34,7 @@ 数据处理 + 可以使用事先定义的 :class:`~fastNLP.io.SSTPipe` 类对数据进行基本预处理,这里我们手动进行处理。 我们使用 :class:`~fastNLP.DataSet` 类的 :meth:`~fastNLP.DataSet.apply` 方法将 ``target`` :mod:`~fastNLP.core.field` 转化为整数。 .. code-block:: python From 53975c045a6841e38d4a7cfcc23abea6de0fe3f3 Mon Sep 17 00:00:00 2001 From: ChenXin Date: Mon, 26 Aug 2019 14:58:36 +0800 Subject: [PATCH 19/19] update the doc-tool & fix an importing bug --- docs/count.py | 42 ++++++++++++++++++++++++++++++++++ fastNLP/modules/decoder/crf.py | 2 +- 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/count.py b/docs/count.py index e1aad115..72868403 100644 --- a/docs/count.py +++ b/docs/count.py @@ -1,7 +1,28 @@ +import inspect import os import sys +def _colored_string(string: str, color: str or int) -> str: + """在终端中显示一串有颜色的文字 + :param string: 在终端中显示的文字 + :param color: 文字的颜色 + :return: + """ + if isinstance(color, str): + color = { + "black": 30, "Black": 30, "BLACK": 30, + "red": 31, "Red": 31, "RED": 31, + "green": 32, "Green": 32, "GREEN": 32, + "yellow": 33, "Yellow": 33, "YELLOW": 33, + "blue": 34, "Blue": 34, "BLUE": 34, + "purple": 35, "Purple": 35, "PURPLE": 35, + "cyan": 36, "Cyan": 36, "CYAN": 36, + "white": 37, "White": 37, "WHITE": 37 + }[color] + return "\033[%dm%s\033[0m" % (color, string) + + def find_all_modules(): modules = {} children = {} @@ -55,10 +76,31 @@ def create_rst_file(modules, name, children): fout.write(" " + module + "\n") +def check_file(m, name): + for item, obj in inspect.getmembers(m): + if inspect.isclass(obj) and obj.__module__ == name: + print(obj) + if inspect.isfunction(obj) and obj.__module__ == name: + print("FUNC", obj) + + +def check_files(modules): + for name in sorted(modules.keys()): + if name == 'fastNLP.core.utils': + check_file(modules[name], name) + + def main(): + print(_colored_string('Getting modules...', "Blue")) modules, to_doc, children = find_all_modules() + print(_colored_string('Done!', "Green")) + print(_colored_string('Creating rst files...', "Blue")) for name in to_doc: create_rst_file(modules, name, children) + print(_colored_string('Done!', "Green")) + print(_colored_string('Checking all files...', "Blue")) + check_files(modules) + print(_colored_string('Done!', "Green")) if __name__ == "__main__": diff --git a/fastNLP/modules/decoder/crf.py b/fastNLP/modules/decoder/crf.py index b47d0162..f63d46e3 100644 --- a/fastNLP/modules/decoder/crf.py +++ b/fastNLP/modules/decoder/crf.py @@ -9,7 +9,7 @@ import torch from torch import nn from ..utils import initial_parameter -from ...core import Vocabulary +from ...core.vocabulary import Vocabulary def allowed_transitions(id2target, encoding_type='bio', include_start_end=False):
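For a quick, self-contained check of the ``allowed_transitions`` helper touched by this last hunk, the following sketch can be run outside the patch. It is illustrative only: the tag inventory is invented for the example, and the id-to-tag mapping is written as a plain dict rather than taken from ``Vocabulary.idx2word`` as the docstring suggests.

.. code-block:: python

    # Minimal sketch: list which tag-to-tag transitions a BIO scheme permits.
    # The tag set below is made up purely for illustration.
    from fastNLP.modules import allowed_transitions

    id2target = {0: 'B-PER', 1: 'I-PER', 2: 'B-LOC', 3: 'I-LOC', 4: 'O'}

    # Returns (from_tag_id, to_tag_id) pairs; with include_start_end=False
    # only transitions between the real tags above are listed.
    pairs = allowed_transitions(id2target, encoding_type='bio', include_start_end=False)
    for from_id, to_id in pairs:
        print(id2target[from_id], '->', id2target[to_id])

The resulting pair list is the constraint a CRF layer consumes; in fastNLP the :class:`ConditionalRandomField` constructor accepts such a list (assumed here to be passed via its ``allowed_transitions`` parameter) so that decoding never steps through an illegal transition such as ``O -> I-PER``.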