From 8f729b665659cad085adf88074cdbd25e65e6116 Mon Sep 17 00:00:00 2001 From: xuyige Date: Thu, 4 Jul 2019 23:05:19 +0800 Subject: [PATCH 1/4] merge matching loader to fastNLP package --- fastNLP/io/__init__.py | 24 +- fastNLP/io/data_loader/matching.py | 428 ++++++++++++++++++ fastNLP/io/dataset_loader.py | 36 -- .../matching/data/MatchingDataLoader.py | 11 +- 4 files changed, 453 insertions(+), 46 deletions(-) create mode 100644 fastNLP/io/data_loader/matching.py diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 28f466a8..05d75f43 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -11,21 +11,35 @@ """ __all__ = [ 'EmbedLoader', - + + 'DataInfo', 'DataSetLoader', + 'CSVLoader', 'JsonLoader', 'ConllLoader', - 'SNLILoader', - 'SSTLoader', 'PeopleDailyCorpusLoader', 'Conll2003Loader', 'ModelLoader', 'ModelSaver', + + 'SSTLoader', + + 'MatchingLoader', + 'SNLILoader', + 'MNLILoader', + 'QNLILoader', + 'QuoraLoader', + 'RTELoader', ] from .embed_loader import EmbedLoader -from .dataset_loader import DataSetLoader, CSVLoader, JsonLoader, ConllLoader, \ - SNLILoader, SSTLoader, PeopleDailyCorpusLoader, Conll2003Loader +from .base_loader import DataInfo, DataSetLoader +from .dataset_loader import CSVLoader, JsonLoader, ConllLoader, \ + PeopleDailyCorpusLoader, Conll2003Loader from .model_io import ModelLoader, ModelSaver + +from .data_loader.sst import SSTLoader +from .data_loader.matching import MatchingLoader, SNLILoader, \ + MNLILoader, QNLILoader, QuoraLoader, RTELoader diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py new file mode 100644 index 00000000..70a683f2 --- /dev/null +++ b/fastNLP/io/data_loader/matching.py @@ -0,0 +1,428 @@ +import os + +from typing import Union, Dict + +from ...core.const import Const +from ...core.vocabulary import Vocabulary +from ...io.base_loader import DataInfo, DataSetLoader +from ...io.dataset_loader import JsonLoader, CSVLoader +from ...io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ...modules.encoder._bert import BertTokenizer + + +class MatchingLoader(DataSetLoader): + """ + 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` + + 读取Matching任务的数据集 + + :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 + """ + + def __init__(self, paths: dict = None): + self.paths = paths + + def _load(self, path): + """ + :param str path: 待读取数据集的路径名 + :return: fastNLP.DataSet ds: 返回一个DataSet对象,里面必须包含3个field:其中两个分别为两个句子 + 的原始字符串文本,第三个为标签 + """ + raise NotImplementedError + + def process(self, paths: Union[str, Dict[str, str]], dataset_name: str = None, + to_lower=False, seq_len_type: str = None, bert_tokenizer: str = None, + cut_text: int = None, get_index=True, auto_pad_length: int = None, + auto_pad_token: str = '', set_input: Union[list, str, bool] = True, + set_target: Union[list, str, bool] = True, concat: Union[str, list, bool] = None, ) -> DataInfo: + """ + :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, + 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 + 对应的全路径文件名。 + :param str dataset_name: 如果在paths里传入的是一个数据集的全路径文件名,那么可以用dataset_name来定义 + 这个数据集的名字,如果不定义则默认为train。 + :param bool to_lower: 是否将文本自动转为小写。默认值为False。 + :param str seq_len_type: 提供的seq_len类型,支持 ``seq_len`` :提供一个数字作为句子长度; ``mask`` : + 提供一个0/1的mask矩阵作为句子长度; ``bert`` :提供segment_type_id(第一个句子为0,第二个句子为1)和 + attention mask矩阵(0/1的mask矩阵)。默认值为None,即不提供seq_len + :param str bert_tokenizer: bert tokenizer所使用的词表所在的文件夹路径 + :param int 
cut_text: 将长于cut_text的内容截掉。默认为None,即不截。 + :param bool get_index: 是否需要根据词表将文本转为index + :param int auto_pad_length: 是否需要将文本自动pad到一定长度(超过这个长度的文本将会被截掉),默认为不会自动pad + :param str auto_pad_token: 自动pad的内容 + :param set_input: 如果为True,则会自动将相关的field(名字里含有Const.INPUT的)设置为input,如果为False + 则不会将任何field设置为input。如果传入str或者List[str],则会根据传入的内容将相对应的field设置为input, + 于此同时其他field不会被设置为input。默认值为True。 + :param set_target: set_target将控制哪些field可以被设置为target,用法与set_input一致。默认值为True。 + :param concat: 是否需要将两个句子拼接起来。如果为False则不会拼接。如果为True则会在两个句子之间插入一个。 + 如果传入一个长度为4的list,则分别表示插在第一句开始前、第一句结束后、第二句开始前、第二句结束后的标识符。如果 + 传入字符串 ``bert`` ,则会采用bert的拼接方式,等价于['[CLS]', '[SEP]', '', '[SEP]']. + :return: + """ + if isinstance(set_input, str): + set_input = [set_input] + if isinstance(set_target, str): + set_target = [set_target] + if isinstance(set_input, bool): + auto_set_input = set_input + else: + auto_set_input = False + if isinstance(set_target, bool): + auto_set_target = set_target + else: + auto_set_target = False + if isinstance(paths, str): + if os.path.isdir(paths): + path = {n: os.path.join(paths, self.paths[n]) for n in self.paths.keys()} + else: + path = {dataset_name if dataset_name is not None else 'train': paths} + else: + path = paths + + data_info = DataInfo() + for data_name in path.keys(): + data_info.datasets[data_name] = self._load(path[data_name]) + + for data_name, data_set in data_info.datasets.items(): + if auto_set_input: + data_set.set_input(Const.INPUTS(0), Const.INPUTS(1)) + if auto_set_target: + if Const.TARGET in data_set.get_field_names(): + data_set.set_target(Const.TARGET) + + if to_lower: + for data_name, data_set in data_info.datasets.items(): + data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(0)]], new_field_name=Const.INPUTS(0), + is_input=auto_set_input) + data_set.apply(lambda x: [w.lower() for w in x[Const.INPUTS(1)]], new_field_name=Const.INPUTS(1), + is_input=auto_set_input) + + if bert_tokenizer is not None: + if bert_tokenizer.lower() in PRETRAINED_BERT_MODEL_DIR: + PRETRAIN_URL = _get_base_url('bert') + model_name = PRETRAINED_BERT_MODEL_DIR[bert_tokenizer] + model_url = PRETRAIN_URL + model_name + model_dir = cached_path(model_url) + # 检查是否存在 + elif os.path.isdir(bert_tokenizer): + model_dir = bert_tokenizer + else: + raise ValueError(f"Cannot recognize BERT tokenizer from {bert_tokenizer}.") + + words_vocab = Vocabulary(padding='[PAD]', unknown='[UNK]') + with open(os.path.join(model_dir, 'vocab.txt'), 'r') as f: + lines = f.readlines() + lines = [line.strip() for line in lines] + words_vocab.add_word_lst(lines) + words_vocab.build_vocab() + + tokenizer = BertTokenizer.from_pretrained(model_dir) + + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: tokenizer.tokenize(' '.join(x[fields])), new_field_name=fields, + is_input=auto_set_input) + + if isinstance(concat, bool): + concat = 'default' if concat else None + if concat is not None: + if isinstance(concat, str): + CONCAT_MAP = {'bert': ['[CLS]', '[SEP]', '', '[SEP]'], + 'default': ['', '', '', '']} + if concat.lower() in CONCAT_MAP: + concat = CONCAT_MAP[concat] + else: + concat = 4 * [concat] + assert len(concat) == 4, \ + f'Please choose a list with 4 symbols which at the beginning of first sentence ' \ + f'the end of first sentence, the begin of second sentence, and the end of second' \ + f'sentence. 
Your input is {concat}' + + for data_name, data_set in data_info.datasets.items(): + data_set.apply(lambda x: [concat[0]] + x[Const.INPUTS(0)] + [concat[1]] + [concat[2]] + + x[Const.INPUTS(1)] + [concat[3]], new_field_name=Const.INPUT) + data_set.apply(lambda x: [w for w in x[Const.INPUT] if len(w) > 0], new_field_name=Const.INPUT, + is_input=auto_set_input) + + if seq_len_type is not None: + if seq_len_type == 'seq_len': # + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: len(x[fields]), + new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), + is_input=auto_set_input) + elif seq_len_type == 'mask': + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: [1] * len(x[fields]), + new_field_name=fields.replace(Const.INPUT, Const.INPUT_LEN), + is_input=auto_set_input) + elif seq_len_type == 'bert': + for data_name, data_set in data_info.datasets.items(): + if Const.INPUT not in data_set.get_field_names(): + raise KeyError(f'Field ``{Const.INPUT}`` not in {data_name} data set: ' + f'got {data_set.get_field_names()}') + data_set.apply(lambda x: [0] * (len(x[Const.INPUTS(0)]) + 2) + [1] * (len(x[Const.INPUTS(1)]) + 1), + new_field_name=Const.INPUT_LENS(0), is_input=auto_set_input) + data_set.apply(lambda x: [1] * len(x[Const.INPUT_LENS(0)]), + new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) + + if auto_pad_length is not None: + cut_text = min(auto_pad_length, cut_text if cut_text is not None else 0) + + if cut_text is not None: + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if (Const.INPUT in fields) or ((Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len')): + data_set.apply(lambda x: x[fields][: cut_text], new_field_name=fields, + is_input=auto_set_input) + + data_set_list = [d for n, d in data_info.datasets.items()] + assert len(data_set_list) > 0, f'There are NO data sets in data info!' 
+ + if bert_tokenizer is None: + words_vocab = Vocabulary(padding=auto_pad_token) + words_vocab = words_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], + field_name=[n for n in data_set_list[0].get_field_names() + if (Const.INPUT in n)], + no_create_entry_dataset=[d for n, d in data_info.datasets.items() + if 'train' not in n]) + target_vocab = Vocabulary(padding=None, unknown=None) + target_vocab = target_vocab.from_dataset(*[d for n, d in data_info.datasets.items() if 'train' in n], + field_name=Const.TARGET) + data_info.vocabs = {Const.INPUT: words_vocab, Const.TARGET: target_vocab} + + if get_index: + for data_name, data_set in data_info.datasets.items(): + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: [words_vocab.to_index(w) for w in x[fields]], new_field_name=fields, + is_input=auto_set_input) + + if Const.TARGET in data_set.get_field_names(): + data_set.apply(lambda x: target_vocab.to_index(x[Const.TARGET]), new_field_name=Const.TARGET, + is_input=auto_set_input, is_target=auto_set_target) + + if auto_pad_length is not None: + for data_name, data_set in data_info.datasets.items(): + if seq_len_type == 'seq_len': + raise RuntimeError(f'sequence will be padded with the length {auto_pad_length},' + f'the seq_len_type cannot be `{seq_len_type}`!') + for fields in data_set.get_field_names(): + if Const.INPUT in fields: + data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * + (auto_pad_length - len(x[fields])), new_field_name=fields, + is_input=auto_set_input) + elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): + data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), + new_field_name=fields, is_input=auto_set_input) + + for data_name, data_set in data_info.datasets.items(): + if isinstance(set_input, list): + data_set.set_input(*[inputs for inputs in set_input if inputs in data_set.get_field_names()]) + if isinstance(set_target, list): + data_set.set_target(*[target for target in set_target if target in data_set.get_field_names()]) + + return data_info + + +class SNLILoader(MatchingLoader, JsonLoader): + """ + 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` + + 读取SNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip + """ + + def __init__(self, paths: dict = None): + fields = { + 'sentence1_binary_parse': Const.INPUTS(0), + 'sentence2_binary_parse': Const.INPUTS(1), + 'gold_label': Const.TARGET, + } + paths = paths if paths is not None else { + 'train': 'snli_1.0_train.jsonl', + 'dev': 'snli_1.0_dev.jsonl', + 'test': 'snli_1.0_test.jsonl'} + MatchingLoader.__init__(self, paths=paths) + JsonLoader.__init__(self, fields=fields) + + def _load(self, path): + ds = JsonLoader._load(self, path) + + parentheses_table = str.maketrans({'(': None, ')': None}) + + ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(0)) + ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(1)) + ds.drop(lambda x: x[Const.TARGET] == '-') + return ds + + +class RTELoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.RTELoader` :class:`fastNLP.io.dataset_loader.RTELoader` + + 读取RTE数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 
第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict = None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev': 'dev.tsv', + 'test': 'test.tsv' # test set has not label + } + MatchingLoader.__init__(self, paths=paths) + self.fields = { + 'sentence1': Const.INPUTS(0), + 'sentence2': Const.INPUTS(1), + 'label': Const.TARGET, + } + CSVLoader.__init__(self, sep='\t') + + def _load(self, path): + ds = CSVLoader._load(self, path) + + for k, v in self.fields.items(): + ds.rename_field(k, v) + for fields in ds.get_all_fields(): + if Const.INPUT in fields: + ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) + + return ds + + +class QNLILoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.QNLILoader` :class:`fastNLP.io.dataset_loader.QNLILoader` + + 读取QNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict = None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev': 'dev.tsv', + 'test': 'test.tsv' # test set has not label + } + MatchingLoader.__init__(self, paths=paths) + self.fields = { + 'question': Const.INPUTS(0), + 'sentence': Const.INPUTS(1), + 'label': Const.TARGET, + } + CSVLoader.__init__(self, sep='\t') + + def _load(self, path): + ds = CSVLoader._load(self, path) + + for k, v in self.fields.items(): + ds.rename_field(k, v) + for fields in ds.get_all_fields(): + if Const.INPUT in fields: + ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) + + return ds + + +class MNLILoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.dataset_loader.MNLILoader` + + 读取MNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict = None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev_matched': 'dev_matched.tsv', + 'dev_mismatched': 'dev_mismatched.tsv', + 'test_matched': 'test_matched.tsv', + 'test_mismatched': 'test_mismatched.tsv', + # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', + # 'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', + + # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) + } + MatchingLoader.__init__(self, paths=paths) + CSVLoader.__init__(self, sep='\t') + self.fields = { + 'sentence1_binary_parse': Const.INPUTS(0), + 'sentence2_binary_parse': Const.INPUTS(1), + 'gold_label': Const.TARGET, + } + + def _load(self, path): + ds = CSVLoader._load(self, path) + + for k, v in self.fields.items(): + if k in ds.get_field_names(): + ds.rename_field(k, v) + + if Const.TARGET in ds.get_field_names(): + if ds[0][Const.TARGET] == 'hidden': + ds.delete_field(Const.TARGET) + + parentheses_table = str.maketrans({'(': None, ')': None}) + + ds.apply(lambda ins: ins[Const.INPUTS(0)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(0)) + ds.apply(lambda ins: ins[Const.INPUTS(1)].translate(parentheses_table).strip().split(), + new_field_name=Const.INPUTS(1)) + if Const.TARGET in ds.get_field_names(): + ds.drop(lambda x: x[Const.TARGET] == '-') + return ds + + +class QuoraLoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.dataset_loader.QuoraLoader` + + 读取MNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + 
target: str, 真实标签 + + 数据来源: + """ + + def __init__(self, paths: dict = None): + paths = paths if paths is not None else { + 'train': 'train.tsv', + 'dev': 'dev.tsv', + 'test': 'test.tsv', + } + MatchingLoader.__init__(self, paths=paths) + CSVLoader.__init__(self, sep='\t', headers=(Const.TARGET, Const.INPUTS(0), Const.INPUTS(1), 'pairID')) + + def _load(self, path): + ds = CSVLoader._load(self, path) + return ds diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 558fe20e..26edd8bd 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -249,42 +249,6 @@ class JsonLoader(DataSetLoader): return ds -class SNLILoader(JsonLoader): - """ - 别名::class:`fastNLP.io.SNLILoader` :class:`fastNLP.io.dataset_loader.SNLILoader` - - 读取SNLI数据集,读取的DataSet包含fields:: - - words1: list(str),第一句文本, premise - words2: list(str), 第二句文本, hypothesis - target: str, 真实标签 - - 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip - """ - - def __init__(self): - fields = { - 'sentence1_parse': Const.INPUTS(0), - 'sentence2_parse': Const.INPUTS(1), - 'gold_label': Const.TARGET, - } - super(SNLILoader, self).__init__(fields=fields) - - def _load(self, path): - ds = super(SNLILoader, self)._load(path) - - def parse_tree(x): - t = Tree.fromstring(x) - return t.leaves() - - ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(0)]), new_field_name=Const.INPUTS(0)) - ds.apply(lambda ins: parse_tree( - ins[Const.INPUTS(1)]), new_field_name=Const.INPUTS(1)) - ds.drop(lambda x: x[Const.TARGET] == '-') - return ds - - class CSVLoader(DataSetLoader): """ 别名::class:`fastNLP.io.CSVLoader` :class:`fastNLP.io.dataset_loader.CSVLoader` diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index 9d948ec1..43f016d6 100644 --- a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -212,12 +212,12 @@ class MatchingLoader(DataSetLoader): for data_name, data_set in data_info.datasets.items(): for fields in data_set.get_field_names(): if Const.INPUT in fields: - data_set.apply(lambda x: x[fields] + [words_vocab.padding] * (auto_pad_length - len(x[fields])), - new_field_name=fields, is_input=auto_set_input) - elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * (auto_pad_length - len(x[fields])), new_field_name=fields, is_input=auto_set_input) + elif (Const.INPUT_LEN in fields) and (seq_len_type != 'seq_len'): + data_set.apply(lambda x: x[fields] + [0] * (auto_pad_length - len(x[fields])), + new_field_name=fields, is_input=auto_set_input) for data_name, data_set in data_info.datasets.items(): if isinstance(set_input, list): @@ -284,7 +284,7 @@ class RTELoader(MatchingLoader, CSVLoader): paths = paths if paths is not None else { 'train': 'train.tsv', 'dev': 'dev.tsv', - # 'test': 'test.tsv' # test set has not label + 'test': 'test.tsv' # test set has not label } MatchingLoader.__init__(self, paths=paths) self.fields = { @@ -323,7 +323,7 @@ class QNLILoader(MatchingLoader, CSVLoader): paths = paths if paths is not None else { 'train': 'train.tsv', 'dev': 'dev.tsv', - # 'test': 'test.tsv' # test set has not label + 'test': 'test.tsv' # test set has not label } MatchingLoader.__init__(self, paths=paths) self.fields = { @@ -367,6 +367,7 @@ class MNLILoader(MatchingLoader, CSVLoader): 'test_mismatched': 'test_mismatched.tsv', # 'test_0.9_matched': 'multinli_0.9_test_matched_unlabeled.txt', # 
'test_0.9_mismatched': 'multinli_0.9_test_mismatched_unlabeled.txt', + # test_0.9_mathed与mismatched是MNLI0.9版本的(数据来源:kaggle) } MatchingLoader.__init__(self, paths=paths) From 00cf9820a216df98b573f6a7f2bc841879b7e2ce Mon Sep 17 00:00:00 2001 From: xuyige Date: Fri, 5 Jul 2019 00:19:02 +0800 Subject: [PATCH 2/4] fix a bug in matching loader --- fastNLP/io/data_loader/matching.py | 36 ++++++++++--------- .../matching/data/MatchingDataLoader.py | 29 +++++++++++---- 2 files changed, 41 insertions(+), 24 deletions(-) diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py index 70a683f2..1cde950f 100644 --- a/fastNLP/io/data_loader/matching.py +++ b/fastNLP/io/data_loader/matching.py @@ -19,7 +19,7 @@ class MatchingLoader(DataSetLoader): :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): self.paths = paths def _load(self, path): @@ -30,11 +30,11 @@ class MatchingLoader(DataSetLoader): """ raise NotImplementedError - def process(self, paths: Union[str, Dict[str, str]], dataset_name: str = None, - to_lower=False, seq_len_type: str = None, bert_tokenizer: str = None, - cut_text: int = None, get_index=True, auto_pad_length: int = None, - auto_pad_token: str = '', set_input: Union[list, str, bool] = True, - set_target: Union[list, str, bool] = True, concat: Union[str, list, bool] = None, ) -> DataInfo: + def process(self, paths: Union[str, Dict[str, str]], dataset_name: str=None, + to_lower=False, seq_len_type: str=None, bert_tokenizer: str=None, + cut_text: int = None, get_index=True, auto_pad_length: int=None, + auto_pad_token: str='', set_input: Union[list, str, bool]=True, + set_target: Union[list, str, bool] = True, concat: Union[str, list, bool]=None, ) -> DataInfo: """ :param paths: str或者Dict[str, str]。如果是str,则为数据集所在的文件夹或者是全路径文件名:如果是文件夹, 则会从self.paths里面找对应的数据集名称与文件名。如果是Dict,则为数据集名称(如train、dev、test)和 @@ -171,7 +171,7 @@ class MatchingLoader(DataSetLoader): new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else 0) + cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) if cut_text is not None: for data_name, data_set in data_info.datasets.items(): @@ -207,10 +207,10 @@ class MatchingLoader(DataSetLoader): is_input=auto_set_input, is_target=auto_set_target) if auto_pad_length is not None: + if seq_len_type == 'seq_len': + raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' + f'so the seq_len_type cannot be `{seq_len_type}`!') for data_name, data_set in data_info.datasets.items(): - if seq_len_type == 'seq_len': - raise RuntimeError(f'sequence will be padded with the length {auto_pad_length},' - f'the seq_len_type cannot be `{seq_len_type}`!') for fields in data_set.get_field_names(): if Const.INPUT in fields: data_set.apply(lambda x: x[fields] + [words_vocab.to_index(words_vocab.padding)] * @@ -242,7 +242,7 @@ class SNLILoader(MatchingLoader, JsonLoader): 数据来源: https://nlp.stanford.edu/projects/snli/snli_1.0.zip """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): fields = { 'sentence1_binary_parse': Const.INPUTS(0), 'sentence2_binary_parse': Const.INPUTS(1), @@ -281,7 +281,7 @@ class RTELoader(MatchingLoader, CSVLoader): 数据来源: """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): paths = paths if paths is not None else { 'train': 
'train.tsv', 'dev': 'dev.tsv', @@ -299,7 +299,8 @@ class RTELoader(MatchingLoader, CSVLoader): ds = CSVLoader._load(self, path) for k, v in self.fields.items(): - ds.rename_field(k, v) + if v in ds.get_field_names(): + ds.rename_field(k, v) for fields in ds.get_all_fields(): if Const.INPUT in fields: ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) @@ -320,7 +321,7 @@ class QNLILoader(MatchingLoader, CSVLoader): 数据来源: """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): paths = paths if paths is not None else { 'train': 'train.tsv', 'dev': 'dev.tsv', @@ -338,7 +339,8 @@ class QNLILoader(MatchingLoader, CSVLoader): ds = CSVLoader._load(self, path) for k, v in self.fields.items(): - ds.rename_field(k, v) + if v in ds.get_field_names(): + ds.rename_field(k, v) for fields in ds.get_all_fields(): if Const.INPUT in fields: ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) @@ -359,7 +361,7 @@ class MNLILoader(MatchingLoader, CSVLoader): 数据来源: """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): paths = paths if paths is not None else { 'train': 'train.tsv', 'dev_matched': 'dev_matched.tsv', @@ -414,7 +416,7 @@ class QuoraLoader(MatchingLoader, CSVLoader): 数据来源: """ - def __init__(self, paths: dict = None): + def __init__(self, paths: dict=None): paths = paths if paths is not None else { 'train': 'train.tsv', 'dev': 'dev.tsv', diff --git a/reproduction/matching/data/MatchingDataLoader.py b/reproduction/matching/data/MatchingDataLoader.py index 43f016d6..7c32899c 100644 --- a/reproduction/matching/data/MatchingDataLoader.py +++ b/reproduction/matching/data/MatchingDataLoader.py @@ -16,12 +16,11 @@ class MatchingLoader(DataSetLoader): 别名::class:`fastNLP.io.MatchingLoader` :class:`fastNLP.io.dataset_loader.MatchingLoader` 读取Matching任务的数据集 + + :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 """ def __init__(self, paths: dict=None): - """ - :param dict paths: key是数据集名称(如train、dev、test),value是对应的文件名 - """ self.paths = paths def _load(self, path): @@ -173,7 +172,7 @@ class MatchingLoader(DataSetLoader): new_field_name=Const.INPUT_LENS(1), is_input=auto_set_input) if auto_pad_length is not None: - cut_text = min(auto_pad_length, cut_text if cut_text is not None else 0) + cut_text = min(auto_pad_length, cut_text if cut_text is not None else auto_pad_length) if cut_text is not None: for data_name, data_set in data_info.datasets.items(): @@ -209,6 +208,9 @@ class MatchingLoader(DataSetLoader): is_input=auto_set_input, is_target=auto_set_target) if auto_pad_length is not None: + if seq_len_type == 'seq_len': + raise RuntimeError(f'the sequence will be padded with the length {auto_pad_length}, ' + f'so the seq_len_type cannot be `{seq_len_type}`!') for data_name, data_set in data_info.datasets.items(): for fields in data_set.get_field_names(): if Const.INPUT in fields: @@ -298,7 +300,8 @@ class RTELoader(MatchingLoader, CSVLoader): ds = CSVLoader._load(self, path) for k, v in self.fields.items(): - ds.rename_field(k, v) + if v in ds.get_field_names(): + ds.rename_field(k, v) for fields in ds.get_all_fields(): if Const.INPUT in fields: ds.apply(lambda x: x[fields].strip().split(), new_field_name=fields) @@ -337,7 +340,8 @@ class QNLILoader(MatchingLoader, CSVLoader): ds = CSVLoader._load(self, path) for k, v in self.fields.items(): - ds.rename_field(k, v) + if v in ds.get_field_names(): + ds.rename_field(k, v) for fields in ds.get_all_fields(): if Const.INPUT in fields: ds.apply(lambda x: 
x[fields].strip().split(), new_field_name=fields) @@ -349,7 +353,7 @@ class MNLILoader(MatchingLoader, CSVLoader): """ 别名::class:`fastNLP.io.MNLILoader` :class:`fastNLP.io.dataset_loader.MNLILoader` - 读取SNLI数据集,读取的DataSet包含fields:: + 读取MNLI数据集,读取的DataSet包含fields:: words1: list(str),第一句文本, premise words2: list(str), 第二句文本, hypothesis @@ -401,6 +405,17 @@ class MNLILoader(MatchingLoader, CSVLoader): class QuoraLoader(MatchingLoader, CSVLoader): + """ + 别名::class:`fastNLP.io.QuoraLoader` :class:`fastNLP.io.dataset_loader.QuoraLoader` + + 读取MNLI数据集,读取的DataSet包含fields:: + + words1: list(str),第一句文本, premise + words2: list(str), 第二句文本, hypothesis + target: str, 真实标签 + + 数据来源: + """ def __init__(self, paths: dict=None): paths = paths if paths is not None else { From 089009f9f24c4b4438f0a4d65c546210ee33a1ed Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 6 Jul 2019 01:08:55 +0800 Subject: [PATCH 3/4] =?UTF-8?q?=E5=A4=A7=E5=B9=85=E5=BA=A6=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=EF=BC=9A=201=E3=80=81=E6=9B=B4=E6=96=B0requirements?= =?UTF-8?q?=E4=BB=A5=E5=8F=8AREADME.md=202=E3=80=81=E6=9B=B4=E6=96=B0DataL?= =?UTF-8?q?oader=203=E3=80=81=E6=9B=B4=E6=96=B0loss=204=E3=80=81=E6=9B=B4?= =?UTF-8?q?=E6=96=B0model/bert.py=E5=86=85=E5=AE=B9=E5=8F=8A=E9=80=82?= =?UTF-8?q?=E9=85=8D=E7=9A=84=E6=B5=8B=E8=AF=95=E4=BB=A3=E7=A0=81=205?= =?UTF-8?q?=E3=80=81=E6=9B=B4=E6=96=B0reproduction/README.md=206=E3=80=81?= =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=85=B6=E4=BB=96=E6=B5=8B=E8=AF=95=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E7=9A=84=E6=8A=A5=E9=94=99=E7=9A=84=E5=9C=B0=E6=96=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 30 +++++----- fastNLP/core/losses.py | 60 +++++++++---------- fastNLP/io/base_loader.py | 9 +-- fastNLP/io/data_loader/__init__.py | 19 ++++++ fastNLP/models/bert.py | 70 +++++++++++------------ fastNLP/modules/decoder/mlp.py | 3 +- fastNLP/modules/encoder/_bert.py | 23 +++++--- reproduction/README.md | 6 +- reproduction/Star_transformer/datasets.py | 3 +- requirements.txt | 8 +-- test/io/test_dataset_loader.py | 4 +- test/models/test_bert.py | 12 ++-- test/modules/encoder/test_bert.py | 7 ++- 13 files changed, 140 insertions(+), 114 deletions(-) create mode 100644 fastNLP/io/data_loader/__init__.py diff --git a/README.md b/README.md index 9d949482..a5ce3c64 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,14 @@ ![Hex.pm](https://img.shields.io/hexpm/l/plug.svg) [![Documentation Status](https://readthedocs.org/projects/fastnlp/badge/?version=latest)](http://fastnlp.readthedocs.io/?badge=latest) -fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个命名实体识别(NER)、中文分词或文本分类任务; 也可以使用他构建许多复杂的网络模型,进行科研。它具有如下的特性: +fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地完成一个序列标注([NER](reproduction/seqence_labelling/ner/)、POS-Tagging等)、中文分词、文本分类、[Matching](reproduction/matching/)、指代消解、摘要等任务; 也可以使用它构建许多复杂的网络模型,进行科研。它具有如下的特性: -- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码。 -- 各种方便的NLP工具,例如预处理embedding加载; 中间数据cache等; -- 详尽的中文文档以供查阅; +- 统一的Tabular式数据容器,让数据预处理过程简洁明了。内置多种数据集的DataSet Loader,省去预处理代码; +- 多种训练、测试组件,例如训练器Trainer;测试器Tester;以及各种评测metrics等等; +- 各种方便的NLP工具,例如预处理embedding加载(包括EMLo和BERT); 中间数据cache等; +- 详尽的中文[文档](https://fastnlp.readthedocs.io/)、教程以供查阅; - 提供诸多高级模块,例如Variational LSTM, Transformer, CRF等; -- 封装CNNText,Biaffine等模型可供直接使用; +- 在序列标注、中文分词、文本分类、Matching、指代消解、摘要等任务上封装了各种模型可供直接使用; [详细链接](reproduction/) - 便捷且具有扩展性的训练器; 提供多种内置callback函数,方便实验记录、异常捕获等。 @@ -20,13 +21,14 @@ fastNLP 是一款轻量级的 NLP 处理套件。你既可以使用它快速地 fastNLP 依赖如下包: -+ numpy -+ torch>=0.4.0 -+ tqdm -+ nltk ++ numpy>=1.14.2 ++ 
torch>=1.0.0 ++ tqdm>=4.28.1 ++ nltk>=3.4.1 ++ requests -其中torch的安装可能与操作系统及 CUDA 的版本相关,请参见 PyTorch 官网 。 -在依赖包安装完成的情况,您可以在命令行执行如下指令完成安装 +其中torch的安装可能与操作系统及 CUDA 的版本相关,请参见 [PyTorch 官网](https://pytorch.org/) 。 +在依赖包安装完成后,您可以在命令行执行如下指令完成安装 ```shell pip install fastNLP @@ -77,8 +79,8 @@ fastNLP 在 modules 模块中内置了三种模块的诸多组件,可以帮助 fastNLP 为不同的 NLP 任务实现了许多完整的模型,它们都经过了训练和测试。 你可以在以下两个地方查看相关信息 -- [介绍](reproduction/) -- [源码](fastNLP/models/) +- [模型介绍](reproduction/) +- [模型源码](fastNLP/models/) ## 项目结构 @@ -93,7 +95,7 @@ fastNLP的大致工作流程如上图所示,而项目结构如下: fastNLP.core - 实现了核心功能,包括数据处理组件、训练器、测速器等 + 实现了核心功能,包括数据处理组件、训练器、测试器等 fastNLP.models diff --git a/fastNLP/core/losses.py b/fastNLP/core/losses.py index 46a72802..14aacef0 100644 --- a/fastNLP/core/losses.py +++ b/fastNLP/core/losses.py @@ -20,6 +20,7 @@ from collections import defaultdict import torch import torch.nn.functional as F +from ..core.const import Const from .utils import _CheckError from .utils import _CheckRes from .utils import _build_args @@ -28,6 +29,7 @@ from .utils import _check_function_or_method from .utils import _get_func_signature from .utils import seq_len_to_mask + class LossBase(object): """ 所有loss的基类。如果想了解其中的原理,请查看源码。 @@ -95,22 +97,7 @@ class LossBase(object): # if func_spect.varargs: # raise NameError(f"Delete `*{func_spect.varargs}` in {get_func_signature(self.get_loss)}(Do not use " # f"positional argument.).") - - def _fast_param_map(self, pred_dict, target_dict): - """Only used as inner function. When the pred_dict, target is unequivocal. Don't need users to pass key_map. - such as pred_dict has one element, target_dict has one element - :param pred_dict: - :param target_dict: - :return: dict, if dict is not {}, pass it to self.evaluate. Otherwise do mapping. - """ - fast_param = {} - if len(self._param_map) == 2 and len(pred_dict) == 1 and len(target_dict) == 1: - fast_param['pred'] = list(pred_dict.values())[0] - fast_param['target'] = list(target_dict.values())[0] - return fast_param - return fast_param - def __call__(self, pred_dict, target_dict, check=False): """ :param dict pred_dict: 模型的forward函数返回的dict @@ -118,11 +105,7 @@ class LossBase(object): :param Boolean check: 每一次执行映射函数的时候是否检查映射表,默认为不检查 :return: """ - fast_param = self._fast_param_map(pred_dict, target_dict) - if fast_param: - loss = self.get_loss(**fast_param) - return loss - + if not self._checked: # 1. check consistence between signature and _param_map func_spect = inspect.getfullargspec(self.get_loss) @@ -212,7 +195,6 @@ class LossFunc(LossBase): if not isinstance(key_map, dict): raise RuntimeError(f"Loss error: key_map except a {type({})} but got a {type(key_map)}") self._init_param_map(key_map, **kwargs) - class CrossEntropyLoss(LossBase): @@ -226,7 +208,7 @@ class CrossEntropyLoss(LossBase): :param seq_len: 句子的长度, 长度之外的token不会计算loss。。 :param padding_idx: padding的index,在计算loss时将忽略target中标号为padding_idx的内容, 可以通过该值代替 传入seq_len. - :param str reduction: 支持'elementwise_mean'和'sum'. + :param str reduction: 支持'mean','sum'和'none'. 
Example:: @@ -234,16 +216,16 @@ class CrossEntropyLoss(LossBase): """ - def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100, reduction='elementwise_mean'): + def __init__(self, pred=None, target=None, seq_len=None, padding_idx=-100, reduction='mean'): super(CrossEntropyLoss, self).__init__() self._init_param_map(pred=pred, target=target, seq_len=seq_len) self.padding_idx = padding_idx - assert reduction in ('elementwise_mean', 'sum') + assert reduction in ('mean', 'sum', 'none') self.reduction = reduction def get_loss(self, pred, target, seq_len=None): - if pred.dim()>2: - if pred.size(1)!=target.size(1): + if pred.dim() > 2: + if pred.size(1) != target.size(1): pred = pred.transpose(1, 2) pred = pred.reshape(-1, pred.size(-1)) target = target.reshape(-1) @@ -263,15 +245,18 @@ class L1Loss(LossBase): :param pred: 参数映射表中 `pred` 的映射关系,None表示映射关系为 `pred` -> `pred` :param target: 参数映射表中 `target` 的映射关系,None表示映射关系为 `target` >`target` + :param str reduction: 支持'mean','sum'和'none'. """ - def __init__(self, pred=None, target=None): + def __init__(self, pred=None, target=None, reduction='mean'): super(L1Loss, self).__init__() self._init_param_map(pred=pred, target=target) + assert reduction in ('mean', 'sum', 'none') + self.reduction = reduction def get_loss(self, pred, target): - return F.l1_loss(input=pred, target=target) + return F.l1_loss(input=pred, target=target, reduction=self.reduction) class BCELoss(LossBase): @@ -282,14 +267,17 @@ class BCELoss(LossBase): :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` + :param str reduction: 支持'mean','sum'和'none'. """ - def __init__(self, pred=None, target=None): + def __init__(self, pred=None, target=None, reduction='mean'): super(BCELoss, self).__init__() self._init_param_map(pred=pred, target=target) + assert reduction in ('mean', 'sum', 'none') + self.reduction = reduction def get_loss(self, pred, target): - return F.binary_cross_entropy(input=pred, target=target) + return F.binary_cross_entropy(input=pred, target=target, reduction=self.reduction) class NLLLoss(LossBase): @@ -300,14 +288,20 @@ class NLLLoss(LossBase): :param pred: 参数映射表中`pred`的映射关系,None表示映射关系为`pred`->`pred` :param target: 参数映射表中`target`的映射关系,None表示映射关系为`target`->`target` + :param ignore_idx: ignore的index,在计算loss时将忽略target中标号为ignore_idx的内容, 可以通过该值代替 + 传入seq_len. + :param str reduction: 支持'mean','sum'和'none'. 
""" - def __init__(self, pred=None, target=None): + def __init__(self, pred=None, target=None, ignore_idx=-100, reduction='mean'): super(NLLLoss, self).__init__() self._init_param_map(pred=pred, target=target) + assert reduction in ('mean', 'sum', 'none') + self.reduction = reduction + self.ignore_idx = ignore_idx def get_loss(self, pred, target): - return F.nll_loss(input=pred, target=target) + return F.nll_loss(input=pred, target=target, ignore_index=self.ignore_idx, reduction=self.reduction) class LossInForward(LossBase): @@ -319,7 +313,7 @@ class LossInForward(LossBase): :param str loss_key: 在forward函数中loss的键名,默认为loss """ - def __init__(self, loss_key='loss'): + def __init__(self, loss_key=Const.LOSS): super().__init__() if not isinstance(loss_key, str): raise TypeError(f"Only str allowed for loss_key, got {type(loss_key)}.") diff --git a/fastNLP/io/base_loader.py b/fastNLP/io/base_loader.py index 465fb7e8..8cff1da1 100644 --- a/fastNLP/io/base_loader.py +++ b/fastNLP/io/base_loader.py @@ -10,6 +10,7 @@ from typing import Union, Dict import os from ..core.dataset import DataSet + class BaseLoader(object): """ 各个 Loader 的基类,提供了 API 的参考。 @@ -55,8 +56,6 @@ class BaseLoader(object): return obj - - def _download_from_url(url, path): try: from tqdm.auto import tqdm @@ -115,13 +114,11 @@ class DataInfo: 经过处理的数据信息,包括一系列数据集(比如:分开的训练集、验证集和测试集)及它们所用的词表和词嵌入。 :param vocabs: 从名称(字符串)到 :class:`~fastNLP.Vocabulary` 类型的dict - :param embeddings: 从名称(字符串)到一系列 embedding 的dict,参考 :class:`~fastNLP.io.EmbedLoader` :param datasets: 从名称(字符串)到 :class:`~fastNLP.DataSet` 类型的dict """ - def __init__(self, vocabs: dict = None, embeddings: dict = None, datasets: dict = None): + def __init__(self, vocabs: dict = None, datasets: dict = None): self.vocabs = vocabs or {} - self.embeddings = embeddings or {} self.datasets = datasets or {} def __repr__(self): @@ -133,6 +130,7 @@ class DataInfo: _str += '\t{} has {} entries.\n'.format(name, len(vocab)) return _str + class DataSetLoader: """ 别名::class:`fastNLP.io.DataSetLoader` :class:`fastNLP.io.dataset_loader.DataSetLoader` @@ -213,7 +211,6 @@ class DataSetLoader: 返回的 :class:`DataInfo` 对象有如下属性: - vocabs: 由从数据集中获取的词表组成的字典,每个词表 - - embeddings: (可选) 数据集对应的词嵌入 - datasets: 一个dict,包含一系列 :class:`~fastNLP.DataSet` 类型的对象。其中 field 的命名参考 :mod:`~fastNLP.core.const` :param paths: 原始数据读取的路径 diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py new file mode 100644 index 00000000..6f4dd973 --- /dev/null +++ b/fastNLP/io/data_loader/__init__.py @@ -0,0 +1,19 @@ +""" +用于读数据集的模块, 具体包括: + +这些模块的使用方法如下: +""" +__all__ = [ + 'SSTLoader', + + 'MatchingLoader', + 'SNLILoader', + 'MNLILoader', + 'QNLILoader', + 'QuoraLoader', + 'RTELoader', +] + +from .sst import SSTLoader +from .matching import MatchingLoader, SNLILoader, \ + MNLILoader, QNLILoader, QuoraLoader, RTELoader diff --git a/fastNLP/models/bert.py b/fastNLP/models/bert.py index 4846c7fa..fb186ce4 100644 --- a/fastNLP/models/bert.py +++ b/fastNLP/models/bert.py @@ -8,35 +8,7 @@ from torch import nn from .base_model import BaseModel from ..core.const import Const from ..modules.encoder import BertModel - - -class BertConfig: - - def __init__( - self, - vocab_size=30522, - hidden_size=768, - num_hidden_layers=12, - num_attention_heads=12, - intermediate_size=3072, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=2, - initializer_range=0.02 - ): - self.vocab_size = vocab_size - self.hidden_size = hidden_size - 
self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range +from ..modules.encoder._bert import BertConfig class BertForSequenceClassification(BaseModel): @@ -84,11 +56,17 @@ class BertForSequenceClassification(BaseModel): self.bert = BertModel.from_pretrained(bert_dir) else: if config is None: - config = BertConfig() - self.bert = BertModel(**config.__dict__) + config = BertConfig(30522) + self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) + @classmethod + def from_pretrained(cls, num_labels, pretrained_model_dir): + config = BertConfig(pretrained_model_dir) + model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) + return model + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) pooled_output = self.dropout(pooled_output) @@ -151,11 +129,17 @@ class BertForMultipleChoice(BaseModel): self.bert = BertModel.from_pretrained(bert_dir) else: if config is None: - config = BertConfig() - self.bert = BertModel(**config.__dict__) + config = BertConfig(30522) + self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, 1) + @classmethod + def from_pretrained(cls, num_choices, pretrained_model_dir): + config = BertConfig(pretrained_model_dir) + model = cls(num_choices=num_choices, config=config, bert_dir=pretrained_model_dir) + return model + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): flat_input_ids = input_ids.view(-1, input_ids.size(-1)) flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) @@ -224,11 +208,17 @@ class BertForTokenClassification(BaseModel): self.bert = BertModel.from_pretrained(bert_dir) else: if config is None: - config = BertConfig() - self.bert = BertModel(**config.__dict__) + config = BertConfig(30522) + self.bert = BertModel(config) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.classifier = nn.Linear(config.hidden_size, num_labels) + @classmethod + def from_pretrained(cls, num_labels, pretrained_model_dir): + config = BertConfig(pretrained_model_dir) + model = cls(num_labels=num_labels, config=config, bert_dir=pretrained_model_dir) + return model + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) sequence_output = self.dropout(sequence_output) @@ -302,12 +292,18 @@ class BertForQuestionAnswering(BaseModel): self.bert = BertModel.from_pretrained(bert_dir) else: if config is None: - config = BertConfig() - self.bert = BertModel(**config.__dict__) + config = BertConfig(30522) + self.bert = BertModel(config) # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version # self.dropout = nn.Dropout(config.hidden_dropout_prob) self.qa_outputs = nn.Linear(config.hidden_size, 2) + @classmethod + def from_pretrained(cls, 
pretrained_model_dir): + config = BertConfig(pretrained_model_dir) + model = cls(config=config, bert_dir=pretrained_model_dir) + return model + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) logits = self.qa_outputs(sequence_output) diff --git a/fastNLP/modules/decoder/mlp.py b/fastNLP/modules/decoder/mlp.py index c1579224..418b3a77 100644 --- a/fastNLP/modules/decoder/mlp.py +++ b/fastNLP/modules/decoder/mlp.py @@ -15,7 +15,8 @@ class MLP(nn.Module): 多层感知器 :param List[int] size_layer: 一个int的列表,用来定义MLP的层数,列表中的数字为每一层是hidden数目。MLP的层数为 len(size_layer) - 1 - :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和sigmoid,默认值为relu + :param Union[str,func,List[str]] activation: 一个字符串或者函数的列表,用来定义每一个隐层的激活函数,字符串包括relu,tanh和 + sigmoid,默认值为relu :param Union[str,func] output_activation: 字符串或者函数,用来定义输出层的激活函数,默认值为None,表示输出层没有激活函数 :param str initial_method: 参数初始化方式 :param float dropout: dropout概率,默认值为0 diff --git a/fastNLP/modules/encoder/_bert.py b/fastNLP/modules/encoder/_bert.py index 4669b511..61a5d7d1 100644 --- a/fastNLP/modules/encoder/_bert.py +++ b/fastNLP/modules/encoder/_bert.py @@ -26,6 +26,7 @@ import sys CONFIG_FILE = 'bert_config.json' + class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ @@ -339,13 +340,19 @@ class BertModel(nn.Module): 如果你想使用预训练好的权重矩阵,请在以下网址下载. sources:: - 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", - 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", - 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", - 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", - 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", - 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", - 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin" 用预训练权重矩阵来建立BERT模型:: @@ -562,6 +569,7 @@ class WordpieceTokenizer(object): output_tokens.extend(sub_tokens) return output_tokens + def load_vocab(vocab_file): """Loads a vocabulary file into a dictionary.""" vocab = collections.OrderedDict() @@ -692,6 +700,7 @@ class BasicTokenizer(object): output.append(char) return "".join(output) + def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" # \t, \n, and \r are technically contorl characters but we treat them diff --git a/reproduction/README.md b/reproduction/README.md index 92652fb4..b6f61903 100644 --- a/reproduction/README.md +++ b/reproduction/README.md @@ -3,6 +3,8 @@ 复现的模型有: - [Star-Transformer](Star_transformer/) +- [Biaffine](https://github.com/fastnlp/fastNLP/blob/999a14381747068e9e6a7cc370037b320197db00/fastNLP/models/biaffine_parser.py#L239) +- [CNNText](https://github.com/fastnlp/fastNLP/blob/999a14381747068e9e6a7cc370037b320197db00/fastNLP/models/cnn_text_classification.py#L12) - ... # 任务复现 @@ -11,11 +13,11 @@ ## Matching (自然语言推理/句子匹配) -- [Matching 任务复现](matching/) +- [Matching 任务复现](matching) ## Sequence Labeling (序列标注) -- still in progress +- [NER](seqence_labelling/ner) ## Coreference resolution (指代消解) diff --git a/reproduction/Star_transformer/datasets.py b/reproduction/Star_transformer/datasets.py index a9257fd4..1532a041 100644 --- a/reproduction/Star_transformer/datasets.py +++ b/reproduction/Star_transformer/datasets.py @@ -2,7 +2,8 @@ import torch import json import os from fastNLP import Vocabulary -from fastNLP.io.dataset_loader import ConllLoader, SSTLoader, SNLILoader +from fastNLP.io.dataset_loader import ConllLoader +from fastNLP.io.data_loader import SSTLoader, SNLILoader from fastNLP.core import Const as C import numpy as np diff --git a/requirements.txt b/requirements.txt index 7ea8fdac..f8f7a951 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -numpy -torch>=0.4.0 -tqdm -nltk +numpy>=1.14.2 +torch>=1.0.0 +tqdm>=4.28.1 +nltk>=3.4.1 requests diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 7cff3c12..b091339e 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -1,7 +1,7 @@ import unittest import os -from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, SNLILoader, JsonLoader -from fastNLP.io.dataset_loader import SSTLoader +from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, JsonLoader +from fastNLP.io.dataset_loader import SSTLoader, SNLILoader from reproduction.text_classification.data.yelpLoader import yelpLoader diff --git a/test/models/test_bert.py b/test/models/test_bert.py index 7177f31b..38a16f9b 100644 --- a/test/models/test_bert.py +++ b/test/models/test_bert.py @@ -8,8 +8,9 @@ from fastNLP.models.bert import * class TestBert(unittest.TestCase): def test_bert_1(self): from fastNLP.core.const import Const + from fastNLP.modules.encoder._bert 
import BertConfig - model = BertForSequenceClassification(2) + model = BertForSequenceClassification(2, BertConfig(32000)) input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) @@ -22,8 +23,9 @@ class TestBert(unittest.TestCase): def test_bert_2(self): from fastNLP.core.const import Const + from fastNLP.modules.encoder._bert import BertConfig - model = BertForMultipleChoice(2) + model = BertForMultipleChoice(2, BertConfig(32000)) input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) @@ -36,8 +38,9 @@ class TestBert(unittest.TestCase): def test_bert_3(self): from fastNLP.core.const import Const + from fastNLP.modules.encoder._bert import BertConfig - model = BertForTokenClassification(7) + model = BertForTokenClassification(7, BertConfig(32000)) input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) @@ -50,8 +53,9 @@ class TestBert(unittest.TestCase): def test_bert_4(self): from fastNLP.core.const import Const + from fastNLP.modules.encoder._bert import BertConfig - model = BertForQuestionAnswering() + model = BertForQuestionAnswering(BertConfig(32000)) input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) diff --git a/test/modules/encoder/test_bert.py b/test/modules/encoder/test_bert.py index 78bcf633..2a799478 100644 --- a/test/modules/encoder/test_bert.py +++ b/test/modules/encoder/test_bert.py @@ -8,8 +8,9 @@ from fastNLP.models.bert import BertModel class TestBert(unittest.TestCase): def test_bert_1(self): - model = BertModel(vocab_size=32000, hidden_size=768, - num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072) + from fastNLP.modules.encoder._bert import BertConfig + config = BertConfig(32000) + model = BertModel(config) input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) @@ -18,4 +19,4 @@ class TestBert(unittest.TestCase): all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask) for layer in all_encoder_layers: self.assertEqual(tuple(layer.shape), (2, 3, 768)) - self.assertEqual(tuple(pooled_output.shape), (2, 768)) \ No newline at end of file + self.assertEqual(tuple(pooled_output.shape), (2, 768)) From 66a7cf084ec7caa9d95319619c4e7cb1720d2818 Mon Sep 17 00:00:00 2001 From: xuyige Date: Sat, 6 Jul 2019 01:36:11 +0800 Subject: [PATCH 4/4] fix bug in test --- fastNLP/io/data_loader/matching.py | 6 +++--- fastNLP/io/dataset_loader.py | 5 +---- test/io/test_dataset_loader.py | 11 ++++++++++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/fastNLP/io/data_loader/matching.py b/fastNLP/io/data_loader/matching.py index 1cde950f..3d131bcb 100644 --- a/fastNLP/io/data_loader/matching.py +++ b/fastNLP/io/data_loader/matching.py @@ -4,9 +4,9 @@ from typing import Union, Dict from ...core.const import Const from ...core.vocabulary import Vocabulary -from ...io.base_loader import DataInfo, DataSetLoader -from ...io.dataset_loader import JsonLoader, CSVLoader -from ...io.file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR +from ..base_loader import DataInfo, DataSetLoader +from ..dataset_loader import JsonLoader, CSVLoader +from ..file_utils import _get_base_url, cached_path, PRETRAINED_BERT_MODEL_DIR from ...modules.encoder._bert import BertTokenizer diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 
26edd8bd..2881e6e9 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -16,8 +16,6 @@ __all__ = [ 'CSVLoader', 'JsonLoader', 'ConllLoader', - 'SNLILoader', - 'SSTLoader', 'PeopleDailyCorpusLoader', 'Conll2003Loader', ] @@ -30,7 +28,6 @@ from ..core.dataset import DataSet from ..core.instance import Instance from .file_reader import _read_csv, _read_json, _read_conll from .base_loader import DataSetLoader, DataInfo -from .data_loader.sst import SSTLoader from ..core.const import Const from ..modules.encoder._bert import BertTokenizer @@ -111,7 +108,7 @@ class PeopleDailyCorpusLoader(DataSetLoader): else: instance = Instance(words=sent_words) data_set.append(instance) - data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len") + data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN) return data_set diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index b091339e..09ad8c83 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -1,7 +1,7 @@ import unittest import os from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, JsonLoader -from fastNLP.io.dataset_loader import SSTLoader, SNLILoader +from fastNLP.io.data_loader import SSTLoader, SNLILoader from reproduction.text_classification.data.yelpLoader import yelpLoader @@ -61,3 +61,12 @@ class TestDatasetLoader(unittest.TestCase): print(info.vocabs) print(info.datasets) os.remove(train), os.remove(test) + + def test_import(self): + import fastNLP + from fastNLP.io import SNLILoader + ds = SNLILoader().process('test/data_for_tests/sample_snli.jsonl', to_lower=True, + get_index=True, seq_len_type='seq_len') + assert 'train' in ds.datasets + assert len(ds.datasets) == 1 + assert len(ds.datasets['train']) == 3
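
---

Taken together, these four patches expose the merged matching loaders directly from `fastNLP.io`. The snippet below is a minimal usage sketch assembled from the API added above and from the import test in the last hunk; it is not part of the patches themselves. The data directory path is a placeholder, and the folder is assumed to contain the default file names (`snli_1.0_train.jsonl`, `snli_1.0_dev.jsonl`, `snli_1.0_test.jsonl`) hard-coded in `SNLILoader.__init__`.

```python
# Minimal sketch of the new matching-loader API (assumes an SNLI folder containing the
# default snli_1.0_{train,dev,test}.jsonl files; the path below is a placeholder).
from fastNLP.core.const import Const
from fastNLP.io import SNLILoader

loader = SNLILoader()

# process() accepts either a folder (resolved against the loader's default file names)
# or a single file path, and returns a DataInfo holding the datasets and vocabularies.
data_info = loader.process(
    '/path/to/snli_1.0/',    # placeholder directory
    to_lower=True,           # lower-case the words1 / words2 fields
    seq_len_type='seq_len',  # add a length field for every field whose name contains Const.INPUT
    get_index=True,          # map tokens to indices with the vocabulary built on the train set
)

train_set = data_info.datasets['train']
words_vocab = data_info.vocabs[Const.INPUT]    # word vocabulary (non-BERT branch)
target_vocab = data_info.vocabs[Const.TARGET]  # label vocabulary built from the train set
print(len(train_set), len(words_vocab), len(target_vocab))
```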