From ad957077185f12a662f8fb9a64c1fdc1fa4464f5 Mon Sep 17 00:00:00 2001
From: Yige Xu <xuyige1996@gmail.com>
Date: Wed, 25 Sep 2019 14:46:30 +0800
Subject: [PATCH] 1. reorganize auto download datasets in io/file_utils.py; 2.
 add auto download for CNNDM and THUCNews; 3. rename XNLI loader and pipe to
 CNXNLI*; 4. update documentation of some download methods.

---
 fastNLP/io/file_utils.py               | 24 +++++++--
 fastNLP/io/loader/classification.py    | 70 +++++---------------------
 fastNLP/io/loader/conll.py             | 10 ++++
 fastNLP/io/loader/coreference.py       | 30 +++++++----
 fastNLP/io/loader/matching.py          | 51 +++++++++++++------
 fastNLP/io/pipe/matching.py            | 18 +++----
 test/io/loader/test_matching_loader.py |  4 +-
 test/io/pipe/test_matching.py          |  6 +--
 8 files changed, 110 insertions(+), 103 deletions(-)

diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py
index 022af0ac..a4abb575 100644
--- a/fastNLP/io/file_utils.py
+++ b/fastNLP/io/file_utils.py
@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = {
 }
 
 DATASET_DIR = {
+    # Classification, English
     'aclImdb': "imdb.zip",
     "yelp-review-full": "yelp_review_full.tar.gz",
     "yelp-review-polarity": "yelp_review_polarity.tar.gz",
+    "sst-2": "SST-2.zip",
+    "sst": "SST.zip",
+
+    # Classification, Chinese
+    "chn-senti-corp": "chn_senti_corp.zip",
+    "weibo-senti-100k": "WeiboSenti100k.zip",
+    "thuc-news": "THUCNews.zip",
+
+    # Matching, English
     "mnli": "MNLI.zip",
     "snli": "SNLI.zip",
     "qnli": "QNLI.zip",
-    "xnli": "XNLI.zip",
-    "sst-2": "SST-2.zip",
-    "sst": "SST.zip",
     "rte": "RTE.zip",
+
+    # Matching, Chinese
+    "cn-xnli": "XNLI.zip",
+
+    # Sequence Labeling, Chinese
     "msra-ner": "MSRA_NER.zip",
     "peopledaily": "peopledaily.zip",
     "weibo-ner": "weibo_NER.zip",
 
+    # Chinese Word Segmentation
     "cws-pku": 'cws_pku.zip',
     "cws-cityu": "cws_cityu.zip",
     "cws-as": 'cws_as.zip',
     "cws-msra": 'cws_msra.zip',
 
-    "chn-senti-corp" : "chn_senti_corp.zip",
-    "weibo-senti-100k" : "WeiboSenti100k.zip"
+    # Summarization, English
+    "ext-cnndm": "ext-cnndm.zip",
+
 }
 
 PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py
index ca9b6107..004f3ebd 100644
--- a/fastNLP/io/loader/classification.py
+++ b/fastNLP/io/loader/classification.py
@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader):
         """
         从path中读取数据
 
-        :param path:
-        :return:
-        """
-        ds = DataSet()
-        with open(path, 'r', encoding='utf-8') as f:
-            f.readline()
-            for line in f:
-                line = line.strip()
-                tab_index = line.index('\t')
-                if tab_index!=-1:
-                    target = line[:tab_index]
-                    raw_chars = line[tab_index+1:]
-                    if raw_chars:
-                        ds.append(Instance(raw_chars=raw_chars, target=target))
-        return ds
-
-    def download(self)->str:
-        """
-        自动下载数据，该数据取自https://github.com/pengming617/bert_classification/tree/master/data，在
-        https://arxiv.org/pdf/1904.09223.pdf与https://arxiv.org/pdf/1906.08101.pdf有使用
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-
-
-class ChnSentiCorpLoader(Loader):
-    """
-    支持读取的数据的格式为，第一行为标题(具体内容会被忽略)，之后一行为一个sample，第一个制表符之前被认为是label，第
-    一个制表符及之后认为是句子
-
-    Example::
-
-        label	raw_chars
-        1	這間酒店環境和服務態度亦算不錯,但房間空間太小~~
-        1	<荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道...
-        0	商品的不足暂时还没发现，京东的订单处理速度实在.......周二就打包完成，周五才发货...
-
-    读取后的DataSet具有以下的field
-
-    .. csv-table::
-        :header: "raw_chars", "target"
-
-        "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1"
-        "<荐书> 推荐所有喜欢<红楼>...", "1"
-        "..."
-
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def _load(self, path: str):
-        """
-        从path中读取数据
-
         :param path:
         :return:
         """
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader):
                 tab_index = line.index('\t')
                 if tab_index != -1:
                     target = line[:tab_index]
-                    raw_chars = line[tab_index + 1:]
+                    raw_chars = line[tab_index+1:]
                     if raw_chars:
                         ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader):
                     ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
 
+    def download(self) -> str:
+        """
+        自动下载数据，该数据取自
+
+        http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews
+
+        :return:
+        """
+        output_dir = self._get_dataset_path('thuc-news')
+        return output_dir
+
 
 class WeiboSenti100kLoader(Loader):
     """
diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py
index 97842338..96aefa17 100644
--- a/fastNLP/io/loader/conll.py
+++ b/fastNLP/io/loader/conll.py
@@ -316,6 +316,16 @@ class CTBLoader(Loader):
         dataset = self.loader._load(path)
         return dataset
 
+    def download(self):
+        """
+        由于版权限制，不能提供自动下载功能。可参考
+
+        https://catalog.ldc.upenn.edu/LDC2013T21
+
+        :return:
+        """
+        raise RuntimeError("CTB cannot be downloaded automatically.")
+
 
 class CNNERLoader(Loader):
     def _load(self, path: str):
diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index 4293f65a..9f120638 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -13,23 +13,21 @@ from .json import JsonLoader
 
 class CoReferenceLoader(JsonLoader):
     """
-        原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息，speakers包含每句话的说话者信息，cluster是指向现实中同一个事物的聚集，sentences是文本信息内容。
+    原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息，speakers包含每句话的说话者信息，cluster是指向现实中同一个事物的聚集，sentences是文本信息内容。
 
-        Example::
+    Example::
 
-           {"doc_key":"bc/cctv/00/cctv_001",
-           "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
-           "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
-           "sentences":[["I","have","an","apple"],["It","is","good"]]
-           }
+       {"doc_key":"bc/cctv/00/cctv_001",
+       "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
+       "clusters":"[[[2,3],[4,5]],[[7,8],[18,20]]]",
+       "sentences":[["I","have","an","apple"],["It","is","good"]]
+       }
 
-        读取预处理好的Conll2012数据。
+    读取预处理好的Conll2012数据。
 
-        """
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
-        # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),
-        # "clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
         self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2),
                        "sentences": Const.RAW_WORDS(3)}
 
@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader):
                 ins = d
             dataset.append(Instance(**ins))
         return dataset
+
+    def download(self):
+        """
+        由于版权限制，不能提供自动下载功能。可参考
+
+        https://www.aclweb.org/anthology/W12-4501
+
+        :return:
+        """
+        raise RuntimeError("CoReference cannot be downloaded automatically.")
diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py
index b9724126..80889507 100644
--- a/fastNLP/io/loader/matching.py
+++ b/fastNLP/io/loader/matching.py
@@ -7,7 +7,7 @@ __all__ = [
     "RTELoader",
     "QuoraLoader",
     "BQCorpusLoader",
-    "XNLILoader",
+    "CNXNLILoader",
     "LCQMCLoader"
 ]
 
@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader):
         """
         从指定一个或多个路径中的文件中读取数据，返回 :class:`~fastNLP.io.DataBundle` 。
 
-        读取的field根据ConllLoader初始化时传入的headers决定。
+        读取的field根据Loader初始化时传入的field决定。
 
         :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl
             和snli_1.0_test.jsonl三个文件。
 
-        :return: 返回的:class:`~fastNLP.io.DataBundle`
+        :return: 返回的 :class:`~fastNLP.io.DataBundle`
         """
         _paths = {}
         if paths is None:
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader):
         """
         如果您的实验使用到了该数据，请引用
 
-        .. todo::
-            补充
+        https://openreview.net/pdf?id=rJ4km2R5t7
 
         :return:
         """
@@ -276,6 +275,13 @@ class RTELoader(Loader):
         return ds
     
     def download(self):
+        """
+        如果您的实验使用到了该数据，请引用GLUE Benchmark
+
+        https://openreview.net/pdf?id=rJ4km2R5t7
+
+        :return:
+        """
         return self._get_dataset_path('rte')
 
 
@@ -321,10 +327,17 @@ class QuoraLoader(Loader):
         return ds
     
     def download(self):
+        """
+        由于版权限制，不能提供自动下载功能。可参考
+
+        https://www.kaggle.com/c/quora-question-pairs/data
+
+        :return:
+        """
         raise RuntimeError("Quora cannot be downloaded automatically.")
 
 
-class XNLILoader(Loader):
+class CNXNLILoader(Loader):
     """
     别名：
     数据集简介：中文句对NLI（本为multi-lingual的数据集，但是这里只取了中文的数据集）。原句子已被MOSES tokenizer处理
@@ -341,7 +354,7 @@ class XNLILoader(Loader):
     """
 
     def __init__(self):
-        super(XNLILoader, self).__init__()
+        super(CNXNLILoader, self).__init__()
 
     def _load(self, path: str = None):
         csv_loader = CSVLoader(sep='\t')
@@ -384,7 +397,7 @@ class XNLILoader(Loader):
         https://arxiv.org/pdf/1809.05053.pdf 有使用
         :return:
         """
-        output_dir = self._get_dataset_path('xnli')
+        output_dir = self._get_dataset_path('cn-xnli')
         return output_dir
 
 
@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader):
                     ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
+    def download(self):
+        """
+        由于版权限制，不能提供自动下载功能。可参考
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("BQCorpus cannot be downloaded automatically.")
+
 
 class LCQMCLoader(Loader):
     """
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader):
                     ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
-    '''
-    def download(self)->str:
+    def download(self):
         """
-        自动下载数据，该数据取自论文 LCQMC: A Large-scale Chinese Question Matching Corpus.
-        InProceedings of the 27thInternational Conference on Computational Linguistics. 1952–1962.
+        由于版权限制，不能提供自动下载功能。可参考
+
+        https://github.com/ymcui/Chinese-BERT-wwm
 
         :return:
         """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-    '''
+        raise RuntimeError("LCQMC cannot be downloaded automatically.")
 
 
diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py
index 7747dec3..90cf17df 100644
--- a/fastNLP/io/pipe/matching.py
+++ b/fastNLP/io/pipe/matching.py
@@ -7,7 +7,7 @@ __all__ = [
     "QuoraBertPipe",
     "QNLIBertPipe",
     "MNLIBertPipe",
-    "XNLIBertPipe",
+    "CNXNLIBertPipe",
     "BQCorpusBertPipe",
     "LCQMCBertPipe",
     "MatchingPipe",
@@ -16,7 +16,7 @@ __all__ = [
     "QuoraPipe",
     "QNLIPipe",
     "MNLIPipe",
-    "XNLIPipe",
+    "CNXNLIPipe",
     "BQCorpusPipe",
     "LCQMCPipe",
 ]
@@ -25,7 +25,7 @@ import warnings
 
 from .pipe import Pipe
 from .utils import get_tokenizer
-from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader
+from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
 from ...core.const import Const
 from ...core.vocabulary import Vocabulary
 from ...core._logger import logger
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe):
         return data_bundle
 
 
-class XNLIPipe(MatchingPipe):
-    def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
-        data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle)
+class CNXNLIPipe(MatchingPipe):
+    def process_from_file(self, paths=None):
+        data_bundle = CNXNLILoader().load(paths)
+        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle) #使中文数据的field
         data_bundle = self.process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe):
         return data_bundle
 
 
-class XNLIBertPipe(MatchingBertPipe):
+class CNXNLIBertPipe(MatchingBertPipe):
     def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
+        data_bundle = CNXNLILoader().load(paths)
         data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
         data_bundle = self.process(data_bundle)
diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py
index 5700ab80..abe21aa9 100644
--- a/test/io/loader/test_matching_loader.py
+++ b/test/io/loader/test_matching_loader.py
@@ -5,7 +5,7 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \
-    BQCorpusLoader, XNLILoader, LCQMCLoader
+    BQCorpusLoader, CNXNLILoader, LCQMCLoader
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase):
             'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True),
             'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False),
         }
         for k, v in data_set_dict.items():
diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py
index 6d872692..52d372d5 100644
--- a/test/io/pipe/test_matching.py
+++ b/test/io/pipe/test_matching.py
@@ -4,9 +4,9 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \
-    XNLIPipe, BQCorpusPipe, LCQMCPipe
+    CNXNLIPipe, BQCorpusPipe, LCQMCPipe
 from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \
-    XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
+    CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase):
             'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True),
             'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False),
         }
         for k, v in data_set_dict.items():