
1. reorganize the auto-download dataset registry in io/file_utils.py; 2. add auto download for CNNDM and THUCNews; 3. rename the XNLI loader and pipe to CNXNLI*; 4. update the documentation of several download methods.

tags/v0.4.10
Yige Xu 5 years ago
commit ad95707718
8 changed files with 110 additions and 103 deletions
  1. fastNLP/io/file_utils.py (+19, -5)
  2. fastNLP/io/loader/classification.py (+12, -58)
  3. fastNLP/io/loader/conll.py (+10, -0)
  4. fastNLP/io/loader/coreference.py (+19, -11)
  5. fastNLP/io/loader/matching.py (+36, -15)
  6. fastNLP/io/pipe/matching.py (+9, -9)
  7. test/io/loader/test_matching_loader.py (+2, -2)
  8. test/io/pipe/test_matching.py (+3, -3)

fastNLP/io/file_utils.py (+19, -5)

@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = {
 }
 
 DATASET_DIR = {
+    # Classification, English
     'aclImdb': "imdb.zip",
     "yelp-review-full": "yelp_review_full.tar.gz",
     "yelp-review-polarity": "yelp_review_polarity.tar.gz",
+    "sst-2": "SST-2.zip",
+    "sst": "SST.zip",
+
+    # Classification, Chinese
+    "chn-senti-corp": "chn_senti_corp.zip",
+    "weibo-senti-100k": "WeiboSenti100k.zip",
+    "thuc-news": "THUCNews.zip",
+
+    # Matching, English
     "mnli": "MNLI.zip",
     "snli": "SNLI.zip",
     "qnli": "QNLI.zip",
     "xnli": "XNLI.zip",
-    "sst-2": "SST-2.zip",
-    "sst": "SST.zip",
     "rte": "RTE.zip",
+
+    # Matching, Chinese
+    "cn-xnli": "XNLI.zip",
+
+    # Sequence Labeling, Chinese
     "msra-ner": "MSRA_NER.zip",
     "peopledaily": "peopledaily.zip",
     "weibo-ner": "weibo_NER.zip",
 
+    # Chinese Word Segmentation
     "cws-pku": 'cws_pku.zip',
     "cws-cityu": "cws_cityu.zip",
     "cws-as": 'cws_as.zip',
     "cws-msra": 'cws_msra.zip',
 
-    "chn-senti-corp" : "chn_senti_corp.zip",
-    "weibo-senti-100k" : "WeiboSenti100k.zip"
+    # Summarization, English
+    "ext-cnndm": "ext-cnndm.zip",
+
 }
 
 PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
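
Each DATASET_DIR entry maps a dataset key to the archive that is fetched and unpacked into the local fastNLP cache; a loader's download() resolves its key through this table via _get_dataset_path. A small sketch of the lookup (assuming fastNLP at this commit):

    from fastNLP.io.file_utils import DATASET_DIR

    # The new Chinese matching key reuses the same archive as the English 'xnli' key,
    # while the new summarization key gets its own archive.
    print(DATASET_DIR['cn-xnli'])    # XNLI.zip
    print(DATASET_DIR['ext-cnndm'])  # ext-cnndm.zip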


fastNLP/io/loader/classification.py (+12, -58)

@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader):
         """
         Read data from path.
 
         :param path:
         :return:
         """
-        ds = DataSet()
-        with open(path, 'r', encoding='utf-8') as f:
-            f.readline()
-            for line in f:
-                line = line.strip()
-                tab_index = line.index('\t')
-                if tab_index!=-1:
-                    target = line[:tab_index]
-                    raw_chars = line[tab_index+1:]
-                    if raw_chars:
-                        ds.append(Instance(raw_chars=raw_chars, target=target))
-        return ds
-
-    def download(self)->str:
-        """
-        Automatically download the data, which comes from https://github.com/pengming617/bert_classification/tree/master/data
-        and is used in https://arxiv.org/pdf/1904.09223.pdf and https://arxiv.org/pdf/1906.08101.pdf
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-
-
-class ChnSentiCorpLoader(Loader):
-    """
-    The supported format: the first line is a header (its content is ignored); each following line is one
-    sample, where everything before the first tab is the label and everything after it is the sentence.
-
-    Example::
-
-        label   raw_chars
-        1   這間酒店環境和服務態度亦算不錯,但房間空間太小~~
-        1   <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道...
-        0   商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...
-
-    The loaded DataSet has the following fields:
-
-    .. csv-table::
-        :header: "raw_chars", "target"
-
-        "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1"
-        "<荐书> 推荐所有喜欢<红楼>...", "1"
-        "..."
-
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def _load(self, path: str):
-        """
-        Read data from path.
-
-        :param path:
-        :return:
-        """
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader):
                 tab_index = line.index('\t')
                 if tab_index != -1:
                     target = line[:tab_index]
-                    raw_chars = line[tab_index + 1:]
+                    raw_chars = line[tab_index+1:]
                     if raw_chars:
                         ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader):
                 ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
 
+    def download(self) -> str:
+        """
+        Automatically download the data, which comes from
+
+        http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews
+
+        :return:
+        """
+        output_dir = self._get_dataset_path('thuc-news')
+        return output_dir
+
 
 class WeiboSenti100kLoader(Loader):
     """


fastNLP/io/loader/conll.py (+10, -0)

@@ -316,6 +316,16 @@ class CTBLoader(Loader):
         dataset = self.loader._load(path)
         return dataset
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://catalog.ldc.upenn.edu/LDC2013T21
+
+        :return:
+        """
+        raise RuntimeError("CTB cannot be downloaded automatically.")
+
 
 class CNNERLoader(Loader):
     def _load(self, path: str):
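
The download() methods added for license-restricted corpora fail loudly rather than silently doing nothing. A sketch of the calling pattern, with a hypothetical local path standing in for a manually obtained copy:

    from fastNLP.io.loader.conll import CTBLoader

    loader = CTBLoader()
    try:
        loader.download()  # always raises: CTB is distributed under an LDC license
    except RuntimeError as err:
        print(err)
    # After obtaining the data manually:
    # data_bundle = loader.load('path/to/ctb')  # hypothetical path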


fastNLP/io/loader/coreference.py (+19, -11)

@@ -13,23 +13,21 @@ from .json import JsonLoader
 
 class CoReferenceLoader(JsonLoader):
     """
-        Each line of the raw data should be a JSON object, where doc_key carries the document's category, speakers the speaker of each sentence, clusters the groups of mentions that refer to the same real-world entity, and sentences the text itself.
+    Each line of the raw data should be a JSON object, where doc_key carries the document's category, speakers the speaker of each sentence, clusters the groups of mentions that refer to the same real-world entity, and sentences the text itself.
 
-        Example::
+    Example::
 
-            {"doc_key":"bc/cctv/00/cctv_001",
-             "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
-             "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
-             "sentences":[["I","have","an","apple"],["It","is","good"]]
-            }
+        {"doc_key":"bc/cctv/00/cctv_001",
+         "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
+         "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
+         "sentences":[["I","have","an","apple"],["It","is","good"]]
+        }
 
-        Reads preprocessed CoNLL-2012 data.
+    Reads preprocessed CoNLL-2012 data.
 
-        """
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
         # self.fields = {"doc_key": Const.INPUTS(0), "speakers": Const.INPUTS(1),
         #                "clusters": Const.TARGET, "sentences": Const.INPUTS(2)}
         self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2),
                        "sentences": Const.RAW_WORDS(3)}

@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader):
                 ins = d
             dataset.append(Instance(**ins))
         return dataset
+
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.aclweb.org/anthology/W12-4501
+
+        :return:
+        """
+        raise RuntimeError("CoReference cannot be downloaded automatically.")
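
CoReferenceLoader follows the same pattern: the JSON keys are mapped to raw-words fields at construction time, and download() points at the CoNLL-2012 shared task instead of fetching anything. A sketch with a hypothetical local file:

    from fastNLP.io.loader.coreference import CoReferenceLoader

    loader = CoReferenceLoader()  # fields default to the doc_key/speakers/clusters/sentences mapping
    # download() raises by design; obtain the preprocessed CoNLL-2012 data manually, then:
    # data_bundle = loader.load('train.english.jsonlines')  # hypothetical file name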

fastNLP/io/loader/matching.py (+36, -15)

@@ -7,7 +7,7 @@ __all__ = [
     "RTELoader",
     "QuoraLoader",
     "BQCorpusLoader",
-    "XNLILoader",
+    "CNXNLILoader",
     "LCQMCLoader"
 ]

@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader):
         """
         Read data from the files under one or more given paths and return a :class:`~fastNLP.io.DataBundle`.
 
-        The fields that are read are determined by the headers passed when the ConllLoader is initialized.
+        The fields that are read are determined by the fields passed when the Loader is initialized.
 
         :param str paths: a directory under which the three files snli_1.0_train.jsonl, snli_1.0_dev.jsonl
             and snli_1.0_test.jsonl are expected.
 
-        :return: the returned:class:`~fastNLP.io.DataBundle`
+        :return: the returned :class:`~fastNLP.io.DataBundle`
         """
         _paths = {}
         if paths is None:
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader):
         """
         If your experiments use this dataset, please cite
 
-        .. todo::
-            to be added
+        https://arxiv.org/pdf/1809.05053.pdf
 
         :return:
         """
@@ -276,6 +275,13 @@ class RTELoader(Loader):
         return ds
 
     def download(self):
+        """
+        If your experiments use this dataset, please cite the GLUE Benchmark:
+
+        https://openreview.net/pdf?id=rJ4km2R5t7
+
+        :return:
+        """
         return self._get_dataset_path('rte')


@@ -321,10 +327,17 @@ class QuoraLoader(Loader):
         return ds
 
     def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.kaggle.com/c/quora-question-pairs/data
+
+        :return:
+        """
         raise RuntimeError("Quora cannot be downloaded automatically.")
 
 
-class XNLILoader(Loader):
+class CNXNLILoader(Loader):
     """
     Alias:
     Dataset overview: Chinese sentence-pair NLI (originally a multi-lingual dataset; only the Chinese part is used here). The original sentences have been processed with the MOSES tokenizer.
@@ -341,7 +354,7 @@ class CNXNLILoader(Loader):
     """
 
     def __init__(self):
-        super(XNLILoader, self).__init__()
+        super(CNXNLILoader, self).__init__()
 
     def _load(self, path: str = None):
         csv_loader = CSVLoader(sep='\t')
@@ -384,7 +397,7 @@ class CNXNLILoader(Loader):
        is used in https://arxiv.org/pdf/1809.05053.pdf
         :return:
         """
-        output_dir = self._get_dataset_path('xnli')
+        output_dir = self._get_dataset_path('cn-xnli')
         return output_dir


@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("BQCorpus cannot be downloaded automatically.")
+
 
 class LCQMCLoader(Loader):
     """
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
-    '''
-    def download(self)->str:
-        """
-        Automatically download the data, which comes from the paper LCQMC: A Large-scale Chinese Question Matching Corpus.
-        In Proceedings of the 27th International Conference on Computational Linguistics. 1952-1962.
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-    '''
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("LCQMC cannot be downloaded automatically.")
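
On the loader side, the rename pairs with the new 'cn-xnli' key, so the Chinese XNLI data can now be downloaded and loaded directly. A sketch (assuming fastNLP at this commit):

    from fastNLP.io.loader.matching import CNXNLILoader

    loader = CNXNLILoader()
    data_dir = loader.download()         # resolves 'cn-xnli' (the XNLI.zip archive)
    data_bundle = loader.load(data_dir)
    print(data_bundle)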



fastNLP/io/pipe/matching.py (+9, -9)

@@ -7,7 +7,7 @@ __all__ = [
     "QuoraBertPipe",
     "QNLIBertPipe",
     "MNLIBertPipe",
-    "XNLIBertPipe",
+    "CNXNLIBertPipe",
     "BQCorpusBertPipe",
     "LCQMCBertPipe",
     "MatchingPipe",
@@ -16,7 +16,7 @@ __all__ = [
     "QuoraPipe",
     "QNLIPipe",
     "MNLIPipe",
-    "XNLIPipe",
+    "CNXNLIPipe",
     "BQCorpusPipe",
     "LCQMCPipe",
 ]
@@ -25,7 +25,7 @@ import warnings
 
 from .pipe import Pipe
 from .utils import get_tokenizer
-from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader
+from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
 from ...core.const import Const
 from ...core.vocabulary import Vocabulary
 from ...core._logger import logger
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe):
         return data_bundle
 
 
-class XNLIPipe(MatchingPipe):
-    def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
-        data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle)
+class CNXNLIPipe(MatchingPipe):
+    def process_from_file(self, paths=None):
+        data_bundle = CNXNLILoader().load(paths)
+        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)  # rename the fields of the Chinese data
         data_bundle = self.process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe):
         return data_bundle
 
 
-class XNLIBertPipe(MatchingBertPipe):
+class CNXNLIBertPipe(MatchingBertPipe):
     def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
+        data_bundle = CNXNLILoader().load(paths)
         data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
         data_bundle = self.process(data_bundle)
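
The BERT-style pipe chains the renamed loader with the granularize/rename steps before the shared MatchingBertPipe processing. A usage sketch with a placeholder path:

    from fastNLP.io.pipe.matching import CNXNLIBertPipe

    # 'path/to/XNLI' is a placeholder for a local copy of the Chinese XNLI data.
    data_bundle = CNXNLIBertPipe().process_from_file('path/to/XNLI')
    print(data_bundle.get_dataset('train')[:2])  # first two processed instances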


test/io/loader/test_matching_loader.py (+2, -2)

@@ -5,7 +5,7 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \
-    BQCorpusLoader, XNLILoader, LCQMCLoader
+    BQCorpusLoader, CNXNLILoader, LCQMCLoader
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase):
             'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True),
             'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False),
         }
         for k, v in data_set_dict.items():
for k, v in data_set_dict.items():


test/io/pipe/test_matching.py (+3, -3)

@@ -4,9 +4,9 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \
-    XNLIPipe, BQCorpusPipe, LCQMCPipe
+    CNXNLIPipe, BQCorpusPipe, LCQMCPipe
 from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \
-    XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
+    CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase):
             'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True),
             'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False),
         }
         for k, v in data_set_dict.items():

