From ad957077185f12a662f8fb9a64c1fdc1fa4464f5 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Wed, 25 Sep 2019 14:46:30 +0800 Subject: [PATCH] 1. reorganize auto download datasets in io/file_utils.py; 2. add auto download for CNNDM and THUCNews; 3. rename XNLI loader and pipe to CNXNLI*; 4. update documents in some download method. --- fastNLP/io/file_utils.py | 24 +++++++-- fastNLP/io/loader/classification.py | 70 +++++--------------------- fastNLP/io/loader/conll.py | 10 ++++ fastNLP/io/loader/coreference.py | 30 +++++++---- fastNLP/io/loader/matching.py | 51 +++++++++++++------ fastNLP/io/pipe/matching.py | 18 +++---- test/io/loader/test_matching_loader.py | 4 +- test/io/pipe/test_matching.py | 6 +-- 8 files changed, 110 insertions(+), 103 deletions(-) diff --git a/fastNLP/io/file_utils.py b/fastNLP/io/file_utils.py index 022af0ac..a4abb575 100644 --- a/fastNLP/io/file_utils.py +++ b/fastNLP/io/file_utils.py @@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = { } DATASET_DIR = { + # Classification, English 'aclImdb': "imdb.zip", "yelp-review-full": "yelp_review_full.tar.gz", "yelp-review-polarity": "yelp_review_polarity.tar.gz", + "sst-2": "SST-2.zip", + "sst": "SST.zip", + + # Classification, Chinese + "chn-senti-corp": "chn_senti_corp.zip", + "weibo-senti-100k": "WeiboSenti100k.zip", + "thuc-news": "THUCNews.zip", + + # Matching, English "mnli": "MNLI.zip", "snli": "SNLI.zip", "qnli": "QNLI.zip", - "xnli": "XNLI.zip", - "sst-2": "SST-2.zip", - "sst": "SST.zip", "rte": "RTE.zip", + + # Matching, Chinese + "cn-xnli": "XNLI.zip", + + # Sequence Labeling, Chinese "msra-ner": "MSRA_NER.zip", "peopledaily": "peopledaily.zip", "weibo-ner": "weibo_NER.zip", + # Chinese Word Segmentation "cws-pku": 'cws_pku.zip', "cws-cityu": "cws_cityu.zip", "cws-as": 'cws_as.zip', "cws-msra": 'cws_msra.zip', - "chn-senti-corp" : "chn_senti_corp.zip", - "weibo-senti-100k" : "WeiboSenti100k.zip" + # Summarization, English + "ext-cnndm": "ext-cnndm.zip", + } PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, diff --git a/fastNLP/io/loader/classification.py b/fastNLP/io/loader/classification.py index ca9b6107..004f3ebd 100644 --- a/fastNLP/io/loader/classification.py +++ b/fastNLP/io/loader/classification.py @@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader): """ 从path中读取数据 - :param path: - :return: - """ - ds = DataSet() - with open(path, 'r', encoding='utf-8') as f: - f.readline() - for line in f: - line = line.strip() - tab_index = line.index('\t') - if tab_index!=-1: - target = line[:tab_index] - raw_chars = line[tab_index+1:] - if raw_chars: - ds.append(Instance(raw_chars=raw_chars, target=target)) - return ds - - def download(self)->str: - """ - 自动下载数据,该数据取自https://github.com/pengming617/bert_classification/tree/master/data,在 - https://arxiv.org/pdf/1904.09223.pdf与https://arxiv.org/pdf/1906.08101.pdf有使用 - - :return: - """ - output_dir = self._get_dataset_path('chn-senti-corp') - return output_dir - - -class ChnSentiCorpLoader(Loader): - """ - 支持读取的数据的格式为,第一行为标题(具体内容会被忽略),之后一行为一个sample,第一个制表符之前被认为是label,第 - 一个制表符及之后认为是句子 - - Example:: - - label raw_chars - 1 這間酒店環境和服務態度亦算不錯,但房間空間太小~~ - 1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道... - 0 商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货... - - 读取后的DataSet具有以下的field - - .. csv-table:: - :header: "raw_chars", "target" - - "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1" - "<荐书> 推荐所有喜欢<红楼>...", "1" - "..." - - """ - - def __init__(self): - super().__init__() - - def _load(self, path: str): - """ - 从path中读取数据 - :param path: :return: """ @@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader): tab_index = line.index('\t') if tab_index != -1: target = line[:tab_index] - raw_chars = line[tab_index + 1:] + raw_chars = line[tab_index+1:] if raw_chars: ds.append(Instance(raw_chars=raw_chars, target=target)) return ds @@ -486,6 +429,17 @@ class THUCNewsLoader(Loader): ds.append(Instance(raw_chars=raw_chars, target=target)) return ds + def download(self) -> str: + """ + 自动下载数据,该数据取自 + + http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews + + :return: + """ + output_dir = self._get_dataset_path('thuc-news') + return output_dir + class WeiboSenti100kLoader(Loader): """ diff --git a/fastNLP/io/loader/conll.py b/fastNLP/io/loader/conll.py index 97842338..96aefa17 100644 --- a/fastNLP/io/loader/conll.py +++ b/fastNLP/io/loader/conll.py @@ -316,6 +316,16 @@ class CTBLoader(Loader): dataset = self.loader._load(path) return dataset + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://catalog.ldc.upenn.edu/LDC2013T21 + + :return: + """ + raise RuntimeError("CTB cannot be downloaded automatically.") + class CNNERLoader(Loader): def _load(self, path: str): diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py index 4293f65a..9f120638 100644 --- a/fastNLP/io/loader/coreference.py +++ b/fastNLP/io/loader/coreference.py @@ -13,23 +13,21 @@ from .json import JsonLoader class CoReferenceLoader(JsonLoader): """ - 原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 + 原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 - Example:: + Example:: - {"doc_key":"bc/cctv/00/cctv_001", - "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]", - "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]", - "sentences":[["I","have","an","apple"],["It","is","good"]] - } + {"doc_key":"bc/cctv/00/cctv_001", + "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]", + "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]", + "sentences":[["I","have","an","apple"],["It","is","good"]] + } - 读取预处理好的Conll2012数据。 + 读取预处理好的Conll2012数据。 - """ + """ def __init__(self, fields=None, dropna=False): super().__init__(fields, dropna) - # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1), - # "clusters":Const.TARGET,"sentences":Const.INPUTS(2)} self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2), "sentences": Const.RAW_WORDS(3)} @@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader): ins = d dataset.append(Instance(**ins)) return dataset + + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://www.aclweb.org/anthology/W12-4501 + + :return: + """ + raise RuntimeError("CoReference cannot be downloaded automatically.") diff --git a/fastNLP/io/loader/matching.py b/fastNLP/io/loader/matching.py index b9724126..80889507 100644 --- a/fastNLP/io/loader/matching.py +++ b/fastNLP/io/loader/matching.py @@ -7,7 +7,7 @@ __all__ = [ "RTELoader", "QuoraLoader", "BQCorpusLoader", - "XNLILoader", + "CNXNLILoader", "LCQMCLoader" ] @@ -135,12 +135,12 @@ class SNLILoader(JsonLoader): """ 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 - 读取的field根据ConllLoader初始化时传入的headers决定。 + 读取的field根据Loader初始化时传入的field决定。 :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl 和snli_1.0_test.jsonl三个文件。 - :return: 返回的:class:`~fastNLP.io.DataBundle` + :return: 返回的 :class:`~fastNLP.io.DataBundle` """ _paths = {} if paths is None: @@ -222,8 +222,7 @@ class QNLILoader(JsonLoader): """ 如果您的实验使用到了该数据,请引用 - .. todo:: - 补充 + https://arxiv.org/pdf/1809.05053.pdf :return: """ @@ -276,6 +275,13 @@ class RTELoader(Loader): return ds def download(self): + """ + 如果您的实验使用到了该数据,请引用GLUE Benchmark + + https://openreview.net/pdf?id=rJ4km2R5t7 + + :return: + """ return self._get_dataset_path('rte') @@ -321,10 +327,17 @@ class QuoraLoader(Loader): return ds def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://www.kaggle.com/c/quora-question-pairs/data + + :return: + """ raise RuntimeError("Quora cannot be downloaded automatically.") -class XNLILoader(Loader): +class CNXNLILoader(Loader): """ 别名: 数据集简介:中文句对NLI(本为multi-lingual的数据集,但是这里只取了中文的数据集)。原句子已被MOSES tokenizer处理 @@ -341,7 +354,7 @@ class XNLILoader(Loader): """ def __init__(self): - super(XNLILoader, self).__init__() + super(CNXNLILoader, self).__init__() def _load(self, path: str = None): csv_loader = CSVLoader(sep='\t') @@ -384,7 +397,7 @@ class XNLILoader(Loader): https://arxiv.org/pdf/1809.05053.pdf 有使用 :return: """ - output_dir = self._get_dataset_path('xnli') + output_dir = self._get_dataset_path('cn-xnli') return output_dir @@ -423,6 +436,16 @@ class BQCorpusLoader(Loader): ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) return ds + def download(self): + """ + 由于版权限制,不能提供自动下载功能。可参考 + + https://github.com/ymcui/Chinese-BERT-wwm + + :return: + """ + raise RuntimeError("BQCorpus cannot be downloaded automatically.") + class LCQMCLoader(Loader): """ @@ -461,16 +484,14 @@ class LCQMCLoader(Loader): ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) return ds - ''' - def download(self)->str: + def download(self): """ - 自动下载数据,该数据取自论文 LCQMC: A Large-scale Chinese Question Matching Corpus. - InProceedings of the 27thInternational Conference on Computational Linguistics. 1952–1962. + 由于版权限制,不能提供自动下载功能。可参考 + + https://github.com/ymcui/Chinese-BERT-wwm :return: """ - output_dir = self._get_dataset_path('chn-senti-corp') - return output_dir - ''' + raise RuntimeError("LCQMC cannot be downloaded automatically.") diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 7747dec3..90cf17df 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -7,7 +7,7 @@ __all__ = [ "QuoraBertPipe", "QNLIBertPipe", "MNLIBertPipe", - "XNLIBertPipe", + "CNXNLIBertPipe", "BQCorpusBertPipe", "LCQMCBertPipe", "MatchingPipe", @@ -16,7 +16,7 @@ __all__ = [ "QuoraPipe", "QNLIPipe", "MNLIPipe", - "XNLIPipe", + "CNXNLIPipe", "BQCorpusPipe", "LCQMCPipe", ] @@ -25,7 +25,7 @@ import warnings from .pipe import Pipe from .utils import get_tokenizer -from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader +from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader from ...core.const import Const from ...core.vocabulary import Vocabulary from ...core._logger import logger @@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe): return data_bundle -class XNLIPipe(MatchingPipe): - def process_from_file(self, paths = None): - data_bundle = XNLILoader().load(paths) - data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle) +class CNXNLIPipe(MatchingPipe): + def process_from_file(self, paths=None): + data_bundle = CNXNLILoader().load(paths) + data_bundle = GranularizePipe(task='XNLI').process(data_bundle) data_bundle = RenamePipe().process(data_bundle) #使中文数据的field data_bundle = self.process(data_bundle) data_bundle = RenamePipe().process(data_bundle) @@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe): return data_bundle -class XNLIBertPipe(MatchingBertPipe): +class CNXNLIBertPipe(MatchingBertPipe): def process_from_file(self, paths = None): - data_bundle = XNLILoader().load(paths) + data_bundle = CNXNLILoader().load(paths) data_bundle = GranularizePipe(task='XNLI').process(data_bundle) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) data_bundle = self.process(data_bundle) diff --git a/test/io/loader/test_matching_loader.py b/test/io/loader/test_matching_loader.py index 5700ab80..abe21aa9 100644 --- a/test/io/loader/test_matching_loader.py +++ b/test/io/loader/test_matching_loader.py @@ -5,7 +5,7 @@ import os from fastNLP.io import DataBundle from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \ - BQCorpusLoader, XNLILoader, LCQMCLoader + BQCorpusLoader, CNXNLILoader, LCQMCLoader @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") @@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase): 'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True), 'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False), 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False), - 'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False), + 'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False), 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False), } for k, v in data_set_dict.items(): diff --git a/test/io/pipe/test_matching.py b/test/io/pipe/test_matching.py index 6d872692..52d372d5 100644 --- a/test/io/pipe/test_matching.py +++ b/test/io/pipe/test_matching.py @@ -4,9 +4,9 @@ import os from fastNLP.io import DataBundle from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \ - XNLIPipe, BQCorpusPipe, LCQMCPipe + CNXNLIPipe, BQCorpusPipe, LCQMCPipe from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \ - XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe + CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") @@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase): 'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True), 'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True), 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False), - 'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False), + 'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False), 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False), } for k, v in data_set_dict.items():