
1. Reorganize the auto-download dataset registry in io/file_utils.py; 2. add auto-download for CNNDM and THUCNews; 3. rename the XNLI loader and pipe to CNXNLI*; 4. update the documentation of several download methods.
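For code that imports the renamed classes, item 3 is a pure rename with no intended behavior change; a minimal migration sketch::

    # before this commit
    # from fastNLP.io.loader.matching import XNLILoader
    # from fastNLP.io.pipe.matching import XNLIPipe, XNLIBertPipe

    # after this commit
    from fastNLP.io.loader.matching import CNXNLILoader
    from fastNLP.io.pipe.matching import CNXNLIPipe, CNXNLIBertPipe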

tags/v0.4.10
Yige Xu, 5 years ago
commit ad95707718
8 changed files with 110 additions and 103 deletions

1. fastNLP/io/file_utils.py (+19, -5)
2. fastNLP/io/loader/classification.py (+12, -58)
3. fastNLP/io/loader/conll.py (+10, -0)
4. fastNLP/io/loader/coreference.py (+19, -11)
5. fastNLP/io/loader/matching.py (+36, -15)
6. fastNLP/io/pipe/matching.py (+9, -9)
7. test/io/loader/test_matching_loader.py (+2, -2)
8. test/io/pipe/test_matching.py (+3, -3)

fastNLP/io/file_utils.py (+19, -5)

@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = {
 }
 
 DATASET_DIR = {
+    # Classification, English
     'aclImdb': "imdb.zip",
     "yelp-review-full": "yelp_review_full.tar.gz",
     "yelp-review-polarity": "yelp_review_polarity.tar.gz",
+    "sst-2": "SST-2.zip",
+    "sst": "SST.zip",
+
+    # Classification, Chinese
+    "chn-senti-corp": "chn_senti_corp.zip",
+    "weibo-senti-100k": "WeiboSenti100k.zip",
+    "thuc-news": "THUCNews.zip",
+
+    # Matching, English
     "mnli": "MNLI.zip",
     "snli": "SNLI.zip",
     "qnli": "QNLI.zip",
-    "xnli": "XNLI.zip",
-    "sst-2": "SST-2.zip",
-    "sst": "SST.zip",
     "rte": "RTE.zip",
+
+    # Matching, Chinese
+    "cn-xnli": "XNLI.zip",
+
+    # Sequence Labeling, Chinese
     "msra-ner": "MSRA_NER.zip",
     "peopledaily": "peopledaily.zip",
     "weibo-ner": "weibo_NER.zip",
 
+    # Chinese Word Segmentation
     "cws-pku": 'cws_pku.zip',
     "cws-cityu": "cws_cityu.zip",
     "cws-as": 'cws_as.zip',
     "cws-msra": 'cws_msra.zip',
 
-    "chn-senti-corp" : "chn_senti_corp.zip",
-    "weibo-senti-100k" : "WeiboSenti100k.zip"
+    # Summarization, English
+    "ext-cnndm": "ext-cnndm.zip",
+
 }
 
 PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
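Every key above is what a loader's download() hands to Loader._get_dataset_path; a sketch of the resulting call pattern, using the THUCNews loader added in this commit (the cache location is implementation-defined)::

    from fastNLP.io.loader.classification import THUCNewsLoader

    # resolves the 'thuc-news' key registered above: downloads THUCNews.zip
    # on first use, then returns the cached local directory
    data_dir = THUCNewsLoader().download()
    print(data_dir)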


fastNLP/io/loader/classification.py (+12, -58)

@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader):
         """
         Load the data from path.
 
-        :param path:
-        :return:
-        """
-        ds = DataSet()
-        with open(path, 'r', encoding='utf-8') as f:
-            f.readline()
-            for line in f:
-                line = line.strip()
-                tab_index = line.index('\t')
-                if tab_index!=-1:
-                    target = line[:tab_index]
-                    raw_chars = line[tab_index+1:]
-                    if raw_chars:
-                        ds.append(Instance(raw_chars=raw_chars, target=target))
-        return ds
-
-    def download(self)->str:
-        """
-        Automatically download the data, which is taken from https://github.com/pengming617/bert_classification/tree/master/data
-        and is used in https://arxiv.org/pdf/1904.09223.pdf and https://arxiv.org/pdf/1906.08101.pdf.
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-
-
-class ChnSentiCorpLoader(Loader):
-    """
-    Supported data format: the first line is a header (its content is ignored); each following line is one sample,
-    where everything before the first tab is the label and everything from the first tab onward is the sentence.
-
-    Example::
-
-        label   raw_chars
-        1       這間酒店環境和服務態度亦算不錯,但房間空間太小~~
-        1       <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道...
-        0       商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...
-
-    The loaded DataSet has the following fields:
-
-    .. csv-table::
-        :header: "raw_chars", "target"
-
-        "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1"
-        "<荐书> 推荐所有喜欢<红楼>...", "1"
-        "..."
-
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def _load(self, path: str):
-        """
-        Load the data from path.
-
         :param path:
         :return:
         """
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader):
             tab_index = line.index('\t')
             if tab_index != -1:
                 target = line[:tab_index]
-                raw_chars = line[tab_index + 1:]
+                raw_chars = line[tab_index+1:]
             if raw_chars:
                 ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
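One caveat this hunk preserves from the old code: the surviving _load guards with tab_index != -1, but str.index raises ValueError when no tab exists rather than returning -1, so that guard can never fire; str.find is the variant that matches the check. A standalone sketch of the intended parsing (the function name and plain-dict output are illustrative only)::

    def load_tab_separated(path):
        # ChnSentiCorp layout: a header line, then '<label>\t<sentence>' per line
        samples = []
        with open(path, 'r', encoding='utf-8') as f:
            f.readline()                     # skip the header line
            for line in f:
                line = line.strip()
                tab_index = line.find('\t')  # find() returns -1 when no tab exists
                if tab_index != -1:
                    target = line[:tab_index]
                    raw_chars = line[tab_index + 1:]
                    if raw_chars:
                        samples.append({'raw_chars': raw_chars, 'target': target})
        return samples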
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader):
                 ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
 
+    def download(self) -> str:
+        """
+        Automatically download the data, which is taken from
+
+        http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews
+
+        :return:
+        """
+        output_dir = self._get_dataset_path('thuc-news')
+        return output_dir
+
 
 class WeiboSenti100kLoader(Loader):
     """


fastNLP/io/loader/conll.py (+10, -0)

@@ -316,6 +316,16 @@ class CTBLoader(Loader):
         dataset = self.loader._load(path)
         return dataset
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://catalog.ldc.upenn.edu/LDC2013T21
+
+        :return:
+        """
+        raise RuntimeError("CTB cannot be downloaded automatically.")
+
 
 class CNNERLoader(Loader):
     def _load(self, path: str):


fastNLP/io/loader/coreference.py (+19, -11)

@@ -13,23 +13,21 @@ from .json import JsonLoader


class CoReferenceLoader(JsonLoader):
    """
    Each line of the raw data should be a json object, where doc_key carries the genre of the document, speakers lists the speaker of each sentence, clusters groups the mentions that refer to the same real-world entity, and sentences holds the text content.

    Example::

        {"doc_key": "bc/cctv/00/cctv_001",
         "speakers": "[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
         "clusters": "[[[2,3],[4,5]],[7,8],[18,20]]]",
         "sentences": [["I","have","an","apple"],["It","is","good"]]
        }

    Loads preprocessed CoNLL-2012 data.

    """
    def __init__(self, fields=None, dropna=False):
        super().__init__(fields, dropna)
-        # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),
-        #                "clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
        self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2),
                       "sentences": Const.RAW_WORDS(3)}


@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader):
             ins = d
             dataset.append(Instance(**ins))
         return dataset
+
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.aclweb.org/anthology/W12-4501
+
+        :return:
+        """
+        raise RuntimeError("CoReference cannot be downloaded automatically.")

fastNLP/io/loader/matching.py (+36, -15)

@@ -7,7 +7,7 @@ __all__ = [
     "RTELoader",
     "QuoraLoader",
     "BQCorpusLoader",
-    "XNLILoader",
+    "CNXNLILoader",
     "LCQMCLoader"
 ]


@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader):
         """
         Read data from the files under one or more paths and return a :class:`~fastNLP.io.DataBundle`.
 
-        The fields read are determined by the headers passed when initializing ConllLoader.
+        The fields read are determined by the fields passed when initializing the Loader.
 
         :param str paths: pass in a directory; the loader looks for snli_1.0_train.jsonl, snli_1.0_dev.jsonl
             and snli_1.0_test.jsonl under it.
 
-        :return: the resulting:class:`~fastNLP.io.DataBundle`
+        :return: the resulting :class:`~fastNLP.io.DataBundle`
         """
         _paths = {}
         if paths is None:
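Usage stays as documented; a quick sketch (the directory path is hypothetical)::

    from fastNLP.io.loader.matching import SNLILoader

    # the directory is expected to contain snli_1.0_train.jsonl,
    # snli_1.0_dev.jsonl and snli_1.0_test.jsonl
    data_bundle = SNLILoader().load('/path/to/snli_1.0')
    print(data_bundle)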
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader):
         """
         If your experiments use this dataset, please cite
 
-        .. todo::
-            to be filled in
+        https://arxiv.org/pdf/1809.05053.pdf
 
         :return:
         """
@@ -276,6 +275,13 @@ class RTELoader(Loader):
         return ds
 
     def download(self):
+        """
+        If your experiments use this dataset, please cite the GLUE Benchmark
+
+        https://openreview.net/pdf?id=rJ4km2R5t7
+
+        :return:
+        """
         return self._get_dataset_path('rte')




@@ -321,10 +327,17 @@ class QuoraLoader(Loader):
         return ds
 
     def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.kaggle.com/c/quora-question-pairs/data
+
+        :return:
+        """
         raise RuntimeError("Quora cannot be downloaded automatically.")
 
 
-class XNLILoader(Loader):
+class CNXNLILoader(Loader):
     """
     Alias:
     Dataset overview: Chinese sentence-pair NLI (originally a multilingual dataset; only the Chinese portion is taken here). The original sentences have already been processed by the MOSES tokenizer.
@@ -341,7 +354,7 @@ class XNLILoader(Loader):
     """
 
     def __init__(self):
-        super(XNLILoader, self).__init__()
+        super(CNXNLILoader, self).__init__()
 
     def _load(self, path: str = None):
         csv_loader = CSVLoader(sep='\t')
@@ -384,7 +397,7 @@ class XNLILoader(Loader):
        used in https://arxiv.org/pdf/1809.05053.pdf
        :return:
        """
-        output_dir = self._get_dataset_path('xnli')
+        output_dir = self._get_dataset_path('cn-xnli')
        return output_dir




@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("BQCorpus cannot be downloaded automatically.")
+
 
 class LCQMCLoader(Loader):
     """
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
-    '''
-    def download(self)->str:
+    def download(self):
         """
-        Automatically download the data, which is taken from the paper LCQMC: A Large-scale Chinese Question Matching Corpus.
-        In Proceedings of the 27th International Conference on Computational Linguistics. 1952-1962.
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
 
         :return:
         """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-    '''
+        raise RuntimeError("LCQMC cannot be downloaded automatically.")





fastNLP/io/pipe/matching.py (+9, -9)

@@ -7,7 +7,7 @@ __all__ = [
     "QuoraBertPipe",
     "QNLIBertPipe",
     "MNLIBertPipe",
-    "XNLIBertPipe",
+    "CNXNLIBertPipe",
     "BQCorpusBertPipe",
     "LCQMCBertPipe",
     "MatchingPipe",
@@ -16,7 +16,7 @@ __all__ = [
     "QuoraPipe",
     "QNLIPipe",
     "MNLIPipe",
-    "XNLIPipe",
+    "CNXNLIPipe",
     "BQCorpusPipe",
     "LCQMCPipe",
 ]
@@ -25,7 +25,7 @@ import warnings
 
 from .pipe import Pipe
 from .utils import get_tokenizer
-from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader
+from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
 from ...core.const import Const
 from ...core.vocabulary import Vocabulary
 from ...core._logger import logger
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe):
         return data_bundle
 
 
-class XNLIPipe(MatchingPipe):
-    def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
-        data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle)
+class CNXNLIPipe(MatchingPipe):
+    def process_from_file(self, paths=None):
+        data_bundle = CNXNLILoader().load(paths)
+        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)  # adapt the fields of the Chinese data
         data_bundle = self.process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe):
         return data_bundle
 
 
-class XNLIBertPipe(MatchingBertPipe):
+class CNXNLIBertPipe(MatchingBertPipe):
     def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
+        data_bundle = CNXNLILoader().load(paths)
         data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
         data_bundle = self.process(data_bundle)
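End to end, the renamed pipe chains loading, granularization, renaming and processing; a sketch using the fixture path from the tests below::

    from fastNLP.io.pipe.matching import CNXNLIPipe

    data_bundle = CNXNLIPipe().process_from_file('test/data_for_tests/io/XNLI')
    print(data_bundle)  # train/dev/test DataSets plus words/target vocabularies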


test/io/loader/test_matching_loader.py (+2, -2)

@@ -5,7 +5,7 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \
-    BQCorpusLoader, XNLILoader, LCQMCLoader
+    BQCorpusLoader, CNXNLILoader, LCQMCLoader
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase):
             'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True),
             'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False),
         }
         for k, v in data_set_dict.items():


test/io/pipe/test_matching.py (+3, -3)

@@ -4,9 +4,9 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \
-    XNLIPipe, BQCorpusPipe, LCQMCPipe
+    CNXNLIPipe, BQCorpusPipe, LCQMCPipe
 from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \
-    XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
+    CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase):
             'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True),
             'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False),
         }
         for k, v in data_set_dict.items():

