
1. reorganize the auto-download dataset registry in io/file_utils.py; 2. add auto download for CNNDM and THUCNews; 3. rename the XNLI loader and pipe to CNXNLI*; 4. update the documentation of several download methods.

tags/v0.4.10
Yige Xu 5 years ago
commit ad95707718
8 changed files with 110 additions and 103 deletions
  1. fastNLP/io/file_utils.py (+19, -5)
  2. fastNLP/io/loader/classification.py (+12, -58)
  3. fastNLP/io/loader/conll.py (+10, -0)
  4. fastNLP/io/loader/coreference.py (+19, -11)
  5. fastNLP/io/loader/matching.py (+36, -15)
  6. fastNLP/io/pipe/matching.py (+9, -9)
  7. test/io/loader/test_matching_loader.py (+2, -2)
  8. test/io/pipe/test_matching.py (+3, -3)

fastNLP/io/file_utils.py (+19, -5)

@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = {
 }
 
 DATASET_DIR = {
+    # Classification, English
     'aclImdb': "imdb.zip",
     "yelp-review-full": "yelp_review_full.tar.gz",
     "yelp-review-polarity": "yelp_review_polarity.tar.gz",
+    "sst-2": "SST-2.zip",
+    "sst": "SST.zip",
+
+    # Classification, Chinese
+    "chn-senti-corp": "chn_senti_corp.zip",
+    "weibo-senti-100k": "WeiboSenti100k.zip",
+    "thuc-news": "THUCNews.zip",
+
+    # Matching, English
     "mnli": "MNLI.zip",
     "snli": "SNLI.zip",
     "qnli": "QNLI.zip",
     "xnli": "XNLI.zip",
-    "sst-2": "SST-2.zip",
-    "sst": "SST.zip",
     "rte": "RTE.zip",
+
+    # Matching, Chinese
+    "cn-xnli": "XNLI.zip",
+
+    # Sequence Labeling, Chinese
     "msra-ner": "MSRA_NER.zip",
     "peopledaily": "peopledaily.zip",
     "weibo-ner": "weibo_NER.zip",
 
+    # Chinese Word Segmentation
     "cws-pku": 'cws_pku.zip',
     "cws-cityu": "cws_cityu.zip",
     "cws-as": 'cws_as.zip',
     "cws-msra": 'cws_msra.zip',
 
-    "chn-senti-corp" : "chn_senti_corp.zip",
-    "weibo-senti-100k" : "WeiboSenti100k.zip"
+    # Summarization, English
+    "ext-cnndm": "ext-cnndm.zip",
+
 }
 
 PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
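
Each DATASET_DIR entry maps a dataset key to the archive that is fetched and unpacked into the local fastNLP cache; a loader's download() resolves its key through this table via _get_dataset_path. A small sketch of the lookup (assuming fastNLP at this commit):

    from fastNLP.io.file_utils import DATASET_DIR

    # The new Chinese matching key reuses the same archive as the English 'xnli' key,
    # while the new summarization key gets its own archive.
    print(DATASET_DIR['cn-xnli'])    # XNLI.zip
    print(DATASET_DIR['ext-cnndm'])  # ext-cnndm.zip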


fastNLP/io/loader/classification.py (+12, -58)

@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader):
         """
         Read data from path.
 
         :param path:
         :return:
         """
-        ds = DataSet()
-        with open(path, 'r', encoding='utf-8') as f:
-            f.readline()
-            for line in f:
-                line = line.strip()
-                tab_index = line.index('\t')
-                if tab_index!=-1:
-                    target = line[:tab_index]
-                    raw_chars = line[tab_index+1:]
-                    if raw_chars:
-                        ds.append(Instance(raw_chars=raw_chars, target=target))
-        return ds
-
-    def download(self)->str:
-        """
-        Automatically download the data, which comes from https://github.com/pengming617/bert_classification/tree/master/data
-        and is used in https://arxiv.org/pdf/1904.09223.pdf and https://arxiv.org/pdf/1906.08101.pdf
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-
-
-class ChnSentiCorpLoader(Loader):
-    """
-    The supported format: the first line is a header (its content is ignored); each following line is one
-    sample, where everything before the first tab is the label and everything after it is the sentence.
-
-    Example::
-
-        label   raw_chars
-        1   這間酒店環境和服務態度亦算不錯,但房間空間太小~~
-        1   <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道...
-        0   商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...
-
-    The loaded DataSet has the following fields:
-
-    .. csv-table::
-        :header: "raw_chars", "target"
-
-        "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1"
-        "<荐书> 推荐所有喜欢<红楼>...", "1"
-        "..."
-
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def _load(self, path: str):
-        """
-        Read data from path.
-
-        :param path:
-        :return:
-        """
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader):
                 tab_index = line.index('\t')
                 if tab_index != -1:
                     target = line[:tab_index]
-                    raw_chars = line[tab_index + 1:]
+                    raw_chars = line[tab_index+1:]
                     if raw_chars:
                         ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader):
                 ds.append(Instance(raw_chars=raw_chars, target=target))
         return ds
 
+    def download(self) -> str:
+        """
+        Automatically download the data, which comes from
+
+        http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews
+
+        :return:
+        """
+        output_dir = self._get_dataset_path('thuc-news')
+        return output_dir
+
 
 class WeiboSenti100kLoader(Loader):
     """


fastNLP/io/loader/conll.py (+10, -0)

@@ -316,6 +316,16 @@ class CTBLoader(Loader):
         dataset = self.loader._load(path)
         return dataset
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://catalog.ldc.upenn.edu/LDC2013T21
+
+        :return:
+        """
+        raise RuntimeError("CTB cannot be downloaded automatically.")
+
 
 class CNNERLoader(Loader):
     def _load(self, path: str):
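
The download() methods added for license-restricted corpora fail loudly rather than silently doing nothing. A sketch of the calling pattern, with a hypothetical local path standing in for a manually obtained copy:

    from fastNLP.io.loader.conll import CTBLoader

    loader = CTBLoader()
    try:
        loader.download()  # always raises: CTB is distributed under an LDC license
    except RuntimeError as err:
        print(err)
    # After obtaining the data manually:
    # data_bundle = loader.load('path/to/ctb')  # hypothetical path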


fastNLP/io/loader/coreference.py (+19, -11)

@@ -13,23 +13,21 @@ from .json import JsonLoader
 
 class CoReferenceLoader(JsonLoader):
     """
-        Each line of the raw data should be a JSON object, where doc_key carries the document's category, speakers the speaker of each sentence, clusters the groups of mentions that refer to the same real-world entity, and sentences the text itself.
+    Each line of the raw data should be a JSON object, where doc_key carries the document's category, speakers the speaker of each sentence, clusters the groups of mentions that refer to the same real-world entity, and sentences the text itself.
 
-        Example::
+    Example::
 
-            {"doc_key":"bc/cctv/00/cctv_001",
-             "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
-             "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
-             "sentences":[["I","have","an","apple"],["It","is","good"]]
-            }
+        {"doc_key":"bc/cctv/00/cctv_001",
+         "speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
+         "clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
+         "sentences":[["I","have","an","apple"],["It","is","good"]]
+        }
 
-        Reads preprocessed CoNLL-2012 data.
+    Reads preprocessed CoNLL-2012 data.
 
-        """
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
         # self.fields = {"doc_key": Const.INPUTS(0), "speakers": Const.INPUTS(1),
         #                "clusters": Const.TARGET, "sentences": Const.INPUTS(2)}
         self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2),
                        "sentences": Const.RAW_WORDS(3)}

@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader):
                 ins = d
             dataset.append(Instance(**ins))
         return dataset
+
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.aclweb.org/anthology/W12-4501
+
+        :return:
+        """
+        raise RuntimeError("CoReference cannot be downloaded automatically.")
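
CoReferenceLoader follows the same pattern: the JSON keys are mapped to raw-words fields at construction time, and download() points at the CoNLL-2012 shared task instead of fetching anything. A sketch with a hypothetical local file:

    from fastNLP.io.loader.coreference import CoReferenceLoader

    loader = CoReferenceLoader()  # fields default to the doc_key/speakers/clusters/sentences mapping
    # download() raises by design; obtain the preprocessed CoNLL-2012 data manually, then:
    # data_bundle = loader.load('train.english.jsonlines')  # hypothetical file name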

fastNLP/io/loader/matching.py (+36, -15)

@@ -7,7 +7,7 @@ __all__ = [
     "RTELoader",
     "QuoraLoader",
     "BQCorpusLoader",
-    "XNLILoader",
+    "CNXNLILoader",
     "LCQMCLoader"
 ]

@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader):
         """
         Read data from the files under one or more given paths and return a :class:`~fastNLP.io.DataBundle`.
 
-        The fields that are read are determined by the headers passed when the ConllLoader is initialized.
+        The fields that are read are determined by the fields passed when the Loader is initialized.
 
         :param str paths: a directory under which the three files snli_1.0_train.jsonl, snli_1.0_dev.jsonl
             and snli_1.0_test.jsonl are expected.
 
-        :return: the returned:class:`~fastNLP.io.DataBundle`
+        :return: the returned :class:`~fastNLP.io.DataBundle`
         """
         _paths = {}
         if paths is None:
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader):
         """
         If your experiments use this dataset, please cite
 
-        .. todo::
-            to be added
+        https://arxiv.org/pdf/1809.05053.pdf
 
         :return:
         """
@@ -276,6 +275,13 @@ class RTELoader(Loader):
         return ds
 
     def download(self):
+        """
+        If your experiments use this dataset, please cite the GLUE Benchmark:
+
+        https://openreview.net/pdf?id=rJ4km2R5t7
+
+        :return:
+        """
         return self._get_dataset_path('rte')


@@ -321,10 +327,17 @@ class QuoraLoader(Loader):
         return ds
 
     def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://www.kaggle.com/c/quora-question-pairs/data
+
+        :return:
+        """
         raise RuntimeError("Quora cannot be downloaded automatically.")
 
 
-class XNLILoader(Loader):
+class CNXNLILoader(Loader):
     """
     Alias:
     Dataset overview: Chinese sentence-pair NLI (originally a multi-lingual dataset; only the Chinese part is used here). The original sentences have been processed with the MOSES tokenizer.
@@ -341,7 +354,7 @@ class CNXNLILoader(Loader):
     """
 
     def __init__(self):
-        super(XNLILoader, self).__init__()
+        super(CNXNLILoader, self).__init__()
 
     def _load(self, path: str = None):
         csv_loader = CSVLoader(sep='\t')
@@ -384,7 +397,7 @@ class CNXNLILoader(Loader):
        is used in https://arxiv.org/pdf/1809.05053.pdf
         :return:
         """
-        output_dir = self._get_dataset_path('xnli')
+        output_dir = self._get_dataset_path('cn-xnli')
         return output_dir


@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("BQCorpus cannot be downloaded automatically.")
+
 
 class LCQMCLoader(Loader):
     """
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader):
                 ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
         return ds
 
-    '''
-    def download(self)->str:
-        """
-        Automatically download the data, which comes from the paper LCQMC: A Large-scale Chinese Question Matching Corpus.
-        In Proceedings of the 27th International Conference on Computational Linguistics. 1952-1962.
-
-        :return:
-        """
-        output_dir = self._get_dataset_path('chn-senti-corp')
-        return output_dir
-    '''
+    def download(self):
+        """
+        Due to copyright restrictions, automatic download is not available. See
+
+        https://github.com/ymcui/Chinese-BERT-wwm
+
+        :return:
+        """
+        raise RuntimeError("LCQMC cannot be downloaded automatically.")
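
On the loader side, the rename pairs with the new 'cn-xnli' key, so the Chinese XNLI data can now be downloaded and loaded directly. A sketch (assuming fastNLP at this commit):

    from fastNLP.io.loader.matching import CNXNLILoader

    loader = CNXNLILoader()
    data_dir = loader.download()         # resolves 'cn-xnli' (the XNLI.zip archive)
    data_bundle = loader.load(data_dir)
    print(data_bundle)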



fastNLP/io/pipe/matching.py (+9, -9)

@@ -7,7 +7,7 @@ __all__ = [
     "QuoraBertPipe",
     "QNLIBertPipe",
     "MNLIBertPipe",
-    "XNLIBertPipe",
+    "CNXNLIBertPipe",
     "BQCorpusBertPipe",
     "LCQMCBertPipe",
     "MatchingPipe",
@@ -16,7 +16,7 @@ __all__ = [
     "QuoraPipe",
     "QNLIPipe",
     "MNLIPipe",
-    "XNLIPipe",
+    "CNXNLIPipe",
     "BQCorpusPipe",
     "LCQMCPipe",
 ]
@@ -25,7 +25,7 @@ import warnings
 
 from .pipe import Pipe
 from .utils import get_tokenizer
-from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader
+from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
 from ...core.const import Const
 from ...core.vocabulary import Vocabulary
 from ...core._logger import logger
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe):
         return data_bundle
 
 
-class XNLIPipe(MatchingPipe):
-    def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
-        data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle)
+class CNXNLIPipe(MatchingPipe):
+    def process_from_file(self, paths=None):
+        data_bundle = CNXNLILoader().load(paths)
+        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)  # rename the fields of the Chinese data
         data_bundle = self.process(data_bundle)
         data_bundle = RenamePipe().process(data_bundle)
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe):
         return data_bundle
 
 
-class XNLIBertPipe(MatchingBertPipe):
+class CNXNLIBertPipe(MatchingBertPipe):
     def process_from_file(self, paths = None):
-        data_bundle = XNLILoader().load(paths)
+        data_bundle = CNXNLILoader().load(paths)
         data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
         data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
         data_bundle = self.process(data_bundle)
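
The BERT-style pipe chains the renamed loader with the granularize/rename steps before the shared MatchingBertPipe processing. A usage sketch with a placeholder path:

    from fastNLP.io.pipe.matching import CNXNLIBertPipe

    # 'path/to/XNLI' is a placeholder for a local copy of the Chinese XNLI data.
    data_bundle = CNXNLIBertPipe().process_from_file('path/to/XNLI')
    print(data_bundle.get_dataset('train')[:2])  # first two processed instances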


test/io/loader/test_matching_loader.py (+2, -2)

@@ -5,7 +5,7 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \
-    BQCorpusLoader, XNLILoader, LCQMCLoader
+    BQCorpusLoader, CNXNLILoader, LCQMCLoader
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase):
             'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True),
             'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False),
         }
         for k, v in data_set_dict.items():
for k, v in data_set_dict.items():


test/io/pipe/test_matching.py (+3, -3)

@@ -4,9 +4,9 @@ import os
 
 from fastNLP.io import DataBundle
 from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \
-    XNLIPipe, BQCorpusPipe, LCQMCPipe
+    CNXNLIPipe, BQCorpusPipe, LCQMCPipe
 from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \
-    XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
+    CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe
 
 
 @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase):
             'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True),
             'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True),
             'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False),
-            'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False),
+            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False),
             'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False),
         }
         for k, v in data_set_dict.items():

