@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = {
}

DATASET_DIR = {
    # Classification, English
    "aclImdb": "imdb.zip",
    "yelp-review-full": "yelp_review_full.tar.gz",
    "yelp-review-polarity": "yelp_review_polarity.tar.gz",
    "sst-2": "SST-2.zip",
    "sst": "SST.zip",
    # Classification, Chinese
    "chn-senti-corp": "chn_senti_corp.zip",
    "weibo-senti-100k": "WeiboSenti100k.zip",
    "thuc-news": "THUCNews.zip",
    # Matching, English
    "mnli": "MNLI.zip",
    "snli": "SNLI.zip",
    "qnli": "QNLI.zip",
    "xnli": "XNLI.zip",
"sst-2": "SST-2.zip", | |||
"sst": "SST.zip", | |||
"rte": "RTE.zip", | |||
# Matching, Chinese | |||
"cn-xnli": "XNLI.zip", | |||
# Sequence Labeling, Chinese | |||
"msra-ner": "MSRA_NER.zip", | |||
"peopledaily": "peopledaily.zip", | |||
"weibo-ner": "weibo_NER.zip", | |||
# Chinese Word Segmentation | |||
"cws-pku": 'cws_pku.zip', | |||
"cws-cityu": "cws_cityu.zip", | |||
"cws-as": 'cws_as.zip', | |||
"cws-msra": 'cws_msra.zip', | |||
"chn-senti-corp" : "chn_senti_corp.zip", | |||
"weibo-senti-100k" : "WeiboSenti100k.zip" | |||
    # Summarization, English
    "ext-cnndm": "ext-cnndm.zip",
}
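# Usage sketch (illustration, not part of the diff; the call sequence is an assumption
# based on how the loaders below consume this table): each key in DATASET_DIR is the
# name a Loader passes to self._get_dataset_path(), which downloads and caches the
# archive named on the right-hand side.
#
#   from fastNLP.io.loader import ChnSentiCorpLoader
#   loader = ChnSentiCorpLoader()
#   data_dir = loader.download()         # resolves DATASET_DIR['chn-senti-corp']
#   data_bundle = loader.load(data_dir)  # DataBundle holding the train/dev/test DataSets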
PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR,
@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader):
class ChnSentiCorpLoader(Loader):
    """
    Supports data in the following format: the first line is a header (its exact
    content is ignored), and every subsequent line is one sample, where everything
    before the first tab is taken as the label and everything after the first tab
    as the sentence.

    Example::

        label  raw_chars
        1      這間酒店環境和服務態度亦算不錯,但房間空間太小~~
        1      <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道...
        0      商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货...

    The loaded DataSet has the following fields

    .. csv-table::
        :header: "raw_chars", "target"

        "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1"
        "<荐书> 推荐所有喜欢<红楼>...", "1"
        "..."

    """

    def __init__(self):
        super().__init__()

    def _load(self, path: str):
        """
        Load data from ``path``.

        :param path:
        :return:
        """
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader):
                tab_index = line.find('\t')  # find() returns -1 when no tab; index() would raise ValueError
                if tab_index != -1:
                    target = line[:tab_index]
                    raw_chars = line[tab_index + 1:]
                    if raw_chars:
                        ds.append(Instance(raw_chars=raw_chars, target=target))
        return ds

    def download(self) -> str:
        """
        Automatically downloads the data. The dataset is taken from
        https://github.com/pengming617/bert_classification/tree/master/data and is used in
        https://arxiv.org/pdf/1904.09223.pdf and https://arxiv.org/pdf/1906.08101.pdf

        :return:
        """
        output_dir = self._get_dataset_path('chn-senti-corp')
        return output_dir
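# A short inspection sketch (illustration only; the local path is hypothetical): after
# _load, every Instance carries the raw_chars and target fields listed in the docstring.
#
#   ds = ChnSentiCorpLoader()._load('chn_senti_corp/train.tsv')  # hypothetical file
#   print(len(ds))                              # number of samples
#   print(ds[0]['raw_chars'], ds[0]['target'])  # first sentence and its label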
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader):
            ds.append(Instance(raw_chars=raw_chars, target=target))
        return ds

    def download(self) -> str:
        """
        Automatically downloads the data, which is taken from
        http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews

        :return:
        """
        output_dir = self._get_dataset_path('thuc-news')
        return output_dir
class WeiboSenti100kLoader(Loader):
    """
@@ -316,6 +316,16 @@ class CTBLoader(Loader):
        dataset = self.loader._load(path)
        return dataset

    def download(self):
        """
        Because of copyright restrictions, this dataset cannot be downloaded
        automatically. See https://catalog.ldc.upenn.edu/LDC2013T21 for access.

        :return:
        """
        raise RuntimeError("CTB cannot be downloaded automatically.")
class CNNERLoader(Loader):
    def _load(self, path: str):
@@ -13,23 +13,21 @@ from .json import JsonLoader
class CoReferenceLoader(JsonLoader):
    """
    Each line of the raw data should be one JSON object, in which doc_key carries the
    document's genre information, speakers lists the speaker of each sentence, clusters
    groups the mentions that refer to the same real-world entity, and sentences holds
    the text itself.

    Example::

        {"doc_key": "bc/cctv/00/cctv_001",
         "speakers": [["Speaker1", "Speaker1", "Speaker1"], ["Speaker1", "Speaker1", "Speaker1"]],
         "clusters": [[[2, 3], [4, 5]], [[7, 8], [18, 20]]],
         "sentences": [["I", "have", "an", "apple"], ["It", "is", "good"]]
        }

    Reads preprocessed CoNLL 2012 data.
    """

    def __init__(self, fields=None, dropna=False):
        super().__init__(fields, dropna)
        # self.fields = {"doc_key": Const.INPUTS(0), "speakers": Const.INPUTS(1),
        #                "clusters": Const.TARGET, "sentences": Const.INPUTS(2)}
        self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1),
                       "clusters": Const.RAW_WORDS(2), "sentences": Const.RAW_WORDS(3)}
@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader):
                ins = d
                dataset.append(Instance(**ins))
        return dataset

    def download(self):
        """
        Because of copyright restrictions, this dataset cannot be downloaded
        automatically. See https://www.aclweb.org/anthology/W12-4501 for details.

        :return:
        """
        raise RuntimeError("CoReference cannot be downloaded automatically.")
@@ -7,7 +7,7 @@ __all__ = [
    "RTELoader",
    "QuoraLoader",
    "BQCorpusLoader",
    "CNXNLILoader",
    "LCQMCLoader"
]
@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader):
        """
        Reads data from the file(s) found under one or more given paths and returns a
        :class:`~fastNLP.io.DataBundle`.

        The fields that get loaded are determined by the ``field`` argument passed when
        the Loader was initialized.

        :param str paths: a directory in which the files snli_1.0_train.jsonl,
            snli_1.0_dev.jsonl and snli_1.0_test.jsonl are expected.
        :return: the resulting :class:`~fastNLP.io.DataBundle`
        """
        _paths = {}
        if paths is None:
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader):
        """
        If your experiments use this dataset, please cite
        https://arxiv.org/pdf/1809.05053.pdf

        :return:
        """
@@ -276,6 +275,13 @@ class RTELoader(Loader):
        return ds

    def download(self):
        """
        If your experiments use this dataset, please cite the GLUE benchmark
        https://openreview.net/pdf?id=rJ4km2R5t7

        :return:
        """
        return self._get_dataset_path('rte')
@@ -321,10 +327,17 @@ class QuoraLoader(Loader):
        return ds

    def download(self):
        """
        Because of copyright restrictions, this dataset cannot be downloaded
        automatically. See https://www.kaggle.com/c/quora-question-pairs/data for access.

        :return:
        """
        raise RuntimeError("Quora cannot be downloaded automatically.")
class CNXNLILoader(Loader):
    """
    Alias:

    Dataset description: Chinese sentence-pair NLI (originally a multilingual dataset,
    of which only the Chinese portion is taken here). The original sentences have
    already been processed by the MOSES tokenizer.
@@ -341,7 +354,7 @@ class XNLILoader(Loader):
    """

    def __init__(self):
        super(CNXNLILoader, self).__init__()

    def _load(self, path: str = None):
        csv_loader = CSVLoader(sep='\t')
@@ -384,7 +397,7 @@ class XNLILoader(Loader):
        is used in https://arxiv.org/pdf/1809.05053.pdf

        :return:
        """
        output_dir = self._get_dataset_path('cn-xnli')
        return output_dir
@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader):
            ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
        return ds

    def download(self):
        """
        Because of copyright restrictions, this dataset cannot be downloaded
        automatically. See https://github.com/ymcui/Chinese-BERT-wwm for access.

        :return:
        """
        raise RuntimeError("BQCorpus cannot be downloaded automatically.")
class LCQMCLoader(Loader):
    """
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader):
            ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
        return ds

    def download(self):
        """
        Because of copyright restrictions, this dataset cannot be downloaded
        automatically. See https://github.com/ymcui/Chinese-BERT-wwm for access.

        :return:
        """
        raise RuntimeError("LCQMC cannot be downloaded automatically.")
@@ -7,7 +7,7 @@ __all__ = [
    "QuoraBertPipe",
    "QNLIBertPipe",
    "MNLIBertPipe",
    "CNXNLIBertPipe",
    "BQCorpusBertPipe",
    "LCQMCBertPipe",
    "MatchingPipe",
@@ -16,7 +16,7 @@ __all__ = [
    "QuoraPipe",
    "QNLIPipe",
    "MNLIPipe",
    "CNXNLIPipe",
    "BQCorpusPipe",
    "LCQMCPipe",
]
@@ -25,7 +25,7 @@ import warnings
from .pipe import Pipe
from .utils import get_tokenizer
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader
from ...core.const import Const
from ...core.vocabulary import Vocabulary
from ...core._logger import logger
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe):
        return data_bundle


class CNXNLIPipe(MatchingPipe):
    def process_from_file(self, paths=None):
        data_bundle = CNXNLILoader().load(paths)
        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
        data_bundle = RenamePipe().process(data_bundle)  # rename the fields of the Chinese data
        data_bundle = self.process(data_bundle)
        data_bundle = RenamePipe().process(data_bundle)
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe):
        return data_bundle


class CNXNLIBertPipe(MatchingBertPipe):
    def process_from_file(self, paths=None):
        data_bundle = CNXNLILoader().load(paths)
        data_bundle = GranularizePipe(task='XNLI').process(data_bundle)
        data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle)
        data_bundle = self.process(data_bundle)
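# End-to-end sketch (illustration only; the directory is hypothetical):
# process_from_file chains the loader with the granularize/rename/matching pipes
# above and returns a fully processed DataBundle.
#
#   data_bundle = CNXNLIPipe().process_from_file('data/XNLI')
#   print(data_bundle)  # lists the DataSets and Vocabularies it contains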
@@ -5,7 +5,7 @@ import os
from fastNLP.io import DataBundle
from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \
    BQCorpusLoader, CNXNLILoader, LCQMCLoader


@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase):
            'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True),
            'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False),
            'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False),
            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False),
            'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False),
        }
        for k, v in data_set_dict.items():
@@ -4,9 +4,9 @@ import os
from fastNLP.io import DataBundle
from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \
    CNXNLIPipe, BQCorpusPipe, LCQMCPipe
from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \
    CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe


@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase):
            'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True),
            'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True),
            'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False),
            'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False),
            'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False),
        }
        for k, v in data_set_dict.items():