@@ -83,27 +83,41 @@ PRETRAIN_STATIC_FILES = { | |||||
} | } | ||||
# Dataset name -> archive file name.
# The keys match the names handed to `_get_dataset_path` by the loaders
# visible alongside this table (e.g. 'thuc-news', 'rte', 'cn-xnli').
# Every key is listed exactly once and every entry ends with a comma, so a
# new line can be appended without silently gluing two string literals together.
DATASET_DIR = {
    # Classification, English
    "aclImdb": "imdb.zip",
    "yelp-review-full": "yelp_review_full.tar.gz",
    "yelp-review-polarity": "yelp_review_polarity.tar.gz",
    "sst-2": "SST-2.zip",
    "sst": "SST.zip",

    # Classification, Chinese
    "chn-senti-corp": "chn_senti_corp.zip",
    "weibo-senti-100k": "WeiboSenti100k.zip",
    "thuc-news": "THUCNews.zip",

    # Matching, English
    "mnli": "MNLI.zip",
    "snli": "SNLI.zip",
    "qnli": "QNLI.zip",
    "xnli": "XNLI.zip",  # same archive as "cn-xnli" below; both names are kept
    "rte": "RTE.zip",

    # Matching, Chinese
    "cn-xnli": "XNLI.zip",

    # Sequence Labeling, Chinese
    "msra-ner": "MSRA_NER.zip",
    "peopledaily": "peopledaily.zip",
    "weibo-ner": "weibo_NER.zip",

    # Chinese Word Segmentation
    "cws-pku": "cws_pku.zip",
    "cws-cityu": "cws_cityu.zip",
    "cws-as": "cws_as.zip",
    "cws-msra": "cws_msra.zip",

    # Summarization, English
    "ext-cnndm": "ext-cnndm.zip",
}
PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, | PRETRAIN_MAP = {'elmo': PRETRAINED_ELMO_MODEL_DIR, | ||||
@@ -373,63 +373,6 @@ class ChnSentiCorpLoader(Loader): | |||||
""" | """ | ||||
从path中读取数据 | 从path中读取数据 | ||||
:param path: | |||||
:return: | |||||
""" | |||||
ds = DataSet() | |||||
with open(path, 'r', encoding='utf-8') as f: | |||||
f.readline() | |||||
for line in f: | |||||
line = line.strip() | |||||
tab_index = line.index('\t') | |||||
if tab_index!=-1: | |||||
target = line[:tab_index] | |||||
raw_chars = line[tab_index+1:] | |||||
if raw_chars: | |||||
ds.append(Instance(raw_chars=raw_chars, target=target)) | |||||
return ds | |||||
def download(self)->str:
    """
    Automatically download the data.

    The data is taken from
    https://github.com/pengming617/bert_classification/tree/master/data and is used in
    https://arxiv.org/pdf/1904.09223.pdf and https://arxiv.org/pdf/1906.08101.pdf

    :return: whatever ``self._get_dataset_path('chn-senti-corp')`` returns
        (presumably the local dataset directory -- confirm against the base Loader)
    """
    return self._get_dataset_path('chn-senti-corp')
class ChnSentiCorpLoader(Loader): | |||||
""" | |||||
支持读取的数据的格式为,第一行为标题(具体内容会被忽略),之后一行为一个sample,第一个制表符之前被认为是label,第 | |||||
一个制表符及之后认为是句子 | |||||
Example:: | |||||
label raw_chars | |||||
1 這間酒店環境和服務態度亦算不錯,但房間空間太小~~ | |||||
1 <荐书> 推荐所有喜欢<红楼>的红迷们一定要收藏这本书,要知道... | |||||
0 商品的不足暂时还没发现,京东的订单处理速度实在.......周二就打包完成,周五才发货... | |||||
读取后的DataSet具有以下的field | |||||
.. csv-table:: | |||||
:header: "raw_chars", "target" | |||||
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "1" | |||||
"<荐书> 推荐所有喜欢<红楼>...", "1" | |||||
"..." | |||||
""" | |||||
def __init__(self): | |||||
super().__init__() | |||||
def _load(self, path: str): | |||||
""" | |||||
从path中读取数据 | |||||
:param path: | :param path: | ||||
:return: | :return: | ||||
""" | """ | ||||
@@ -441,7 +384,7 @@ class ChnSentiCorpLoader(Loader): | |||||
tab_index = line.index('\t') | tab_index = line.index('\t') | ||||
if tab_index != -1: | if tab_index != -1: | ||||
target = line[:tab_index] | target = line[:tab_index] | ||||
raw_chars = line[tab_index + 1:] | |||||
raw_chars = line[tab_index+1:] | |||||
if raw_chars: | if raw_chars: | ||||
ds.append(Instance(raw_chars=raw_chars, target=target)) | ds.append(Instance(raw_chars=raw_chars, target=target)) | ||||
return ds | return ds | ||||
@@ -486,6 +429,17 @@ class THUCNewsLoader(Loader): | |||||
ds.append(Instance(raw_chars=raw_chars, target=target)) | ds.append(Instance(raw_chars=raw_chars, target=target)) | ||||
return ds | return ds | ||||
def download(self) -> str:
    """
    Automatically download the data.

    The data is taken from
    http://thuctc.thunlp.org/#%E4%B8%AD%E6%96%87%E6%96%87%E6%9C%AC%E5%88%86%E7%B1%BB%E6%95%B0%E6%8D%AE%E9%9B%86THUCNews

    :return: whatever ``self._get_dataset_path('thuc-news')`` returns
        (presumably the local dataset directory -- confirm against the base Loader)
    """
    return self._get_dataset_path('thuc-news')
class WeiboSenti100kLoader(Loader): | class WeiboSenti100kLoader(Loader): | ||||
""" | """ | ||||
@@ -316,6 +316,16 @@ class CTBLoader(Loader): | |||||
dataset = self.loader._load(path) | dataset = self.loader._load(path) | ||||
return dataset | return dataset | ||||
def download(self):
    """
    Automatic download is not offered because of copyright restrictions.

    For reference see https://catalog.ldc.upenn.edu/LDC2013T21

    :return: never returns normally
    :raises RuntimeError: always
    """
    reason = "CTB cannot be downloaded automatically."
    raise RuntimeError(reason)
class CNNERLoader(Loader): | class CNNERLoader(Loader): | ||||
def _load(self, path: str): | def _load(self, path: str): | ||||
@@ -13,23 +13,21 @@ from .json import JsonLoader | |||||
class CoReferenceLoader(JsonLoader): | class CoReferenceLoader(JsonLoader): | ||||
""" | """ | ||||
原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 | |||||
原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。 | |||||
Example:: | |||||
Example:: | |||||
{"doc_key":"bc/cctv/00/cctv_001", | |||||
"speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]", | |||||
"clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]", | |||||
"sentences":[["I","have","an","apple"],["It","is","good"]] | |||||
} | |||||
{"doc_key":"bc/cctv/00/cctv_001", | |||||
"speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]", | |||||
"clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]", | |||||
"sentences":[["I","have","an","apple"],["It","is","good"]] | |||||
} | |||||
读取预处理好的Conll2012数据。 | |||||
读取预处理好的Conll2012数据。 | |||||
""" | |||||
""" | |||||
def __init__(self, fields=None, dropna=False): | def __init__(self, fields=None, dropna=False): | ||||
super().__init__(fields, dropna) | super().__init__(fields, dropna) | ||||
# self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1), | |||||
# "clusters":Const.TARGET,"sentences":Const.INPUTS(2)} | |||||
self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2), | self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2), | ||||
"sentences": Const.RAW_WORDS(3)} | "sentences": Const.RAW_WORDS(3)} | ||||
@@ -48,3 +46,13 @@ class CoReferenceLoader(JsonLoader): | |||||
ins = d | ins = d | ||||
dataset.append(Instance(**ins)) | dataset.append(Instance(**ins)) | ||||
return dataset | return dataset | ||||
def download(self):
    """
    Automatic download is not offered because of copyright restrictions.

    For reference see https://www.aclweb.org/anthology/W12-4501

    :return: never returns normally
    :raises RuntimeError: always
    """
    reason = "CoReference cannot be downloaded automatically."
    raise RuntimeError(reason)
@@ -7,7 +7,7 @@ __all__ = [ | |||||
"RTELoader", | "RTELoader", | ||||
"QuoraLoader", | "QuoraLoader", | ||||
"BQCorpusLoader", | "BQCorpusLoader", | ||||
"XNLILoader", | |||||
"CNXNLILoader", | |||||
"LCQMCLoader" | "LCQMCLoader" | ||||
] | ] | ||||
@@ -135,12 +135,12 @@ class SNLILoader(JsonLoader): | |||||
""" | """ | ||||
从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 | 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。 | ||||
读取的field根据ConllLoader初始化时传入的headers决定。 | |||||
读取的field根据Loader初始化时传入的field决定。 | |||||
:param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl | :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl | ||||
和snli_1.0_test.jsonl三个文件。 | 和snli_1.0_test.jsonl三个文件。 | ||||
:return: 返回的:class:`~fastNLP.io.DataBundle` | |||||
:return: 返回的 :class:`~fastNLP.io.DataBundle` | |||||
""" | """ | ||||
_paths = {} | _paths = {} | ||||
if paths is None: | if paths is None: | ||||
@@ -222,8 +222,7 @@ class QNLILoader(JsonLoader): | |||||
""" | """ | ||||
如果您的实验使用到了该数据,请引用 | 如果您的实验使用到了该数据,请引用 | ||||
.. todo:: | |||||
补充 | |||||
https://arxiv.org/pdf/1809.05053.pdf | |||||
:return: | :return: | ||||
""" | """ | ||||
@@ -276,6 +275,13 @@ class RTELoader(Loader): | |||||
return ds | return ds | ||||
def download(self):
    """
    If your experiments use this data, please cite the GLUE Benchmark:
    https://openreview.net/pdf?id=rJ4km2R5t7

    :return: whatever ``self._get_dataset_path('rte')`` returns
        (presumably the local dataset directory -- confirm against the base Loader)
    """
    dataset_location = self._get_dataset_path('rte')
    return dataset_location
@@ -321,10 +327,17 @@ class QuoraLoader(Loader): | |||||
return ds | return ds | ||||
def download(self):
    """
    Automatic download is not offered because of copyright restrictions.

    For reference see https://www.kaggle.com/c/quora-question-pairs/data

    :return: never returns normally
    :raises RuntimeError: always
    """
    reason = "Quora cannot be downloaded automatically."
    raise RuntimeError(reason)
class XNLILoader(Loader): | |||||
class CNXNLILoader(Loader): | |||||
""" | """ | ||||
别名: | 别名: | ||||
数据集简介:中文句对NLI(本为multi-lingual的数据集,但是这里只取了中文的数据集)。原句子已被MOSES tokenizer处理 | 数据集简介:中文句对NLI(本为multi-lingual的数据集,但是这里只取了中文的数据集)。原句子已被MOSES tokenizer处理 | ||||
@@ -341,7 +354,7 @@ class XNLILoader(Loader): | |||||
""" | """ | ||||
def __init__(self): | def __init__(self): | ||||
super(XNLILoader, self).__init__() | |||||
super(CNXNLILoader, self).__init__() | |||||
def _load(self, path: str = None): | def _load(self, path: str = None): | ||||
csv_loader = CSVLoader(sep='\t') | csv_loader = CSVLoader(sep='\t') | ||||
@@ -384,7 +397,7 @@ class XNLILoader(Loader): | |||||
https://arxiv.org/pdf/1809.05053.pdf 有使用 | https://arxiv.org/pdf/1809.05053.pdf 有使用 | ||||
:return: | :return: | ||||
""" | """ | ||||
output_dir = self._get_dataset_path('xnli') | |||||
output_dir = self._get_dataset_path('cn-xnli') | |||||
return output_dir | return output_dir | ||||
@@ -423,6 +436,16 @@ class BQCorpusLoader(Loader): | |||||
ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) | ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) | ||||
return ds | return ds | ||||
def download(self):
    """
    Automatic download is not offered because of copyright restrictions.

    For reference see https://github.com/ymcui/Chinese-BERT-wwm

    :return: never returns normally
    :raises RuntimeError: always
    """
    reason = "BQCorpus cannot be downloaded automatically."
    raise RuntimeError(reason)
class LCQMCLoader(Loader): | class LCQMCLoader(Loader): | ||||
""" | """ | ||||
@@ -461,16 +484,14 @@ class LCQMCLoader(Loader): | |||||
ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) | ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target)) | ||||
return ds | return ds | ||||
def download(self):
    """
    Automatic download is not offered because of copyright restrictions.

    The corpus comes from the paper "LCQMC: A Large-scale Chinese Question
    Matching Corpus", in Proceedings of the 27th International Conference on
    Computational Linguistics, 1952-1962.
    For reference see https://github.com/ymcui/Chinese-BERT-wwm

    :return: never returns normally
    :raises RuntimeError: always
    """
    # The earlier body was parked inside a ''' string literal and asked for the
    # 'chn-senti-corp' dataset, which is not LCQMC. Fail explicitly instead,
    # matching the other loaders that cannot be downloaded automatically.
    raise RuntimeError("LCQMC cannot be downloaded automatically.")
@@ -7,7 +7,7 @@ __all__ = [ | |||||
"QuoraBertPipe", | "QuoraBertPipe", | ||||
"QNLIBertPipe", | "QNLIBertPipe", | ||||
"MNLIBertPipe", | "MNLIBertPipe", | ||||
"XNLIBertPipe", | |||||
"CNXNLIBertPipe", | |||||
"BQCorpusBertPipe", | "BQCorpusBertPipe", | ||||
"LCQMCBertPipe", | "LCQMCBertPipe", | ||||
"MatchingPipe", | "MatchingPipe", | ||||
@@ -16,7 +16,7 @@ __all__ = [ | |||||
"QuoraPipe", | "QuoraPipe", | ||||
"QNLIPipe", | "QNLIPipe", | ||||
"MNLIPipe", | "MNLIPipe", | ||||
"XNLIPipe", | |||||
"CNXNLIPipe", | |||||
"BQCorpusPipe", | "BQCorpusPipe", | ||||
"LCQMCPipe", | "LCQMCPipe", | ||||
] | ] | ||||
@@ -25,7 +25,7 @@ import warnings | |||||
from .pipe import Pipe | from .pipe import Pipe | ||||
from .utils import get_tokenizer | from .utils import get_tokenizer | ||||
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, XNLILoader, LCQMCLoader | |||||
from ..loader.matching import SNLILoader, MNLILoader, QNLILoader, RTELoader, QuoraLoader, BQCorpusLoader, CNXNLILoader, LCQMCLoader | |||||
from ...core.const import Const | from ...core.const import Const | ||||
from ...core.vocabulary import Vocabulary | from ...core.vocabulary import Vocabulary | ||||
from ...core._logger import logger | from ...core._logger import logger | ||||
@@ -354,10 +354,10 @@ class LCQMCPipe(MatchingPipe): | |||||
return data_bundle | return data_bundle | ||||
class XNLIPipe(MatchingPipe): | |||||
def process_from_file(self, paths = None): | |||||
data_bundle = XNLILoader().load(paths) | |||||
data_bundle = GranularizePipe(task = 'XNLI').process(data_bundle) | |||||
class CNXNLIPipe(MatchingPipe): | |||||
def process_from_file(self, paths=None): | |||||
data_bundle = CNXNLILoader().load(paths) | |||||
data_bundle = GranularizePipe(task='XNLI').process(data_bundle) | |||||
data_bundle = RenamePipe().process(data_bundle) #使中文数据的field | data_bundle = RenamePipe().process(data_bundle) #使中文数据的field | ||||
data_bundle = self.process(data_bundle) | data_bundle = self.process(data_bundle) | ||||
data_bundle = RenamePipe().process(data_bundle) | data_bundle = RenamePipe().process(data_bundle) | ||||
@@ -473,9 +473,9 @@ class BQCorpusBertPipe(MatchingBertPipe): | |||||
return data_bundle | return data_bundle | ||||
class XNLIBertPipe(MatchingBertPipe): | |||||
class CNXNLIBertPipe(MatchingBertPipe): | |||||
def process_from_file(self, paths = None): | def process_from_file(self, paths = None): | ||||
data_bundle = XNLILoader().load(paths) | |||||
data_bundle = CNXNLILoader().load(paths) | |||||
data_bundle = GranularizePipe(task='XNLI').process(data_bundle) | data_bundle = GranularizePipe(task='XNLI').process(data_bundle) | ||||
data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) | data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) | ||||
data_bundle = self.process(data_bundle) | data_bundle = self.process(data_bundle) | ||||
@@ -5,7 +5,7 @@ import os | |||||
from fastNLP.io import DataBundle | from fastNLP.io import DataBundle | ||||
from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \ | from fastNLP.io.loader.matching import RTELoader, QNLILoader, SNLILoader, QuoraLoader, MNLILoader, \ | ||||
BQCorpusLoader, XNLILoader, LCQMCLoader | |||||
BQCorpusLoader, CNXNLILoader, LCQMCLoader | |||||
@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | ||||
@@ -31,7 +31,7 @@ class TestMatchingLoad(unittest.TestCase): | |||||
'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True), | 'MNLI': ('test/data_for_tests/io/MNLI', MNLILoader, (5, 5, 5, 5, 6), True), | ||||
'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False), | 'Quora': ('test/data_for_tests/io/Quora', QuoraLoader, (2, 2, 2), False), | ||||
'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False), | 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusLoader, (5, 5, 5), False), | ||||
'XNLI': ('test/data_for_tests/io/XNLI', XNLILoader, (6, 7, 6), False), | |||||
'XNLI': ('test/data_for_tests/io/XNLI', CNXNLILoader, (6, 7, 6), False), | |||||
'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False), | 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCLoader, (5, 6, 6), False), | ||||
} | } | ||||
for k, v in data_set_dict.items(): | for k, v in data_set_dict.items(): | ||||
@@ -4,9 +4,9 @@ import os | |||||
from fastNLP.io import DataBundle | from fastNLP.io import DataBundle | ||||
from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \ | from fastNLP.io.pipe.matching import SNLIPipe, RTEPipe, QNLIPipe, QuoraPipe, MNLIPipe, \ | ||||
XNLIPipe, BQCorpusPipe, LCQMCPipe | |||||
CNXNLIPipe, BQCorpusPipe, LCQMCPipe | |||||
from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \ | from fastNLP.io.pipe.matching import SNLIBertPipe, RTEBertPipe, QNLIBertPipe, QuoraBertPipe, MNLIBertPipe, \ | ||||
XNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe | |||||
CNXNLIBertPipe, BQCorpusBertPipe, LCQMCBertPipe | |||||
@unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis") | ||||
@@ -38,7 +38,7 @@ class TestRunMatchingPipe(unittest.TestCase): | |||||
'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True), | 'QNLI': ('test/data_for_tests/io/QNLI', QNLIPipe, QNLIBertPipe, (5, 5, 5), (372, 2), True), | ||||
'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True), | 'MNLI': ('test/data_for_tests/io/MNLI', MNLIPipe, MNLIBertPipe, (5, 5, 5, 5, 6), (459, 3), True), | ||||
'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False), | 'BQCorpus': ('test/data_for_tests/io/BQCorpus', BQCorpusPipe, BQCorpusBertPipe, (5, 5, 5), (32, 2), False), | ||||
'XNLI': ('test/data_for_tests/io/XNLI', XNLIPipe, XNLIBertPipe, (6, 7, 6), (37, 3), False), | |||||
'XNLI': ('test/data_for_tests/io/XNLI', CNXNLIPipe, CNXNLIBertPipe, (6, 7, 6), (37, 3), False), | |||||
'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False), | 'LCQMC': ('test/data_for_tests/io/LCQMC', LCQMCPipe, LCQMCBertPipe, (5, 6, 6), (36, 2), False), | ||||
} | } | ||||
for k, v in data_set_dict.items(): | for k, v in data_set_dict.items(): | ||||