|
- """undocumented"""
-
- __all__ = [
- "MNLILoader",
- "SNLILoader",
- "QNLILoader",
- "RTELoader",
- "QuoraLoader",
- "BQCorpusLoader",
- "XNLILoader",
- "LCQMCLoader"
- ]
-
- import os
- import warnings
- from typing import Union, Dict
-
- from .json import JsonLoader
- from .loader import Loader
- from .. import DataBundle
- from ...core.const import Const
- from ...core.dataset import DataSet
- from ...core.instance import Instance
- from .csv import CSVLoader
- from ..utils import check_loader_paths
-
-
- class MNLILoader(Loader):
- """
- 读取MNLI任务的数据,读取之后的DataSet中包含以下的内容,words0是sentence1, words1是sentence2, target是gold_label, 测试集中没
- 有target列。
-
- .. csv-table::
- :header: "raw_words1", "raw_words2", "target"
-
- "The new rights are...", "Everyone really likes..", "neutral"
- "This site includes a...", "The Government Executive...", "contradiction"
- "...", "...","."
-
- """
-
- def __init__(self):
- super().__init__()
-
- def _load(self, path: str):
- ds = DataSet()
- with open(path, 'r', encoding='utf-8') as f:
- f.readline() # 跳过header
- if path.endswith("test_matched.tsv") or path.endswith('test_mismatched.tsv'):
- warnings.warn("RTE's test file has no target.")
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[8]
- raw_words2 = parts[9]
- if raw_words1 and raw_words2:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2))
- else:
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[8]
- raw_words2 = parts[9]
- target = parts[-1]
- if raw_words1 and raw_words2 and target:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target))
- return ds
-
- def load(self, paths: str = None):
- """
-
- :param str paths: 传入数据所在目录,会在该目录下寻找dev_matched.tsv, dev_mismatched.tsv, test_matched.tsv,
- test_mismatched.tsv, train.tsv文件夹
- :return: DataBundle
- """
- if paths:
- paths = os.path.abspath(os.path.expanduser(paths))
- else:
- paths = self.download()
- if not os.path.isdir(paths):
- raise NotADirectoryError(f"{paths} is not a valid directory.")
-
- files = {'dev_matched': "dev_matched.tsv",
- "dev_mismatched": "dev_mismatched.tsv",
- "test_matched": "test_matched.tsv",
- "test_mismatched": "test_mismatched.tsv",
- "train": 'train.tsv'}
-
- datasets = {}
- for name, filename in files.items():
- filepath = os.path.join(paths, filename)
- if not os.path.isfile(filepath):
- if 'test' not in name:
- raise FileNotFoundError(f"{name} not found in directory {filepath}.")
- datasets[name] = self._load(filepath)
-
- data_bundle = DataBundle(datasets=datasets)
-
- return data_bundle
-
- def download(self):
- """
- 如果你使用了这个数据,请引用
-
- https://www.nyu.edu/projects/bowman/multinli/paper.pdf
- :return:
- """
- output_dir = self._get_dataset_path('mnli')
- return output_dir
-
-
- class SNLILoader(JsonLoader):
- """
- 读取之后的DataSet中的field情况为
-
- .. csv-table:: 下面是使用SNLILoader加载的DataSet所具备的field
- :header: "raw_words1", "raw_words2", "target"
-
- "The new rights are...", "Everyone really likes..", "neutral"
- "This site includes a...", "The Government Executive...", "entailment"
- "...", "...", "."
-
- """
-
- def __init__(self):
- super().__init__(fields={
- 'sentence1': Const.RAW_WORDS(0),
- 'sentence2': Const.RAW_WORDS(1),
- 'gold_label': Const.TARGET,
- })
-
- def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
- """
- 从指定一个或多个路径中的文件中读取数据,返回 :class:`~fastNLP.io.DataBundle` 。
-
- 读取的field根据ConllLoader初始化时传入的headers决定。
-
- :param str paths: 传入一个目录, 将在该目录下寻找snli_1.0_train.jsonl, snli_1.0_dev.jsonl
- 和snli_1.0_test.jsonl三个文件。
-
- :return: 返回的:class:`~fastNLP.io.DataBundle`
- """
- _paths = {}
- if paths is None:
- paths = self.download()
- if paths:
- if os.path.isdir(paths):
- if not os.path.isfile(os.path.join(paths, 'snli_1.0_train.jsonl')):
- raise FileNotFoundError(f"snli_1.0_train.jsonl is not found in {paths}")
- _paths['train'] = os.path.join(paths, 'snli_1.0_train.jsonl')
- for filename in ['snli_1.0_dev.jsonl', 'snli_1.0_test.jsonl']:
- filepath = os.path.join(paths, filename)
- _paths[filename.split('_')[-1].split('.')[0]] = filepath
- paths = _paths
- else:
- raise NotADirectoryError(f"{paths} is not a valid directory.")
-
- datasets = {name: self._load(path) for name, path in paths.items()}
- data_bundle = DataBundle(datasets=datasets)
- return data_bundle
-
- def download(self):
- """
- 如果您的文章使用了这份数据,请引用
-
- http://nlp.stanford.edu/pubs/snli_paper.pdf
-
- :return: str
- """
- return self._get_dataset_path('snli')
-
-
- class QNLILoader(JsonLoader):
- """
- QNLI数据集的Loader,
- 加载的DataSet将具备以下的field, raw_words1是question, raw_words2是sentence, target是label
-
- .. csv-table::
- :header: "raw_words1", "raw_words2", "target"
-
- "What came into force after the new...", "As of that day...", "entailment"
- "What is the first major...", "The most important tributaries", "not_entailment"
- "...","."
-
- test数据集没有target列
-
- """
-
- def __init__(self):
- super().__init__()
-
- def _load(self, path):
- ds = DataSet()
-
- with open(path, 'r', encoding='utf-8') as f:
- f.readline() # 跳过header
- if path.endswith("test.tsv"):
- warnings.warn("QNLI's test file has no target.")
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[1]
- raw_words2 = parts[2]
- if raw_words1 and raw_words2:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2))
- else:
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[1]
- raw_words2 = parts[2]
- target = parts[-1]
- if raw_words1 and raw_words2 and target:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target))
- return ds
-
- def download(self):
- """
- 如果您的实验使用到了该数据,请引用
-
- .. todo::
- 补充
-
- :return:
- """
- return self._get_dataset_path('qnli')
-
-
- class RTELoader(Loader):
- """
- RTE数据的loader
- 加载的DataSet将具备以下的field, raw_words1是sentence0,raw_words2是sentence1, target是label
-
- .. csv-table::
- :header: "raw_words1", "raw_words2", "target"
-
- "Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment"
- "Yet, we now are discovering that...", "Bacteria is winning...", "entailment"
- "...","."
-
- test数据集没有target列
- """
-
- def __init__(self):
- super().__init__()
-
- def _load(self, path: str):
- ds = DataSet()
-
- with open(path, 'r', encoding='utf-8') as f:
- f.readline() # 跳过header
- if path.endswith("test.tsv"):
- warnings.warn("RTE's test file has no target.")
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[1]
- raw_words2 = parts[2]
- if raw_words1 and raw_words2:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2))
- else:
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[1]
- raw_words2 = parts[2]
- target = parts[-1]
- if raw_words1 and raw_words2 and target:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target))
- return ds
-
- def download(self):
- return self._get_dataset_path('rte')
-
-
- class QuoraLoader(Loader):
- """
- Quora matching任务的数据集Loader
-
- 支持读取的文件中的内容,应该有以下的形式, 以制表符分隔,且前三列的内容必须是:第一列是label,第二列和第三列是句子
-
- Example::
-
- 1 How do I get funding for my web based startup idea ? How do I get seed funding pre product ? 327970
- 1 How can I stop my depression ? What can I do to stop being depressed ? 339556
- ...
-
- 加载的DataSet将具备以下的field
-
- .. csv-table::
- :header: "raw_words1", "raw_words2", "target"
-
- "What should I do to avoid...", "1"
- "How do I not sleep in a boring class...", "0"
- "...","."
-
- """
-
- def __init__(self):
- super().__init__()
-
- def _load(self, path: str):
- ds = DataSet()
-
- with open(path, 'r', encoding='utf-8') as f:
- for line in f:
- line = line.strip()
- if line:
- parts = line.split('\t')
- raw_words1 = parts[1]
- raw_words2 = parts[2]
- target = parts[0]
- if raw_words1 and raw_words2 and target:
- ds.append(Instance(raw_words1=raw_words1, raw_words2=raw_words2, target=target))
- return ds
-
- def download(self):
- raise RuntimeError("Quora cannot be downloaded automatically.")
-
-
- class XNLILoader(Loader):
- """
- 别名:
- 数据集简介:中文句对NLI(本为multi-lingual的数据集,但是这里只取了中文的数据集)。原句子已被MOSES tokenizer处理
- 原始数据为:
- train中的数据包括premise,hypo和label三个field
- dev和test中的数据为csv或json格式,包括十多个field,这里只取与以上三个field中的数据
- 读取后的Dataset将具有以下数据结构:
-
- .. csv-table::
- :header: "raw_chars1", "raw_chars2", "target"
- "从概念上看,奶油收入有两个基本方面产品和地理.", "产品和地理是什么使奶油抹霜工作.", "1"
- ""...", "...", "..."
-
- """
-
- def __init__(self):
- super(XNLILoader, self).__init__()
-
- def _load(self, path: str = None):
- csv_loader = CSVLoader(sep='\t')
- ds_all = csv_loader._load(path)
- ds_zh = DataSet()
- for i in ds_all:
- if i['language'] == 'zh':
- ds_zh.append(Instance(raw_chars1=i['sentence1'], raw_chars2=i['sentence2'], target=i['gold_label']))
-
- return ds_zh
-
- def _load_train(self, path: str = None):
- csv_loader = CSVLoader(sep='\t')
- ds = csv_loader._load(path)
- ds.rename_field('label', 'target')
- ds.rename_field('premise', 'raw_chars1')
- ds.rename_field('hypo', 'raw_chars2')
- ds.apply(lambda i: "".join(i['raw_chars1'].split()), new_field_name='raw_chars1')
- ds.apply(lambda i: "".join(i['raw_chars2'].split()), new_field_name='raw_chars2')
- return ds
-
- def load(self, paths: Union[str, Dict[str, str]] = None) -> DataBundle:
- if paths is None:
- paths = self.download()
- paths = check_loader_paths(paths)
- datasets = {}
- for name, path in paths.items():
- if name == 'train':
- datasets[name] = self._load_train(path)
- else:
- datasets[name] = self._load(path)
-
- data_bundle = DataBundle(datasets=datasets)
- return data_bundle
-
-
- class BQCorpusLoader(Loader):
- """
- 别名:
- 数据集简介:句子对二分类任务(判断是否具有相同的语义)
- 原始数据内容为:
- 每行一个sample,第一个','之前为text1,第二个','之前为text2,第二个','之后为target
- 第一行为sentence1 sentence2 label
- 读取后的Dataset将具有以下数据结构:
-
- .. csv-table::
- :header: "raw_chars1", "raw_chars2", "target"
- "不是邀请的如何贷款?", "我不是你们邀请的客人可以贷款吗?", "1"
- "如何满足微粒银行的审核", "建设银行有微粒贷的资格吗", "0"
- "...", "...", "..."
-
- """
-
- def __init__(self):
- super(BQCorpusLoader, self).__init__()
-
- def _load(self, path: str = None):
- ds = DataSet()
- with open(path, 'r', encoding='utf-8') as f:
- next(f)
- for line in f:
- line = line.strip()
- target = line[-1]
- sep_index = line.index(',')
- raw_chars1 = line[:sep_index]
- raw_chars2 = line[sep_index + 1:]
-
- if raw_chars1:
- ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
- return ds
-
-
- class LCQMCLoader(Loader):
- """
- 别名:
- 数据集简介:句对匹配(question matching)
- 原始数据为:
- '喜欢打篮球的男生喜欢什么样的女生\t爱打篮球的男生喜欢什么样的女生\t1\n'
- '晚上睡觉带着耳机听音乐有什么害处吗?\t孕妇可以戴耳机听音乐吗?\t0\n'
- 读取后的Dataset将具有以下的数据结构:
-
- .. csv-table::
- :header: "raw_chars1", "raw_chars2", "target"
- "喜欢打篮球的男生喜欢什么样的女生?", "爱打篮球的男生喜欢什么样的女生?", "1"
- "晚上睡觉带着耳机听音乐有什么害处吗?", "妇可以戴耳机听音乐吗?", "0"
- ""...", "...", "..."
-
- """
-
- def __init__(self):
- super(LCQMCLoader, self).__init__()
-
- def _load(self, path: str = None):
- ds = DataSet()
- with open(path, 'r', encoding='utf-8') as f:
- for line in f:
- line = line.strip()
- line_segments = line.split('\t')
- assert len(line_segments) == 3
-
- target = line_segments[-1]
-
- raw_chars1 = line_segments[0]
- raw_chars2 = line_segments[1]
-
- if raw_chars1:
- ds.append(Instance(raw_chars1=raw_chars1, raw_chars2=raw_chars2, target=target))
- return ds
-
- '''
- def download(self)->str:
- """
- 自动下载数据,该数据取自论文 LCQMC: A Large-scale Chinese Question Matching Corpus.
- InProceedings of the 27thInternational Conference on Computational Linguistics. 1952–1962.
-
- :return:
- """
- output_dir = self._get_dataset_path('chn-senti-corp')
- return output_dir
- '''
-
|