diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index 05d75f43..fe4ca245 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -17,17 +17,17 @@ __all__ = [ 'CSVLoader', 'JsonLoader', - 'ConllLoader', - 'PeopleDailyCorpusLoader', - 'Conll2003Loader', 'ModelLoader', 'ModelSaver', - 'SSTLoader', - + 'ConllLoader', + 'Conll2003Loader', 'MatchingLoader', + 'PeopleDailyCorpusLoader', 'SNLILoader', + 'SSTLoader', + 'SST2Loader', 'MNLILoader', 'QNLILoader', 'QuoraLoader', @@ -36,10 +36,7 @@ __all__ = [ from .embed_loader import EmbedLoader from .base_loader import DataInfo, DataSetLoader -from .dataset_loader import CSVLoader, JsonLoader, ConllLoader, \ - PeopleDailyCorpusLoader, Conll2003Loader +from .dataset_loader import CSVLoader, JsonLoader from .model_io import ModelLoader, ModelSaver -from .data_loader.sst import SSTLoader -from .data_loader.matching import MatchingLoader, SNLILoader, \ - MNLILoader, QNLILoader, QuoraLoader, RTELoader +from .data_loader import * diff --git a/fastNLP/io/data_loader/__init__.py b/fastNLP/io/data_loader/__init__.py index 893ef0e2..d4777ff8 100644 --- a/fastNLP/io/data_loader/__init__.py +++ b/fastNLP/io/data_loader/__init__.py @@ -4,26 +4,32 @@ 这些模块的使用方法如下: """ __all__ = [ + 'ConllLoader', + 'Conll2003Loader', 'IMDBLoader', 'MatchingLoader', 'MNLILoader', 'MTL16Loader', + 'PeopleDailyCorpusLoader', 'QNLILoader', 'QuoraLoader', 'RTELoader', 'SSTLoader', + 'SST2Loader', 'SNLILoader', 'YelpLoader', ] +from .conll import ConllLoader, Conll2003Loader from .imdb import IMDBLoader from .matching import MatchingLoader from .mnli import MNLILoader from .mtl import MTL16Loader +from .people_daily import PeopleDailyCorpusLoader from .qnli import QNLILoader from .quora import QuoraLoader from .rte import RTELoader from .snli import SNLILoader -from .sst import SSTLoader +from .sst import SSTLoader, SST2Loader from .yelp import YelpLoader diff --git a/fastNLP/io/data_loader/conll.py b/fastNLP/io/data_loader/conll.py new file mode 100644 index 00000000..9a7cb674 --- /dev/null +++ b/fastNLP/io/data_loader/conll.py @@ -0,0 +1,73 @@ + +from ...core import DataSet +from ...core import Instance +from ..base_loader import DataSetLoader +from ..file_reader import _read_conll + + +class ConllLoader(DataSetLoader): + """ + 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.data_loader.ConllLoader` + + 读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为 + 该符号在conll 2003中被用为文档分割符。 + + 列号从0开始, 每列对应内容为:: + + Column Type + 0 Document ID + 1 Part number + 2 Word number + 3 Word itself + 4 Part-of-Speech + 5 Parse bit + 6 Predicate lemma + 7 Predicate Frameset ID + 8 Word sense + 9 Speaker/Author + 10 Named Entities + 11:N Predicate Arguments + N Coreference + + :param headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 + :param indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` + :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``False`` + """ + + def __init__(self, headers, indexes=None, dropna=False): + super(ConllLoader, self).__init__() + if not isinstance(headers, (list, tuple)): + raise TypeError( + 'invalid headers: {}, should be list of strings'.format(headers)) + self.headers = headers + self.dropna = dropna + if indexes is None: + self.indexes = list(range(len(self.headers))) + else: + if len(indexes) != len(headers): + raise ValueError + self.indexes = indexes + + def _load(self, path): + ds = DataSet() + for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): + ins = {h: data[i] for i, h in enumerate(self.headers)} + ds.append(Instance(**ins)) + return ds + + +class Conll2003Loader(ConllLoader): + """ + 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.dataset_loader.Conll2003Loader` + + 读取Conll2003数据 + + 关于数据集的更多信息,参考: + https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data + """ + + def __init__(self): + headers = [ + 'tokens', 'pos', 'chunks', 'ner', + ] + super(Conll2003Loader, self).__init__(headers=headers) diff --git a/fastNLP/io/data_loader/people_daily.py b/fastNLP/io/data_loader/people_daily.py new file mode 100644 index 00000000..d8c55aef --- /dev/null +++ b/fastNLP/io/data_loader/people_daily.py @@ -0,0 +1,85 @@ + +from ..base_loader import DataSetLoader +from ...core.dataset import DataSet +from ...core.instance import Instance +from ...core.const import Const + + +class PeopleDailyCorpusLoader(DataSetLoader): + """ + 别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.dataset_loader.PeopleDailyCorpusLoader` + + 读取人民日报数据集 + """ + + def __init__(self, pos=True, ner=True): + super(PeopleDailyCorpusLoader, self).__init__() + self.pos = pos + self.ner = ner + + def _load(self, data_path): + with open(data_path, "r", encoding="utf-8") as f: + sents = f.readlines() + examples = [] + for sent in sents: + if len(sent) <= 2: + continue + inside_ne = False + sent_pos_tag = [] + sent_words = [] + sent_ner = [] + words = sent.strip().split()[1:] + for word in words: + if "[" in word and "]" in word: + ner_tag = "U" + print(word) + elif "[" in word: + inside_ne = True + ner_tag = "B" + word = word[1:] + elif "]" in word: + ner_tag = "L" + word = word[:word.index("]")] + if inside_ne is True: + inside_ne = False + else: + raise RuntimeError("only ] appears!") + else: + if inside_ne is True: + ner_tag = "I" + else: + ner_tag = "O" + tmp = word.split("/") + token, pos = tmp[0], tmp[1] + sent_ner.append(ner_tag) + sent_pos_tag.append(pos) + sent_words.append(token) + example = [sent_words] + if self.pos is True: + example.append(sent_pos_tag) + if self.ner is True: + example.append(sent_ner) + examples.append(example) + return self.convert(examples) + + def convert(self, data): + """ + + :param data: python 内置对象 + :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 + """ + data_set = DataSet() + for item in data: + sent_words = item[0] + if self.pos is True and self.ner is True: + instance = Instance( + words=sent_words, pos_tags=item[1], ner=item[2]) + elif self.pos is True: + instance = Instance(words=sent_words, pos_tags=item[1]) + elif self.ner is True: + instance = Instance(words=sent_words, ner=item[1]) + else: + instance = Instance(words=sent_words) + data_set.append(instance) + data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN) + return data_set diff --git a/fastNLP/io/dataset_loader.py b/fastNLP/io/dataset_loader.py index 2881e6e9..ad6bbdc1 100644 --- a/fastNLP/io/dataset_loader.py +++ b/fastNLP/io/dataset_loader.py @@ -15,199 +15,13 @@ dataset_loader模块实现了许多 DataSetLoader, 用于读取不同格式的 __all__ = [ 'CSVLoader', 'JsonLoader', - 'ConllLoader', - 'PeopleDailyCorpusLoader', - 'Conll2003Loader', ] -import os -from nltk import Tree -from typing import Union, Dict -from ..core.vocabulary import Vocabulary + from ..core.dataset import DataSet from ..core.instance import Instance -from .file_reader import _read_csv, _read_json, _read_conll -from .base_loader import DataSetLoader, DataInfo -from ..core.const import Const -from ..modules.encoder._bert import BertTokenizer - - -class PeopleDailyCorpusLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.PeopleDailyCorpusLoader` :class:`fastNLP.io.dataset_loader.PeopleDailyCorpusLoader` - - 读取人民日报数据集 - """ - - def __init__(self, pos=True, ner=True): - super(PeopleDailyCorpusLoader, self).__init__() - self.pos = pos - self.ner = ner - - def _load(self, data_path): - with open(data_path, "r", encoding="utf-8") as f: - sents = f.readlines() - examples = [] - for sent in sents: - if len(sent) <= 2: - continue - inside_ne = False - sent_pos_tag = [] - sent_words = [] - sent_ner = [] - words = sent.strip().split()[1:] - for word in words: - if "[" in word and "]" in word: - ner_tag = "U" - print(word) - elif "[" in word: - inside_ne = True - ner_tag = "B" - word = word[1:] - elif "]" in word: - ner_tag = "L" - word = word[:word.index("]")] - if inside_ne is True: - inside_ne = False - else: - raise RuntimeError("only ] appears!") - else: - if inside_ne is True: - ner_tag = "I" - else: - ner_tag = "O" - tmp = word.split("/") - token, pos = tmp[0], tmp[1] - sent_ner.append(ner_tag) - sent_pos_tag.append(pos) - sent_words.append(token) - example = [sent_words] - if self.pos is True: - example.append(sent_pos_tag) - if self.ner is True: - example.append(sent_ner) - examples.append(example) - return self.convert(examples) - - def convert(self, data): - """ - - :param data: python 内置对象 - :return: 一个 :class:`~fastNLP.DataSet` 类型的对象 - """ - data_set = DataSet() - for item in data: - sent_words = item[0] - if self.pos is True and self.ner is True: - instance = Instance( - words=sent_words, pos_tags=item[1], ner=item[2]) - elif self.pos is True: - instance = Instance(words=sent_words, pos_tags=item[1]) - elif self.ner is True: - instance = Instance(words=sent_words, ner=item[1]) - else: - instance = Instance(words=sent_words) - data_set.append(instance) - data_set.apply(lambda ins: len(ins[Const.INPUT]), new_field_name=Const.INPUT_LEN) - return data_set - - -class ConllLoader(DataSetLoader): - """ - 别名::class:`fastNLP.io.ConllLoader` :class:`fastNLP.io.dataset_loader.ConllLoader` - - 读取Conll格式的数据. 数据格式详见 http://conll.cemantix.org/2012/data.html. 数据中以"-DOCSTART-"开头的行将被忽略,因为 - 该符号在conll 2003中被用为文档分割符。 - - 列号从0开始, 每列对应内容为:: - - Column Type - 0 Document ID - 1 Part number - 2 Word number - 3 Word itself - 4 Part-of-Speech - 5 Parse bit - 6 Predicate lemma - 7 Predicate Frameset ID - 8 Word sense - 9 Speaker/Author - 10 Named Entities - 11:N Predicate Arguments - N Coreference - - :param headers: 每一列数据的名称,需为List or Tuple of str。``header`` 与 ``indexes`` 一一对应 - :param indexes: 需要保留的数据列下标,从0开始。若为 ``None`` ,则所有列都保留。Default: ``None`` - :param dropna: 是否忽略非法数据,若 ``False`` ,遇到非法数据时抛出 ``ValueError`` 。Default: ``False`` - """ - - def __init__(self, headers, indexes=None, dropna=False): - super(ConllLoader, self).__init__() - if not isinstance(headers, (list, tuple)): - raise TypeError( - 'invalid headers: {}, should be list of strings'.format(headers)) - self.headers = headers - self.dropna = dropna - if indexes is None: - self.indexes = list(range(len(self.headers))) - else: - if len(indexes) != len(headers): - raise ValueError - self.indexes = indexes - - def _load(self, path): - ds = DataSet() - for idx, data in _read_conll(path, indexes=self.indexes, dropna=self.dropna): - ins = {h: data[i] for i, h in enumerate(self.headers)} - ds.append(Instance(**ins)) - return ds - - -class Conll2003Loader(ConllLoader): - """ - 别名::class:`fastNLP.io.Conll2003Loader` :class:`fastNLP.io.dataset_loader.Conll2003Loader` - - 读取Conll2003数据 - - 关于数据集的更多信息,参考: - https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data - """ - - def __init__(self): - headers = [ - 'tokens', 'pos', 'chunks', 'ner', - ] - super(Conll2003Loader, self).__init__(headers=headers) - - -def _cut_long_sentence(sent, max_sample_length=200): - """ - 将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。 - 所以截取的句子可能长于或者短于max_sample_length - - :param sent: str. - :param max_sample_length: int. - :return: list of str. - """ - sent_no_space = sent.replace(' ', '') - cutted_sentence = [] - if len(sent_no_space) > max_sample_length: - parts = sent.strip().split() - new_line = '' - length = 0 - for part in parts: - length += len(part) - new_line += part + ' ' - if length > max_sample_length: - new_line = new_line[:-1] - cutted_sentence.append(new_line) - length = 0 - new_line = '' - if new_line != '': - cutted_sentence.append(new_line[:-1]) - else: - cutted_sentence.append(sent) - return cutted_sentence +from .file_reader import _read_csv, _read_json +from .base_loader import DataSetLoader class JsonLoader(DataSetLoader): @@ -272,6 +86,36 @@ class CSVLoader(DataSetLoader): return ds +def _cut_long_sentence(sent, max_sample_length=200): + """ + 将长于max_sample_length的sentence截成多段,只会在有空格的地方发生截断。 + 所以截取的句子可能长于或者短于max_sample_length + + :param sent: str. + :param max_sample_length: int. + :return: list of str. + """ + sent_no_space = sent.replace(' ', '') + cutted_sentence = [] + if len(sent_no_space) > max_sample_length: + parts = sent.strip().split() + new_line = '' + length = 0 + for part in parts: + length += len(part) + new_line += part + ' ' + if length > max_sample_length: + new_line = new_line[:-1] + cutted_sentence.append(new_line) + length = 0 + new_line = '' + if new_line != '': + cutted_sentence.append(new_line[:-1]) + else: + cutted_sentence.append(sent) + return cutted_sentence + + def _add_seg_tag(data): """ diff --git a/legacy/api/api.py b/legacy/api/api.py index d5d1df6b..1408731f 100644 --- a/legacy/api/api.py +++ b/legacy/api/api.py @@ -8,7 +8,8 @@ import os from fastNLP.core.dataset import DataSet from .utils import load_url from .processor import ModelProcessor -from fastNLP.io.dataset_loader import _cut_long_sentence, ConllLoader +from fastNLP.io.dataset_loader import _cut_long_sentence +from fastNLP.io.data_loader import ConllLoader from fastNLP.core.instance import Instance from ..api.pipeline import Pipeline from fastNLP.core.metrics import SpanFPreRecMetric diff --git a/reproduction/README.md b/reproduction/README.md index 6482ba2f..0bc0d66c 100644 --- a/reproduction/README.md +++ b/reproduction/README.md @@ -20,8 +20,8 @@ - [NER](seqence_labelling/ner) -## Coreference resolution (指代消解) -- [Coreference resolution 指代消解任务复现](coreference_resolution) +## Coreference resolution (共指消解) +- [Coreference resolution 共指消解任务复现](coreference_resolution) ## Summarization (摘要) diff --git a/reproduction/Star_transformer/datasets.py b/reproduction/Star_transformer/datasets.py index 1173d1a0..41d3f34b 100644 --- a/reproduction/Star_transformer/datasets.py +++ b/reproduction/Star_transformer/datasets.py @@ -2,8 +2,7 @@ import torch import json import os from fastNLP import Vocabulary -from fastNLP.io.dataset_loader import ConllLoader -from fastNLP.io.data_loader import SSTLoader, SNLILoader +from fastNLP.io.data_loader import ConllLoader, SSTLoader, SNLILoader from fastNLP.core import Const as C import numpy as np diff --git a/reproduction/joint_cws_parse/data/data_loader.py b/reproduction/joint_cws_parse/data/data_loader.py index 7802ea09..0644b01d 100644 --- a/reproduction/joint_cws_parse/data/data_loader.py +++ b/reproduction/joint_cws_parse/data/data_loader.py @@ -1,7 +1,7 @@ from fastNLP.io.base_loader import DataSetLoader, DataInfo -from fastNLP.io.dataset_loader import ConllLoader +from fastNLP.io.data_loader import ConllLoader import numpy as np from itertools import chain diff --git a/test/io/test_dataset_loader.py b/test/io/test_dataset_loader.py index 09ad8c83..3e3c54e2 100644 --- a/test/io/test_dataset_loader.py +++ b/test/io/test_dataset_loader.py @@ -1,8 +1,7 @@ import unittest import os -from fastNLP.io import Conll2003Loader, PeopleDailyCorpusLoader, CSVLoader, JsonLoader -from fastNLP.io.data_loader import SSTLoader, SNLILoader -from reproduction.text_classification.data.yelpLoader import yelpLoader +from fastNLP.io import CSVLoader, JsonLoader +from fastNLP.io.data_loader import SSTLoader, SNLILoader, Conll2003Loader, PeopleDailyCorpusLoader class TestDatasetLoader(unittest.TestCase): @@ -31,7 +30,7 @@ class TestDatasetLoader(unittest.TestCase): ds = JsonLoader().load('test/data_for_tests/sample_snli.jsonl') assert len(ds) == 3 - def test_SST(self): + def no_test_SST(self): train_data = """(3 (2 (2 The) (2 Rock)) (4 (3 (2 is) (4 (2 destined) (2 (2 (2 (2 (2 to) (2 (2 be) (2 (2 the) (2 (2 21st) (2 (2 (2 Century) (2 's)) (2 (3 new) (2 (2 ``) (2 Conan)))))))) (2 '')) (2 and)) (3 (2 that) (3 (2 he) (3 (2 's) (3 (2 going) (3 (2 to) (4 (3 (2 make) (3 (3 (2 a) (3 splash)) (2 (2 even) (3 greater)))) (2 (2 than) (2 (2 (2 (2 (1 (2 Arnold) (2 Schwarzenegger)) (2 ,)) (2 (2 Jean-Claud) (2 (2 Van) (2 Damme)))) (2 or)) (2 (2 Steven) (2 Segal))))))))))))) (2 .))) (4 (4 (4 (2 The) (4 (3 gorgeously) (3 (2 elaborate) (2 continuation)))) (2 (2 (2 of) (2 ``)) (2 (2 The) (2 (2 (2 Lord) (2 (2 of) (2 (2 the) (2 Rings)))) (2 (2 '') (2 trilogy)))))) (2 (3 (2 (2 is) (2 (2 so) (2 huge))) (2 (2 that) (3 (2 (2 (2 a) (2 column)) (2 (2 of) (2 words))) (2 (2 (2 (2 can) (1 not)) (3 adequately)) (2 (2 describe) (2 (3 (2 (2 co-writer\/director) (2 (2 Peter) (3 (2 Jackson) (2 's)))) (3 (2 expanded) (2 vision))) (2 (2 of) (2 (2 (2 J.R.R.) (2 (2 Tolkien) (2 's))) (2 Middle-earth))))))))) (2 .))) (3 (3 (2 (2 (2 (2 (2 Singer\/composer) (2 (2 Bryan) (2 Adams))) (2 (2 contributes) (2 (2 (2 a) (2 slew)) (2 (2 of) (2 songs))))) (2 (2 --) (2 (2 (2 (2 a) (2 (2 few) (3 potential))) (2 (2 (2 hits) (2 ,)) (2 (2 (2 a) (2 few)) (1 (1 (2 more) (1 (2 simply) (2 intrusive))) (2 (2 to) (2 (2 the) (2 story))))))) (2 --)))) (2 but)) (3 (4 (2 the) (3 (2 whole) (2 package))) (2 (3 certainly) (3 (2 captures) (2 (1 (2 the) (2 (2 (2 intended) (2 (2 ,) (2 (2 er) (2 ,)))) (3 spirit))) (2 (2 of) (2 (2 the) (2 piece)))))))) (2 .))