From 7413276997731b3e816444c1db3caf624b743405 Mon Sep 17 00:00:00 2001 From: zide05 <845465009@qq.com> Date: Sun, 22 Sep 2019 09:47:33 +0800 Subject: [PATCH] modify pipe documents --- fastNLP/io/__init__.py | 7 ++ fastNLP/io/pipe/__init__.py | 4 +- fastNLP/io/pipe/classification.py | 162 ++++++++++++++++++++++++------ fastNLP/io/pipe/conll.py | 103 +++++++++++++++---- fastNLP/io/pipe/coreference.py | 30 ++++-- fastNLP/io/pipe/cws.py | 19 +++- fastNLP/io/pipe/matching.py | 51 ++++++++-- 7 files changed, 303 insertions(+), 73 deletions(-) diff --git a/fastNLP/io/__init__.py b/fastNLP/io/__init__.py index c8b3dfaa..63fde69a 100644 --- a/fastNLP/io/__init__.py +++ b/fastNLP/io/__init__.py @@ -25,6 +25,8 @@ __all__ = [ 'SSTLoader', 'SST2Loader', "ChnSentiCorpLoader", + "THUCNewsLoader", + "WeiboSenti100kLoader", 'ConllLoader', 'Conll2003Loader', @@ -45,6 +47,9 @@ __all__ = [ "SNLILoader", "QNLILoader", "RTELoader", + "XNLILoader", + "BQCorpusLoader", + "LCQMCLoader", "Pipe", @@ -54,6 +59,8 @@ __all__ = [ "SST2Pipe", "IMDBPipe", "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", "Conll2003Pipe", "Conll2003NERPipe", diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py index 0ddb1f2d..212f9e66 100644 --- a/fastNLP/io/pipe/__init__.py +++ b/fastNLP/io/pipe/__init__.py @@ -18,6 +18,8 @@ __all__ = [ "SST2Pipe", "IMDBPipe", "ChnSentiCorpPipe", + "THUCNewsPipe", + "WeiboSenti100kPipe", "Conll2003NERPipe", "OntoNotesNERPipe", @@ -42,7 +44,7 @@ __all__ = [ "CoReferencePipe" ] -from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe +from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe diff --git a/fastNLP/io/pipe/classification.py b/fastNLP/io/pipe/classification.py index 409cfe53..1c44cc23 100644 --- a/fastNLP/io/pipe/classification.py +++ b/fastNLP/io/pipe/classification.py @@ -97,11 +97,22 @@ class YelpFullPipe(_CLSPipe): 处理YelpFull的数据, 处理之后DataSet中的内容如下 .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" + + "I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 + " Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 + "...", ., "[...]", . - "It 's a ...", "[4, 2, 10, ...]", 0, 10 - "Offers that ...", "[20, 40, ...]", 1, 21 - "...", "[...]", ., . + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ """ @@ -193,11 +204,22 @@ class YelpPolarityPipe(_CLSPipe): 处理YelpPolarity的数据, 处理之后DataSet中的内容如下 .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" - "It 's a ...", "[4, 2, 10, ...]", 0, 10 - "Offers that ...", "[20, 40, ...]", 1, 21 - "...", "[...]", ., . + "I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 + " Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 + "...", ., "[...]", . + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ """ @@ -211,6 +233,19 @@ class YelpPolarityPipe(_CLSPipe): self.lower = lower def process(self, data_bundle): + """ + 传入的DataSet应该具备如下的结构 + + .. csv-table:: + :header: "raw_words", "target" + + "I got 'new' tires from them and... ", "1" + "Don't waste your time. We had two...", "1" + "...", "..." + + :param data_bundle: + :return: + """ # 复制一列words data_bundle = _add_words_field(data_bundle, lower=self.lower) @@ -244,9 +279,20 @@ class SSTPipe(_CLSPipe): .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field :header: "raw_words", "words", "target", "seq_len" - "It 's a ...", "[4, 2, 10, ...]", 0, 16 - "Offers that ...", "[20, 40, ...]", 1, 18 - "...", "[...]", ., . + "It 's a lovely film with lovely perfor...", 1, "[187, 6, 5, 132, 120, 70, 132, 188, 25...", 13 + "No one goes unindicted here , which is...", 0, "[191, 126, 192, 193, 194, 4, 195, 17, ...", 13 + "...", ., "[...]", . + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ """ @@ -278,11 +324,11 @@ class SSTPipe(_CLSPipe): """ 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 - .. csv-table:: + .. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field :header: "raw_words" - "(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..." - "(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." + "(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..." + "(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..." "..." :param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 @@ -335,12 +381,23 @@ class SST2Pipe(_CLSPipe): 加载SST2的数据, 处理完成之后DataSet将拥有以下的field .. csv-table:: - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" - "it 's a charming and... ", "[3, 4, 5, 6, 7,...]", 1, 43 - "unflinchingly bleak and...", "[10, 11, 7,...]", 1, 21 + "it 's a charming and often affecting j... ", 1, "[19, 9, 6, 111, 5, 112, 113, 114, 3]", 9 + "unflinchingly bleak and desperate", 0, "[115, 116, 5, 117]", 4 "...", "...", ., . + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def __init__(self, lower=False, tokenizer='spacy'): @@ -357,11 +414,11 @@ class SST2Pipe(_CLSPipe): 可以处理的DataSet应该具备如下的结构 .. csv-table:: - :header: "raw_words", "target" + :header: "raw_words", "target" - "it 's a charming and... ", 1 - "unflinchingly bleak and...", 1 - "...", "..." + "it 's a charming and often affecting...", "1" + "unflinchingly bleak and...", "0" + "..." :param data_bundle: :return: @@ -420,15 +477,26 @@ class IMDBPipe(_CLSPipe): 经过本Pipe处理后DataSet将如下 .. csv-table:: 输出DataSet的field - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" - "Bromwell High is a cartoon ... ", "[3, 5, 6, 9, ...]", 0, 20 - "Story of a man who has ...", "[20, 43, 9, 10, ...]", 1, 31 - "...", "[...]", ., . + "Bromwell High is a cartoon ... ", 0, "[3, 5, 6, 9, ...]", 20 + "Story of a man who has ...", 1, "[20, 43, 9, 10, ...]", 31 + "...", ., "[...]", . 其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; words列被设置为input; target列被设置为target。 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | False | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): @@ -493,13 +561,23 @@ class ChnSentiCorpPipe(Pipe): 处理之后的DataSet有以下的结构 .. csv-table:: - :header: "raw_chars", "chars", "target", "seq_len" + :header: "raw_chars", "target", "chars", "seq_len" - "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "[2, 3, 4, 5, ...]", 1, 31 - "<荐书> 推荐所有喜欢<红楼>...", "[10, 21, ....]", 1, 25 + "這間酒店環境和服務態度亦算不錯,但房間空間太小~~", 1, "[2, 3, 4, 5, ...]", 31 + "<荐书> 推荐所有喜欢<红楼>...", 1, "[10, 21, ....]", 25 "..." 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ """ def __init__(self, bigrams=False, trigrams=False): @@ -590,12 +668,22 @@ class THUCNewsPipe(_CLSPipe): 处理之后的DataSet有以下的结构 .. csv-table:: - :header: "raw_chars", "chars", "target", "seq_len" + :header: "raw_chars", "target", "chars", "seq_len" - "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", "[409, 1197, 2146, 213, ...]", 0, 746 + "马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", 0, "[409, 1197, 2146, 213, ...]", 746 "..." 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 @@ -691,12 +779,22 @@ class WeiboSenti100kPipe(_CLSPipe): 处理之后的DataSet有以下的结构 .. csv-table:: - :header: "raw_chars", "chars", "target", "seq_len" + :header: "raw_chars", "target", "chars", "seq_len" - "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "[0, 690, 18, ...]", 0, 56 + "六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", 0, "[0, 690, 18, ...]", 56 "..." 其中chars, seq_len是input,target是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | False | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 diff --git a/fastNLP/io/pipe/conll.py b/fastNLP/io/pipe/conll.py index 70af5acb..918cff9f 100644 --- a/fastNLP/io/pipe/conll.py +++ b/fastNLP/io/pipe/conll.py @@ -87,15 +87,26 @@ class Conll2003NERPipe(_NERPipe): 经过该Pipe过后,DataSet中的内容如下所示 .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" - "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4,...]", 6 + "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6 "[...]", "[...]", "[...]", . raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def process_from_file(self, paths) -> DataBundle: @@ -112,17 +123,28 @@ class Conll2003NERPipe(_NERPipe): class Conll2003Pipe(Pipe): - r""" + """ 经过该Pipe后,DataSet中的内容如下 .. csv-table:: - :header: "raw_words" , "words", "pos", "chunk", "ner", "seq_len" + :header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len" - "[Nadim, Ladki]", "[2, 3]", "[0, 0]", "[1, 2]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", 6 + "[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6 "[...]", "[...]", "[...]", "[...]", "[...]", . 其中words, seq_len是input; pos, chunk, ner, seq_len是target + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+-------+-------+-------+-------+---------+ + | field_names | raw_words | pos | chunk | ner | words | seq_len | + +-------------+-----------+-------+-------+-------+-------+---------+ + | is_input | False | False | False | False | True | True | + | is_target | False | True | True | True | False | True | + | ignore_type | | False | False | False | False | False | + | pad_value | | 0 | 0 | 0 | 0 | 0 | + +-------------+-----------+-------+-------+-------+-------+---------+ + """ def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): @@ -202,15 +224,26 @@ class OntoNotesNERPipe(_NERPipe): 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 .. csv-table:: - :header: "raw_words", "words", "target", "seq_len" + :header: "raw_words", "target", "words", "seq_len" - "[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 - "[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 + "[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 + "[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6 "[...]", "[...]", "[...]", . raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_words | target | words | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def process_from_file(self, paths): @@ -306,15 +339,26 @@ class MsraNERPipe(_CNNERPipe): 处理MSRA-NER的数据,处理之后的DataSet的field情况为 .. csv-table:: - :header: "raw_chars", "chars", "target", "seq_len" + :header: "raw_chars", "target", "chars", "seq_len" - "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 - "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 "[...]", "[...]", "[...]", . raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def process_from_file(self, paths=None) -> DataBundle: @@ -327,14 +371,26 @@ class PeopleDailyPipe(_CNNERPipe): 处理people daily的ner的数据,处理之后的DataSet的field情况为 .. csv-table:: - :header: "raw_chars", "chars", "target", "seq_len" + :header: "raw_chars", "target", "chars", "seq_len" - "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 - "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 + "[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 "[...]", "[...]", "[...]", . raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def process_from_file(self, paths=None) -> DataBundle: @@ -349,13 +405,24 @@ class WeiboNERPipe(_CNNERPipe): .. csv-table:: :header: "raw_chars", "chars", "target", "seq_len" - "[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 - "[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 + "['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3 + "['心']", "[0]", "[41]", 1 "[...]", "[...]", "[...]", . raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ def process_from_file(self, paths=None) -> DataBundle: diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py index c1b218a5..0cf6c996 100644 --- a/fastNLP/io/pipe/coreference.py +++ b/fastNLP/io/pipe/coreference.py @@ -18,9 +18,29 @@ from ...core.const import Const class CoReferencePipe(Pipe): """ 对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 + + 处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: + + .. csv-table:: + :header: "words1", "words2","words3","words4","chars","seq_len","target" + + "bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" + "[...]", "[...]","[...]","[...]","[...]","[...]","[...]" + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+--------+-------+---------+ + | field_names | raw_chars | target | chars | seq_len | + +-------------+-----------+--------+-------+---------+ + | is_input | False | True | True | True | + | is_target | False | True | False | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+--------+-------+---------+ + """ - def __init__(self,config): + def __init__(self, config): super().__init__() self.config = config @@ -35,14 +55,6 @@ class CoReferencePipe(Pipe): "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" "[...]", "[...]","[...]","[...]" - 处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: - - .. csv-table:: - :header: "words1", "words2","words3","words4","chars","seq_len","target" - - "bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" - "[...]", "[...]","[...]","[...]","[...]","[...]","[...]" - :param data_bundle: :return: diff --git a/fastNLP/io/pipe/cws.py b/fastNLP/io/pipe/cws.py index 97bda896..a2f2e7a2 100644 --- a/fastNLP/io/pipe/cws.py +++ b/fastNLP/io/pipe/cws.py @@ -138,13 +138,22 @@ class CWSPipe(Pipe): 对CWS数据进行预处理, 处理之后的数据,具备以下的结构 .. csv-table:: - :header: "raw_words", "chars", "target", "bigrams", "trigrams", "seq_len" + :header: "raw_words", "chars", "target", "seq_len" - "共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", "[10, 4, 1,...]","[6, 4, 1,...]", 13 - "2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", "[11, 12, ...]","[3, 9, ...]", 20 - "...", "[...]","[...]", "[...]","[...]", . + "共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", 13 + "2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20 + "...", "[...]","[...]", . - 其中bigrams仅当bigrams列为True的时候存在 + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+-----------+-------+--------+---------+ + | field_names | raw_words | chars | target | seq_len | + +-------------+-----------+-------+--------+---------+ + | is_input | False | True | True | True | + | is_target | False | False | True | True | + | ignore_type | | False | False | False | + | pad_value | | 0 | 0 | 0 | + +-------------+-----------+-------+--------+---------+ """ diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index def750c0..7747dec3 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -37,16 +37,27 @@ class MatchingBertPipe(Pipe): Matching任务的Bert pipe,输出的DataSet将包含以下的field .. csv-table:: - :header: "raw_words1", "raw_words2", "words", "target", "seq_len" + :header: "raw_words1", "raw_words2", "target", "words", "seq_len" - "The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", 1, 10 - "This site includes a...", "The Government Executive...", "[11, 12, 13,...]", 0, 5 - "...", "...", "[...]", ., . + "The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", 10 + "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5 + "...", "...", ., "[...]", . words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, 如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+------------+------------+--------+-------+---------+ + | field_names | raw_words1 | raw_words2 | target | words | seq_len | + +-------------+------------+------------+--------+-------+---------+ + | is_input | False | False | False | True | True | + | is_target | False | False | True | False | False | + | ignore_type | | | False | False | False | + | pad_value | | | 0 | 0 | 0 | + +-------------+------------+------------+--------+-------+---------+ + """ def __init__(self, lower=False, tokenizer: str = 'raw'): @@ -75,6 +86,18 @@ class MatchingBertPipe(Pipe): return data_bundle def process(self, data_bundle): + """ + 输入的data_bundle中的dataset需要具有以下结构: + + .. csv-table:: + :header: "raw_words1", "raw_words2", "target" + + "Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" + "...","..." + + :param data_bundle: + :return: + """ for dataset in data_bundle.datasets.values(): if dataset.has_field(Const.TARGET): dataset.drop(lambda x: x[Const.TARGET] == '-') @@ -178,15 +201,27 @@ class MatchingPipe(Pipe): Matching任务的Pipe。输出的DataSet将包含以下的field .. csv-table:: - :header: "raw_words1", "raw_words2", "words1", "words2", "target", "seq_len1", "seq_len2" + :header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2" - "The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", "[10, 20, 6]", 1, 10, 13 - "This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7 - "...", "...", "[...]", "[...]", ., ., . + "The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", "[10, 20, 6]", 10, 13 + "This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7 + "...", "...", ., "[...]", "[...]", ., . words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target 和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 的形参名进行传参)。 + + dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: + + +-------------+------------+------------+--------+--------+--------+----------+----------+ + | field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 | + +-------------+------------+------------+--------+--------+--------+----------+----------+ + | is_input | False | False | False | True | True | True | True | + | is_target | False | False | True | False | False | False | False | + | ignore_type | | | False | False | False | False | False | + | pad_value | | | 0 | 0 | 0 | 0 | 0 | + +-------------+------------+------------+--------+--------+--------+----------+----------+ + """ def __init__(self, lower=False, tokenizer: str = 'raw'):