@@ -25,6 +25,8 @@ __all__ = [ | |||||
'SSTLoader', | 'SSTLoader', | ||||
'SST2Loader', | 'SST2Loader', | ||||
"ChnSentiCorpLoader", | "ChnSentiCorpLoader", | ||||
"THUCNewsLoader", | |||||
"WeiboSenti100kLoader", | |||||
'ConllLoader', | 'ConllLoader', | ||||
'Conll2003Loader', | 'Conll2003Loader', | ||||
@@ -45,6 +47,9 @@ __all__ = [ | |||||
"SNLILoader", | "SNLILoader", | ||||
"QNLILoader", | "QNLILoader", | ||||
"RTELoader", | "RTELoader", | ||||
"XNLILoader", | |||||
"BQCorpusLoader", | |||||
"LCQMCLoader", | |||||
"Pipe", | "Pipe", | ||||
@@ -54,6 +59,8 @@ __all__ = [ | |||||
"SST2Pipe", | "SST2Pipe", | ||||
"IMDBPipe", | "IMDBPipe", | ||||
"ChnSentiCorpPipe", | "ChnSentiCorpPipe", | ||||
"THUCNewsPipe", | |||||
"WeiboSenti100kPipe", | |||||
"Conll2003Pipe", | "Conll2003Pipe", | ||||
"Conll2003NERPipe", | "Conll2003NERPipe", | ||||
@@ -18,6 +18,8 @@ __all__ = [ | |||||
"SST2Pipe", | "SST2Pipe", | ||||
"IMDBPipe", | "IMDBPipe", | ||||
"ChnSentiCorpPipe", | "ChnSentiCorpPipe", | ||||
"THUCNewsPipe", | |||||
"WeiboSenti100kPipe", | |||||
"Conll2003NERPipe", | "Conll2003NERPipe", | ||||
"OntoNotesNERPipe", | "OntoNotesNERPipe", | ||||
@@ -42,7 +44,7 @@ __all__ = [ | |||||
"CoReferencePipe" | "CoReferencePipe" | ||||
] | ] | ||||
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe | |||||
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe | |||||
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe | from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe | ||||
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ | from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ | ||||
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | ||||
@@ -97,11 +97,22 @@ class YelpFullPipe(_CLSPipe): | |||||
处理YelpFull的数据, 处理之后DataSet中的内容如下 | 处理YelpFull的数据, 处理之后DataSet中的内容如下 | ||||
.. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field | .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 | |||||
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 | |||||
"...", ., "[...]", . | |||||
"It 's a ...", "[4, 2, 10, ...]", 0, 10 | |||||
"Offers that ...", "[20, 40, ...]", 1, 21 | |||||
"...", "[...]", ., . | |||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | False | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
@@ -193,11 +204,22 @@ class YelpPolarityPipe(_CLSPipe): | |||||
处理YelpPolarity的数据, 处理之后DataSet中的内容如下 | 处理YelpPolarity的数据, 处理之后DataSet中的内容如下 | ||||
.. csv-table:: 下面是使用YelpPolarityPipe处理后的DataSet所具备的field | .. csv-table:: 下面是使用YelpPolarityPipe处理后的DataSet所具备的field | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"It 's a ...", "[4, 2, 10, ...]", 0, 10 | |||||
"Offers that ...", "[20, 40, ...]", 1, 21 | |||||
"...", "[...]", ., . | |||||
"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 | |||||
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 | |||||
"...", ., "[...]", . | |||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | False | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
@@ -211,6 +233,19 @@ class YelpPolarityPipe(_CLSPipe): | |||||
self.lower = lower | self.lower = lower | ||||
def process(self, data_bundle): | def process(self, data_bundle): | ||||
""" | |||||
传入的DataSet应该具备如下的结构 | |||||
.. csv-table:: | |||||
:header: "raw_words", "target" | |||||
"I got 'new' tires from them and... ", "1" | |||||
"Don't waste your time. We had two...", "1" | |||||
"...", "..." | |||||
:param data_bundle: | |||||
:return: | |||||
""" | |||||
# 复制一列words | # 复制一列words | ||||
data_bundle = _add_words_field(data_bundle, lower=self.lower) | data_bundle = _add_words_field(data_bundle, lower=self.lower) | ||||
@@ -244,9 +279,20 @@ class SSTPipe(_CLSPipe): | |||||
.. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field | .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field | ||||
:header: "raw_words", "words", "target", "seq_len" | :header: "raw_words", "words", "target", "seq_len" | ||||
"It 's a ...", "[4, 2, 10, ...]", 0, 16 | |||||
"Offers that ...", "[20, 40, ...]", 1, 18 | |||||
"...", "[...]", ., . | |||||
"It 's a lovely film with lovely perfor...", 1, "[187, 6, 5, 132, 120, 70, 132, 188, 25...", 13 | |||||
"No one goes unindicted here , which is...", 0, "[191, 126, 192, 193, 194, 4, 195, 17, ...", 13 | |||||
"...", ., "[...]", . | |||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | False | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
@@ -278,11 +324,11 @@ class SSTPipe(_CLSPipe): | |||||
""" | """ | ||||
对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 | 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 | ||||
.. csv-table:: | |||||
.. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field | |||||
:header: "raw_words" | :header: "raw_words" | ||||
"(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..." | |||||
"(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." | |||||
"(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..." | |||||
"(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..." | |||||
"..." | "..." | ||||
:param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 | :param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 | ||||
@@ -335,12 +381,23 @@ class SST2Pipe(_CLSPipe): | |||||
加载SST2的数据, 处理完成之后DataSet将拥有以下的field | 加载SST2的数据, 处理完成之后DataSet将拥有以下的field | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"it 's a charming and... ", "[3, 4, 5, 6, 7,...]", 1, 43 | |||||
"unflinchingly bleak and...", "[10, 11, 7,...]", 1, 21 | |||||
"it 's a charming and often affecting j... ", 1, "[19, 9, 6, 111, 5, 112, 113, 114, 3]", 9 | |||||
"unflinchingly bleak and desperate", 0, "[115, 116, 5, 117]", 4 | |||||
"...", "...", ., . | "...", "...", ., . | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | False | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self, lower=False, tokenizer='spacy'): | def __init__(self, lower=False, tokenizer='spacy'): | ||||
@@ -357,11 +414,11 @@ class SST2Pipe(_CLSPipe): | |||||
可以处理的DataSet应该具备如下的结构 | 可以处理的DataSet应该具备如下的结构 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words", "target" | |||||
:header: "raw_words", "target" | |||||
"it 's a charming and... ", 1 | |||||
"unflinchingly bleak and...", 1 | |||||
"...", "..." | |||||
"it 's a charming and often affecting...", "1" | |||||
"unflinchingly bleak and...", "0" | |||||
"..." | |||||
:param data_bundle: | :param data_bundle: | ||||
:return: | :return: | ||||
@@ -420,15 +477,26 @@ class IMDBPipe(_CLSPipe): | |||||
经过本Pipe处理后DataSet将如下 | 经过本Pipe处理后DataSet将如下 | ||||
.. csv-table:: 输出DataSet的field | .. csv-table:: 输出DataSet的field | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"Bromwell High is a cartoon ... ", "[3, 5, 6, 9, ...]", 0, 20 | |||||
"Story of a man who has ...", "[20, 43, 9, 10, ...]", 1, 31 | |||||
"...", "[...]", ., . | |||||
"Bromwell High is a cartoon ... ", 0, "[3, 5, 6, 9, ...]", 20 | |||||
"Story of a man who has ...", 1, "[20, 43, 9, 10, ...]", 31 | |||||
"...", ., "[...]", . | |||||
其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; | 其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; | ||||
words列被设置为input; target列被设置为target。 | words列被设置为input; target列被设置为target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | False | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): | def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): | ||||
@@ -493,13 +561,23 @@ class ChnSentiCorpPipe(Pipe): | |||||
处理之后的DataSet有以下的结构 | 处理之后的DataSet有以下的结构 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | |||||
:header: "raw_chars", "target", "chars", "seq_len" | |||||
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "[2, 3, 4, 5, ...]", 1, 31 | |||||
"<荐书> 推荐所有喜欢<红楼>...", "[10, 21, ....]", 1, 25 | |||||
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", 1, "[2, 3, 4, 5, ...]", 31 | |||||
"<荐书> 推荐所有喜欢<红楼>...", 1, "[10, 21, ....]", 25 | |||||
"..." | "..." | ||||
其中chars, seq_len是input,target是target | 其中chars, seq_len是input,target是target | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self, bigrams=False, trigrams=False): | def __init__(self, bigrams=False, trigrams=False): | ||||
@@ -590,12 +668,22 @@ class THUCNewsPipe(_CLSPipe): | |||||
处理之后的DataSet有以下的结构 | 处理之后的DataSet有以下的结构 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | |||||
:header: "raw_chars", "target", "chars", "seq_len" | |||||
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", "[409, 1197, 2146, 213, ...]", 0, 746 | |||||
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", 0, "[409, 1197, 2146, 213, ...]", 746 | |||||
"..." | "..." | ||||
其中chars, seq_len是input,target是target | 其中chars, seq_len是input,target是target | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | ||||
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | ||||
@@ -691,12 +779,22 @@ class WeiboSenti100kPipe(_CLSPipe): | |||||
处理之后的DataSet有以下的结构 | 处理之后的DataSet有以下的结构 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | |||||
:header: "raw_chars", "target", "chars", "seq_len" | |||||
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "[0, 690, 18, ...]", 0, 56 | |||||
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", 0, "[0, 690, 18, ...]", 56 | |||||
"..." | "..." | ||||
其中chars, seq_len是input,target是target | 其中chars, seq_len是input,target是target | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | False | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | ||||
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | ||||
@@ -87,15 +87,26 @@ class Conll2003NERPipe(_NERPipe): | |||||
经过该Pipe过后,DataSet中的内容如下所示 | 经过该Pipe过后,DataSet中的内容如下所示 | ||||
.. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4,...]", 6 | |||||
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6 | |||||
"[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", . | ||||
raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | ||||
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 | target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def process_from_file(self, paths) -> DataBundle: | def process_from_file(self, paths) -> DataBundle: | ||||
@@ -112,17 +123,28 @@ class Conll2003NERPipe(_NERPipe): | |||||
class Conll2003Pipe(Pipe): | class Conll2003Pipe(Pipe): | ||||
r""" | |||||
""" | |||||
经过该Pipe后,DataSet中的内容如下 | 经过该Pipe后,DataSet中的内容如下 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words" , "words", "pos", "chunk", "ner", "seq_len" | |||||
:header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len" | |||||
"[Nadim, Ladki]", "[2, 3]", "[0, 0]", "[1, 2]", "[1, 2]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", 6 | |||||
"[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6 | |||||
"[...]", "[...]", "[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", "[...]", "[...]", . | ||||
其中words, seq_len是input; pos, chunk, ner, seq_len是target | 其中words, seq_len是input; pos, chunk, ner, seq_len是target | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||||
| field_names | raw_words | pos | chunk | ner | words | seq_len | | |||||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||||
| is_input | False | False | False | False | True | True | | |||||
| is_target | False | True | True | True | False | True | | |||||
| ignore_type | | False | False | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | 0 | 0 | | |||||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): | def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): | ||||
@@ -202,15 +224,26 @@ class OntoNotesNERPipe(_NERPipe): | |||||
处理OntoNotes的NER数据,处理之后DataSet中的field情况为 | 处理OntoNotes的NER数据,处理之后DataSet中的field情况为 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words", "words", "target", "seq_len" | |||||
:header: "raw_words", "target", "words", "seq_len" | |||||
"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 | |||||
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 | |||||
"[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6 | |||||
"[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", . | ||||
raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | ||||
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 | target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_words | target | words | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def process_from_file(self, paths): | def process_from_file(self, paths): | ||||
@@ -306,15 +339,26 @@ class MsraNERPipe(_CNNERPipe): | |||||
处理MSRA-NER的数据,处理之后的DataSet的field情况为 | 处理MSRA-NER的数据,处理之后的DataSet的field情况为 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | |||||
:header: "raw_chars", "target", "chars", "seq_len" | |||||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||||
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 | |||||
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 | |||||
"[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", . | ||||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | ||||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def process_from_file(self, paths=None) -> DataBundle: | def process_from_file(self, paths=None) -> DataBundle: | ||||
@@ -327,14 +371,26 @@ class PeopleDailyPipe(_CNNERPipe): | |||||
处理people daily的ner的数据,处理之后的DataSet的field情况为 | 处理people daily的ner的数据,处理之后的DataSet的field情况为 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | |||||
:header: "raw_chars", "target", "chars", "seq_len" | |||||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||||
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 | |||||
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 | |||||
"[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", . | ||||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | ||||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def process_from_file(self, paths=None) -> DataBundle: | def process_from_file(self, paths=None) -> DataBundle: | ||||
@@ -349,13 +405,24 @@ class WeiboNERPipe(_CNNERPipe): | |||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_chars", "chars", "target", "seq_len" | :header: "raw_chars", "chars", "target", "seq_len" | ||||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||||
"['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3 | |||||
"['心']", "[0]", "[41]", 1 | |||||
"[...]", "[...]", "[...]", . | "[...]", "[...]", "[...]", . | ||||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | ||||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def process_from_file(self, paths=None) -> DataBundle: | def process_from_file(self, paths=None) -> DataBundle: | ||||
@@ -18,9 +18,29 @@ from ...core.const import Const | |||||
class CoReferencePipe(Pipe): | class CoReferencePipe(Pipe): | ||||
""" | """ | ||||
对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 | 对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 | ||||
处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: | |||||
.. csv-table:: | |||||
:header: "words1", "words2","words3","words4","chars","seq_len","target" | |||||
"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | |||||
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]" | |||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| field_names | raw_chars | target | chars | seq_len | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | True | False | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+--------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self,config): | |||||
def __init__(self, config): | |||||
super().__init__() | super().__init__() | ||||
self.config = config | self.config = config | ||||
@@ -35,14 +55,6 @@ class CoReferencePipe(Pipe): | |||||
"bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | ||||
"[...]", "[...]","[...]","[...]" | "[...]", "[...]","[...]","[...]" | ||||
处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: | |||||
.. csv-table:: | |||||
:header: "words1", "words2","words3","words4","chars","seq_len","target" | |||||
"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | |||||
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]" | |||||
:param data_bundle: | :param data_bundle: | ||||
:return: | :return: | ||||
@@ -138,13 +138,22 @@ class CWSPipe(Pipe): | |||||
对CWS数据进行预处理, 处理之后的数据,具备以下的结构 | 对CWS数据进行预处理, 处理之后的数据,具备以下的结构 | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words", "chars", "target", "bigrams", "trigrams", "seq_len" | |||||
:header: "raw_words", "chars", "target", "seq_len" | |||||
"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", "[10, 4, 1,...]","[6, 4, 1,...]", 13 | |||||
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", "[11, 12, ...]","[3, 9, ...]", 20 | |||||
"...", "[...]","[...]", "[...]","[...]", . | |||||
"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", 13 | |||||
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20 | |||||
"...", "[...]","[...]", . | |||||
其中bigrams仅当bigrams列为True的时候存在 | |||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+-----------+-------+--------+---------+ | |||||
| field_names | raw_words | chars | target | seq_len | | |||||
+-------------+-----------+-------+--------+---------+ | |||||
| is_input | False | True | True | True | | |||||
| is_target | False | False | True | True | | |||||
| ignore_type | | False | False | False | | |||||
| pad_value | | 0 | 0 | 0 | | |||||
+-------------+-----------+-------+--------+---------+ | |||||
""" | """ | ||||
@@ -37,16 +37,27 @@ class MatchingBertPipe(Pipe): | |||||
Matching任务的Bert pipe,输出的DataSet将包含以下的field | Matching任务的Bert pipe,输出的DataSet将包含以下的field | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words1", "raw_words2", "words", "target", "seq_len" | |||||
:header: "raw_words1", "raw_words2", "target", "words", "seq_len" | |||||
"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", 1, 10 | |||||
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", 0, 5 | |||||
"...", "...", "[...]", ., . | |||||
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", 10 | |||||
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5 | |||||
"...", "...", ., "[...]", . | |||||
words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 | words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 | ||||
words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, | words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, | ||||
如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). | 如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+------------+------------+--------+-------+---------+ | |||||
| field_names | raw_words1 | raw_words2 | target | words | seq_len | | |||||
+-------------+------------+------------+--------+-------+---------+ | |||||
| is_input | False | False | False | True | True | | |||||
| is_target | False | False | True | False | False | | |||||
| ignore_type | | | False | False | False | | |||||
| pad_value | | | 0 | 0 | 0 | | |||||
+-------------+------------+------------+--------+-------+---------+ | |||||
""" | """ | ||||
def __init__(self, lower=False, tokenizer: str = 'raw'): | def __init__(self, lower=False, tokenizer: str = 'raw'): | ||||
@@ -75,6 +86,18 @@ class MatchingBertPipe(Pipe): | |||||
return data_bundle | return data_bundle | ||||
def process(self, data_bundle): | def process(self, data_bundle): | ||||
""" | |||||
输入的data_bundle中的dataset需要具有以下结构: | |||||
.. csv-table:: | |||||
:header: "raw_words1", "raw_words2", "target" | |||||
"Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" | |||||
"...","..." | |||||
:param data_bundle: | |||||
:return: | |||||
""" | |||||
for dataset in data_bundle.datasets.values(): | for dataset in data_bundle.datasets.values(): | ||||
if dataset.has_field(Const.TARGET): | if dataset.has_field(Const.TARGET): | ||||
dataset.drop(lambda x: x[Const.TARGET] == '-') | dataset.drop(lambda x: x[Const.TARGET] == '-') | ||||
@@ -178,15 +201,27 @@ class MatchingPipe(Pipe): | |||||
Matching任务的Pipe。输出的DataSet将包含以下的field | Matching任务的Pipe。输出的DataSet将包含以下的field | ||||
.. csv-table:: | .. csv-table:: | ||||
:header: "raw_words1", "raw_words2", "words1", "words2", "target", "seq_len1", "seq_len2" | |||||
:header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2" | |||||
"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", "[10, 20, 6]", 1, 10, 13 | |||||
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7 | |||||
"...", "...", "[...]", "[...]", ., ., . | |||||
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", "[10, 20, 6]", 10, 13 | |||||
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7 | |||||
"...", "...", ., "[...]", "[...]", ., . | |||||
words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target | words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target | ||||
和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 | 和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 | ||||
的形参名进行传参)。 | 的形参名进行传参)。 | ||||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||||
| field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 | | |||||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||||
| is_input | False | False | False | True | True | True | True | | |||||
| is_target | False | False | True | False | False | False | False | | |||||
| ignore_type | | | False | False | False | False | False | | |||||
| pad_value | | | 0 | 0 | 0 | 0 | 0 | | |||||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||||
""" | """ | ||||
def __init__(self, lower=False, tokenizer: str = 'raw'): | def __init__(self, lower=False, tokenizer: str = 'raw'): | ||||