@@ -25,6 +25,8 @@ __all__ = [ | |||
'SSTLoader', | |||
'SST2Loader', | |||
"ChnSentiCorpLoader", | |||
"THUCNewsLoader", | |||
"WeiboSenti100kLoader", | |||
'ConllLoader', | |||
'Conll2003Loader', | |||
@@ -45,6 +47,9 @@ __all__ = [ | |||
"SNLILoader", | |||
"QNLILoader", | |||
"RTELoader", | |||
"XNLILoader", | |||
"BQCorpusLoader", | |||
"LCQMCLoader", | |||
"Pipe", | |||
@@ -54,6 +59,8 @@ __all__ = [ | |||
"SST2Pipe", | |||
"IMDBPipe", | |||
"ChnSentiCorpPipe", | |||
"THUCNewsPipe", | |||
"WeiboSenti100kPipe", | |||
"Conll2003Pipe", | |||
"Conll2003NERPipe", | |||
@@ -18,6 +18,8 @@ __all__ = [ | |||
"SST2Pipe", | |||
"IMDBPipe", | |||
"ChnSentiCorpPipe", | |||
"THUCNewsPipe", | |||
"WeiboSenti100kPipe", | |||
"Conll2003NERPipe", | |||
"OntoNotesNERPipe", | |||
@@ -42,7 +44,7 @@ __all__ = [ | |||
"CoReferencePipe" | |||
] | |||
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe | |||
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe | |||
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe | |||
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ | |||
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe | |||
@@ -97,11 +97,22 @@ class YelpFullPipe(_CLSPipe): | |||
处理YelpFull的数据, 处理之后DataSet中的内容如下 | |||
.. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 | |||
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 | |||
"...", ., "[...]", . | |||
"It 's a ...", "[4, 2, 10, ...]", 0, 10 | |||
"Offers that ...", "[20, 40, ...]", 1, 21 | |||
"...", "[...]", ., . | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | False | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
@@ -193,11 +204,22 @@ class YelpPolarityPipe(_CLSPipe): | |||
处理YelpPolarity的数据, 处理之后DataSet中的内容如下 | |||
.. csv-table:: 下面是使用YelpPolarityPipe处理后的DataSet所具备的field | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"It 's a ...", "[4, 2, 10, ...]", 0, 10 | |||
"Offers that ...", "[20, 40, ...]", 1, 21 | |||
"...", "[...]", ., . | |||
"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160 | |||
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40 | |||
"...", ., "[...]", . | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | False | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
@@ -211,6 +233,19 @@ class YelpPolarityPipe(_CLSPipe): | |||
self.lower = lower | |||
def process(self, data_bundle): | |||
""" | |||
传入的DataSet应该具备如下的结构 | |||
.. csv-table:: | |||
:header: "raw_words", "target" | |||
"I got 'new' tires from them and... ", "1" | |||
"Don't waste your time. We had two...", "1" | |||
"...", "..." | |||
:param data_bundle: | |||
:return: | |||
""" | |||
# 复制一列words | |||
data_bundle = _add_words_field(data_bundle, lower=self.lower) | |||
@@ -244,9 +279,20 @@ class SSTPipe(_CLSPipe): | |||
.. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field | |||
:header: "raw_words", "words", "target", "seq_len" | |||
"It 's a ...", "[4, 2, 10, ...]", 0, 16 | |||
"Offers that ...", "[20, 40, ...]", 1, 18 | |||
"...", "[...]", ., . | |||
"It 's a lovely film with lovely perfor...", 1, "[187, 6, 5, 132, 120, 70, 132, 188, 25...", 13 | |||
"No one goes unindicted here , which is...", 0, "[191, 126, 192, 193, 194, 4, 195, 17, ...", 13 | |||
"...", ., "[...]", . | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | False | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
@@ -278,11 +324,11 @@ class SSTPipe(_CLSPipe): | |||
""" | |||
对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似于 | |||
.. csv-table:: | |||
.. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field | |||
:header: "raw_words" | |||
"(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..." | |||
"(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..." | |||
"(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..." | |||
"(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..." | |||
"..." | |||
:param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 | |||
@@ -335,12 +381,23 @@ class SST2Pipe(_CLSPipe): | |||
加载SST2的数据, 处理完成之后DataSet将拥有以下的field | |||
.. csv-table:: | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"it 's a charming and... ", "[3, 4, 5, 6, 7,...]", 1, 43 | |||
"unflinchingly bleak and...", "[10, 11, 7,...]", 1, 21 | |||
"it 's a charming and often affecting j... ", 1, "[19, 9, 6, 111, 5, 112, 113, 114, 3]", 9 | |||
"unflinchingly bleak and desperate", 0, "[115, 116, 5, 117]", 4 | |||
"...", ., "[...]", . | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | False | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def __init__(self, lower=False, tokenizer='spacy'): | |||
@@ -357,11 +414,11 @@ class SST2Pipe(_CLSPipe): | |||
可以处理的DataSet应该具备如下的结构 | |||
.. csv-table:: | |||
:header: "raw_words", "target" | |||
:header: "raw_words", "target" | |||
"it 's a charming and... ", 1 | |||
"unflinchingly bleak and...", 1 | |||
"...", "..." | |||
"it 's a charming and often affecting...", "1" | |||
"unflinchingly bleak and...", "0" | |||
"...", "..." | |||
:param data_bundle: | |||
:return: | |||
@@ -420,15 +477,26 @@ class IMDBPipe(_CLSPipe): | |||
经过本Pipe处理后DataSet将如下 | |||
.. csv-table:: 输出DataSet的field | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"Bromwell High is a cartoon ... ", "[3, 5, 6, 9, ...]", 0, 20 | |||
"Story of a man who has ...", "[20, 43, 9, 10, ...]", 1, 31 | |||
"...", "[...]", ., . | |||
"Bromwell High is a cartoon ... ", 0, "[3, 5, 6, 9, ...]", 20 | |||
"Story of a man who has ...", 1, "[20, 43, 9, 10, ...]", 31 | |||
"...", ., "[...]", . | |||
其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; | |||
words列被设置为input; target列被设置为target。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | False | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): | |||
@@ -493,13 +561,23 @@ class ChnSentiCorpPipe(Pipe): | |||
处理之后的DataSet有以下的结构 | |||
.. csv-table:: | |||
:header: "raw_chars", "chars", "target", "seq_len" | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "[2, 3, 4, 5, ...]", 1, 31 | |||
"<荐书> 推荐所有喜欢<红楼>...", "[10, 21, ....]", 1, 25 | |||
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", 1, "[2, 3, 4, 5, ...]", 31 | |||
"<荐书> 推荐所有喜欢<红楼>...", 1, "[10, 21, ....]", 25 | |||
"..." | |||
其中chars, seq_len是input,target是target | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def __init__(self, bigrams=False, trigrams=False): | |||
@@ -590,12 +668,22 @@ class THUCNewsPipe(_CLSPipe): | |||
处理之后的DataSet有以下的结构 | |||
.. csv-table:: | |||
:header: "raw_chars", "chars", "target", "seq_len" | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", "[409, 1197, 2146, 213, ...]", 0, 746 | |||
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", 0, "[409, 1197, 2146, 213, ...]", 746 | |||
"..." | |||
其中chars, seq_len是input,target是target | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | |||
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | |||
@@ -691,12 +779,22 @@ class WeiboSenti100kPipe(_CLSPipe): | |||
处理之后的DataSet有以下的结构 | |||
.. csv-table:: | |||
:header: "raw_chars", "chars", "target", "seq_len" | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "[0, 690, 18, ...]", 0, 56 | |||
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", 0, "[0, 690, 18, ...]", 56 | |||
"..." | |||
其中chars, seq_len是input,target是target | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | False | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 | |||
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 | |||
@@ -87,15 +87,26 @@ class Conll2003NERPipe(_NERPipe): | |||
经过该Pipe过后,DataSet中的内容如下所示 | |||
.. csv-table:: Following is a demo layout of DataSet returned by Conll2003NERPipe | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4,...]", 6 | |||
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6 | |||
"[...]", "[...]", "[...]", . | |||
raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def process_from_file(self, paths) -> DataBundle: | |||
@@ -112,17 +123,28 @@ class Conll2003NERPipe(_NERPipe): | |||
class Conll2003Pipe(Pipe): | |||
r""" | |||
""" | |||
经过该Pipe后,DataSet中的内容如下 | |||
.. csv-table:: | |||
:header: "raw_words" , "words", "pos", "chunk", "ner", "seq_len" | |||
:header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len" | |||
"[Nadim, Ladki]", "[2, 3]", "[0, 0]", "[1, 2]", "[1, 2]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", 6 | |||
"[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6 | |||
"[...]", "[...]", "[...]", "[...]", "[...]", . | |||
其中words, seq_len是input; pos, chunk, ner, seq_len是target | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||
| field_names | raw_words | pos | chunk | ner | words | seq_len | | |||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||
| is_input | False | False | False | False | True | True | | |||
| is_target | False | True | True | True | False | True | | |||
| ignore_type | | False | False | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | 0 | 0 | | |||
+-------------+-----------+-------+-------+-------+-------+---------+ | |||
""" | |||
def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): | |||
@@ -202,15 +224,26 @@ class OntoNotesNERPipe(_NERPipe): | |||
处理OntoNotes的NER数据,处理之后DataSet中的field情况为 | |||
.. csv-table:: | |||
:header: "raw_words", "words", "target", "seq_len" | |||
:header: "raw_words", "target", "words", "seq_len" | |||
"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6 | |||
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2 | |||
"[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6 | |||
"[...]", "[...]", "[...]", . | |||
raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target, seq_len。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_words | target | words | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def process_from_file(self, paths): | |||
@@ -306,15 +339,26 @@ class MsraNERPipe(_CNNERPipe): | |||
处理MSRA-NER的数据,处理之后的DataSet的field情况为 | |||
.. csv-table:: | |||
:header: "raw_chars", "chars", "target", "seq_len" | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 | |||
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 | |||
"[...]", "[...]", "[...]", . | |||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def process_from_file(self, paths=None) -> DataBundle: | |||
@@ -327,14 +371,26 @@ class PeopleDailyPipe(_CNNERPipe): | |||
处理people daily的ner的数据,处理之后的DataSet的field情况为 | |||
.. csv-table:: | |||
:header: "raw_chars", "chars", "target", "seq_len" | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11 | |||
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21 | |||
"[...]", "[...]", "[...]", . | |||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def process_from_file(self, paths=None) -> DataBundle: | |||
@@ -349,13 +405,24 @@ class WeiboNERPipe(_CNNERPipe): | |||
.. csv-table:: | |||
:header: "raw_chars", "target", "chars", "seq_len" | |||
"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11 | |||
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21 | |||
"['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3 | |||
"['心']", "[0]", "[41]", 1 | |||
"[...]", "[...]", "[...]", . | |||
raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 | |||
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target, seq_len。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def process_from_file(self, paths=None) -> DataBundle: | |||
@@ -18,9 +18,29 @@ from ...core.const import Const | |||
class CoReferencePipe(Pipe): | |||
""" | |||
对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 | |||
处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: | |||
.. csv-table:: | |||
:header: "words1", "words2","words3","words4","chars","seq_len","target" | |||
"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | |||
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]" | |||
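NOTE(review): the field_names in the table below (raw_chars, target, chars, seq_len) do not match the csv-table header above (words1, words2, words3, words4, chars, seq_len, target); the table looks copied from an NER pipe -- TODO confirm against the real print_field_meta() output. | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||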
+-------------+-----------+--------+-------+---------+ | |||
| field_names | raw_chars | target | chars | seq_len | | |||
+-------------+-----------+--------+-------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | True | False | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+--------+-------+---------+ | |||
""" | |||
def __init__(self,config): | |||
def __init__(self, config): | |||
super().__init__() | |||
self.config = config | |||
@@ -35,14 +55,6 @@ class CoReferencePipe(Pipe): | |||
"bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | |||
"[...]", "[...]","[...]","[...]" | |||
处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target: | |||
.. csv-table:: | |||
:header: "words1", "words2","words3","words4","chars","seq_len","target" | |||
"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]" | |||
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]" | |||
:param data_bundle: | |||
:return: | |||
@@ -138,13 +138,22 @@ class CWSPipe(Pipe): | |||
对CWS数据进行预处理, 处理之后的数据,具备以下的结构 | |||
.. csv-table:: | |||
:header: "raw_words", "chars", "target", "bigrams", "trigrams", "seq_len" | |||
:header: "raw_words", "chars", "target", "seq_len" | |||
"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", "[10, 4, 1,...]","[6, 4, 1,...]", 13 | |||
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", "[11, 12, ...]","[3, 9, ...]", 20 | |||
"...", "[...]","[...]", "[...]","[...]", . | |||
"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", 13 | |||
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20 | |||
"...", "[...]","[...]", . | |||
其中bigrams列仅当bigrams设置为True的时候存在 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+-----------+-------+--------+---------+ | |||
| field_names | raw_words | chars | target | seq_len | | |||
+-------------+-----------+-------+--------+---------+ | |||
| is_input | False | True | True | True | | |||
| is_target | False | False | True | True | | |||
| ignore_type | | False | False | False | | |||
| pad_value | | 0 | 0 | 0 | | |||
+-------------+-----------+-------+--------+---------+ | |||
""" | |||
@@ -37,16 +37,27 @@ class MatchingBertPipe(Pipe): | |||
Matching任务的Bert pipe,输出的DataSet将包含以下的field | |||
.. csv-table:: | |||
:header: "raw_words1", "raw_words2", "words", "target", "seq_len" | |||
:header: "raw_words1", "raw_words2", "target", "words", "seq_len" | |||
"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", 1, 10 | |||
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", 0, 5 | |||
"...", "...", "[...]", ., . | |||
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", 10 | |||
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5 | |||
"...", "...", ., "[...]", . | |||
words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 | |||
words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, | |||
如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+------------+------------+--------+-------+---------+ | |||
| field_names | raw_words1 | raw_words2 | target | words | seq_len | | |||
+-------------+------------+------------+--------+-------+---------+ | |||
| is_input | False | False | False | True | True | | |||
| is_target | False | False | True | False | False | | |||
| ignore_type | | | False | False | False | | |||
| pad_value | | | 0 | 0 | 0 | | |||
+-------------+------------+------------+--------+-------+---------+ | |||
""" | |||
def __init__(self, lower=False, tokenizer: str = 'raw'): | |||
@@ -75,6 +86,18 @@ class MatchingBertPipe(Pipe): | |||
return data_bundle | |||
def process(self, data_bundle): | |||
""" | |||
输入的data_bundle中的dataset需要具有以下结构: | |||
.. csv-table:: | |||
:header: "raw_words1", "raw_words2", "target" | |||
"Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment" | |||
"...", "...", "..." | |||
:param data_bundle: | |||
:return: | |||
""" | |||
for dataset in data_bundle.datasets.values(): | |||
if dataset.has_field(Const.TARGET): | |||
dataset.drop(lambda x: x[Const.TARGET] == '-') | |||
@@ -178,15 +201,27 @@ class MatchingPipe(Pipe): | |||
Matching任务的Pipe。输出的DataSet将包含以下的field | |||
.. csv-table:: | |||
:header: "raw_words1", "raw_words2", "words1", "words2", "target", "seq_len1", "seq_len2" | |||
:header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2" | |||
"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", "[10, 20, 6]", 1, 10, 13 | |||
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7 | |||
"...", "...", "[...]", "[...]", ., ., . | |||
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", "[10, 20, 6]", 10, 13 | |||
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7 | |||
"...", "...", ., "[...]", "[...]", ., . | |||
words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target | |||
和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 | |||
的形参名进行传参)。 | |||
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为:: | |||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||
| field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 | | |||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||
| is_input | False | False | False | True | True | True | True | | |||
| is_target | False | False | True | False | False | False | False | | |||
| ignore_type | | | False | False | False | False | False | | |||
| pad_value | | | 0 | 0 | 0 | 0 | 0 | | |||
+-------------+------------+------------+--------+--------+--------+----------+----------+ | |||
""" | |||
def __init__(self, lower=False, tokenizer: str = 'raw'): | |||