Browse Source

Merge pull request #230 from zide05/dev0.5.0

modify pipe documents
tags/v0.4.10
Yige Xu GitHub 5 years ago
parent
commit
8dc81c4205
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 303 additions and 73 deletions
  1. +7
    -0
      fastNLP/io/__init__.py
  2. +3
    -1
      fastNLP/io/pipe/__init__.py
  3. +130
    -32
      fastNLP/io/pipe/classification.py
  4. +85
    -18
      fastNLP/io/pipe/conll.py
  5. +21
    -9
      fastNLP/io/pipe/coreference.py
  6. +14
    -5
      fastNLP/io/pipe/cws.py
  7. +43
    -8
      fastNLP/io/pipe/matching.py

+ 7
- 0
fastNLP/io/__init__.py View File

@@ -25,6 +25,8 @@ __all__ = [
'SSTLoader', 'SSTLoader',
'SST2Loader', 'SST2Loader',
"ChnSentiCorpLoader", "ChnSentiCorpLoader",
"THUCNewsLoader",
"WeiboSenti100kLoader",


'ConllLoader', 'ConllLoader',
'Conll2003Loader', 'Conll2003Loader',
@@ -45,6 +47,9 @@ __all__ = [
"SNLILoader", "SNLILoader",
"QNLILoader", "QNLILoader",
"RTELoader", "RTELoader",
"XNLILoader",
"BQCorpusLoader",
"LCQMCLoader",


"Pipe", "Pipe",


@@ -54,6 +59,8 @@ __all__ = [
"SST2Pipe", "SST2Pipe",
"IMDBPipe", "IMDBPipe",
"ChnSentiCorpPipe", "ChnSentiCorpPipe",
"THUCNewsPipe",
"WeiboSenti100kPipe",


"Conll2003Pipe", "Conll2003Pipe",
"Conll2003NERPipe", "Conll2003NERPipe",


+ 3
- 1
fastNLP/io/pipe/__init__.py View File

@@ -18,6 +18,8 @@ __all__ = [
"SST2Pipe", "SST2Pipe",
"IMDBPipe", "IMDBPipe",
"ChnSentiCorpPipe", "ChnSentiCorpPipe",
"THUCNewsPipe",
"WeiboSenti100kPipe",


"Conll2003NERPipe", "Conll2003NERPipe",
"OntoNotesNERPipe", "OntoNotesNERPipe",
@@ -42,7 +44,7 @@ __all__ = [
"CoReferencePipe" "CoReferencePipe"
] ]


from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe
from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe, ChnSentiCorpPipe, THUCNewsPipe, WeiboSenti100kPipe
from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe from .conll import Conll2003NERPipe, OntoNotesNERPipe, MsraNERPipe, WeiboNERPipe, PeopleDailyPipe
from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe, QNLIBertPipe, MNLIBertPipe, \
MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe MatchingPipe, RTEPipe, SNLIPipe, QuoraPipe, QNLIPipe, MNLIPipe


+ 130
- 32
fastNLP/io/pipe/classification.py View File

@@ -97,11 +97,22 @@ class YelpFullPipe(_CLSPipe):
处理YelpFull的数据, 处理之后DataSet中的内容如下 处理YelpFull的数据, 处理之后DataSet中的内容如下


.. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field .. csv-table:: 下面是使用YelpFullPipe处理后的DataSet所具备的field
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"

"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40
"...", ., "[...]", .


"It 's a ...", "[4, 2, 10, ...]", 0, 10
"Offers that ...", "[20, 40, ...]", 1, 21
"...", "[...]", ., .
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


""" """
@@ -193,11 +204,22 @@ class YelpPolarityPipe(_CLSPipe):
处理YelpPolarity的数据, 处理之后DataSet中的内容如下 处理YelpPolarity的数据, 处理之后DataSet中的内容如下


.. csv-table:: 下面是使用YelpPolarityPipe处理后的DataSet所具备的field .. csv-table:: 下面是使用YelpPolarityPipe处理后的DataSet所具备的field
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"


"It 's a ...", "[4, 2, 10, ...]", 0, 10
"Offers that ...", "[20, 40, ...]", 1, 21
"...", "[...]", ., .
"I got 'new' tires from them and within...", 0 ,"[7, 110, 22, 107, 22, 499, 59, 140, 3,...]", 160
" Don't waste your time. We had two dif... ", 0, "[277, 17, 278, 38, 30, 112, 24, 85, 27...", 40
"...", ., "[...]", .

dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


""" """
@@ -211,6 +233,19 @@ class YelpPolarityPipe(_CLSPipe):
self.lower = lower self.lower = lower
def process(self, data_bundle): def process(self, data_bundle):
"""
传入的DataSet应该具备如下的结构

.. csv-table::
:header: "raw_words", "target"

"I got 'new' tires from them and... ", "1"
"Don't waste your time. We had two...", "1"
"...", "..."

:param data_bundle:
:return:
"""
# 复制一列words # 复制一列words
data_bundle = _add_words_field(data_bundle, lower=self.lower) data_bundle = _add_words_field(data_bundle, lower=self.lower)
@@ -244,9 +279,20 @@ class SSTPipe(_CLSPipe):
.. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field .. csv-table:: 下面是使用SSTPipe处理后的DataSet所具备的field
:header: "raw_words", "words", "target", "seq_len" :header: "raw_words", "words", "target", "seq_len"


"It 's a ...", "[4, 2, 10, ...]", 0, 16
"Offers that ...", "[20, 40, ...]", 1, 18
"...", "[...]", ., .
"It 's a lovely film with lovely perfor...", 1, "[187, 6, 5, 132, 120, 70, 132, 188, 25...", 13
"No one goes unindicted here , which is...", 0, "[191, 126, 192, 193, 194, 4, 195, 17, ...", 13
"...", ., "[...]", .

dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


""" """
@@ -278,11 +324,11 @@ class SSTPipe(_CLSPipe):
""" """
对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与 对DataBundle中的数据进行预处理。输入的DataSet应该至少拥有raw_words这一列,且内容类似与


.. csv-table::
.. csv-table:: 下面是使用SSTLoader读取的DataSet所具备的field
:header: "raw_words" :header: "raw_words"


"(3 (2 It) (4 (4 (2 's) (4 (3 (2 a)..."
"(4 (4 (2 Offers) (3 (3 (2 that) (3 (3 rare)..."
"(2 (3 (3 Effective) (2 but)) (1 (1 too-tepid)..."
"(3 (3 (2 If) (3 (2 you) (3 (2 sometimes) ..."
"..." "..."


:param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象 :param ~fastNLP.io.DataBundle data_bundle: 需要处理的DataBundle对象
@@ -335,12 +381,23 @@ class SST2Pipe(_CLSPipe):
加载SST2的数据, 处理完成之后DataSet将拥有以下的field 加载SST2的数据, 处理完成之后DataSet将拥有以下的field


.. csv-table:: .. csv-table::
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"


"it 's a charming and... ", "[3, 4, 5, 6, 7,...]", 1, 43
"unflinchingly bleak and...", "[10, 11, 7,...]", 1, 21
"it 's a charming and often affecting j... ", 1, "[19, 9, 6, 111, 5, 112, 113, 114, 3]", 9
"unflinchingly bleak and desperate", 0, "[115, 116, 5, 117]", 4
"...", "...", ., . "...", "...", ., .


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def __init__(self, lower=False, tokenizer='spacy'): def __init__(self, lower=False, tokenizer='spacy'):
@@ -357,11 +414,11 @@ class SST2Pipe(_CLSPipe):
可以处理的DataSet应该具备如下的结构 可以处理的DataSet应该具备如下的结构


.. csv-table:: .. csv-table::
:header: "raw_words", "target"
:header: "raw_words", "target"


"it 's a charming and... ", 1
"unflinchingly bleak and...", 1
"...", "..."
"it 's a charming and often affecting...", "1"
"unflinchingly bleak and...", "0"
"..."


:param data_bundle: :param data_bundle:
:return: :return:
@@ -420,15 +477,26 @@ class IMDBPipe(_CLSPipe):
经过本Pipe处理后DataSet将如下 经过本Pipe处理后DataSet将如下


.. csv-table:: 输出DataSet的field .. csv-table:: 输出DataSet的field
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"


"Bromwell High is a cartoon ... ", "[3, 5, 6, 9, ...]", 0, 20
"Story of a man who has ...", "[20, 43, 9, 10, ...]", 1, 31
"...", "[...]", ., .
"Bromwell High is a cartoon ... ", 0, "[3, 5, 6, 9, ...]", 20
"Story of a man who has ...", 1, "[20, 43, 9, 10, ...]", 31
"...", ., "[...]", .


其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值; 其中raw_words为str类型,是原文; words是转换为index的输入; target是转换为index的目标值;
words列被设置为input; target列被设置为target。 words列被设置为input; target列被设置为target。


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | False | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def __init__(self, lower: bool = False, tokenizer: str = 'spacy'): def __init__(self, lower: bool = False, tokenizer: str = 'spacy'):
@@ -493,13 +561,23 @@ class ChnSentiCorpPipe(Pipe):
处理之后的DataSet有以下的结构 处理之后的DataSet有以下的结构


.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len"
:header: "raw_chars", "target", "chars", "seq_len"


"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", "[2, 3, 4, 5, ...]", 1, 31
"<荐书> 推荐所有喜欢<红楼>...", "[10, 21, ....]", 1, 25
"這間酒店環境和服務態度亦算不錯,但房間空間太小~~", 1, "[2, 3, 4, 5, ...]", 31
"<荐书> 推荐所有喜欢<红楼>...", 1, "[10, 21, ....]", 25
"..." "..."


其中chars, seq_len是input,target是target 其中chars, seq_len是input,target是target
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


""" """
def __init__(self, bigrams=False, trigrams=False): def __init__(self, bigrams=False, trigrams=False):
@@ -590,12 +668,22 @@ class THUCNewsPipe(_CLSPipe):
处理之后的DataSet有以下的结构 处理之后的DataSet有以下的结构


.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len"
:header: "raw_chars", "target", "chars", "seq_len"


"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", "[409, 1197, 2146, 213, ...]", 0, 746
"马晓旭意外受伤让国奥警惕 无奈大雨格外青睐殷家军记者傅亚雨沈阳报道...", 0, "[409, 1197, 2146, 213, ...]", 746
"..." "..."


其中chars, seq_len是input,target是target 其中chars, seq_len是input,target是target
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过
@@ -691,12 +779,22 @@ class WeiboSenti100kPipe(_CLSPipe):
处理之后的DataSet有以下的结构 处理之后的DataSet有以下的结构


.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len"
:header: "raw_chars", "target", "chars", "seq_len"


"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", "[0, 690, 18, ...]", 0, 56
"六一出生的?好讽刺…… //@祭春姬:他爸爸是外星人吧 //@面孔小高:现在的孩子都怎么了 [怒][怒][怒]", 0, "[0, 690, 18, ...]", 56
"..." "..."


其中chars, seq_len是input,target是target 其中chars, seq_len是input,target是target
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | False |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+


:param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果 :param bool bigrams: 是否增加一列bigrams. bigrams的构成是['复', '旦', '大', '学', ...]->["复旦", "旦大", ...]。如果
设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过 设置为True,返回的DataSet将有一列名为bigrams, 且已经转换为了index并设置为input,对应的vocab可以通过


+ 85
- 18
fastNLP/io/pipe/conll.py View File

@@ -87,15 +87,26 @@ class Conll2003NERPipe(_NERPipe):
经过该Pipe过后,DataSet中的内容如下所示 经过该Pipe过后,DataSet中的内容如下所示


.. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader .. csv-table:: Following is a demo layout of DataSet returned by Conll2003Loader
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"


"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4,...]", 6
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
"[AL-AIN, United, Arab, ...]", "[3, 4,...]", "[4, 5, 6,...]", 6
"[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", .


raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def process_from_file(self, paths) -> DataBundle: def process_from_file(self, paths) -> DataBundle:
@@ -112,17 +123,28 @@ class Conll2003NERPipe(_NERPipe):




class Conll2003Pipe(Pipe): class Conll2003Pipe(Pipe):
r"""
"""
经过该Pipe后,DataSet中的内容如下 经过该Pipe后,DataSet中的内容如下


.. csv-table:: .. csv-table::
:header: "raw_words" , "words", "pos", "chunk", "ner", "seq_len"
:header: "raw_words" , "pos", "chunk", "ner", "words", "seq_len"


"[Nadim, Ladki]", "[2, 3]", "[0, 0]", "[1, 2]", "[1, 2]", 2
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", 6
"[Nadim, Ladki]", "[0, 0]", "[1, 2]", "[1, 2]", "[2, 3]", 2
"[AL-AIN, United, Arab, ...]", "[1, 2...]", "[3, 4...]", "[3, 4...]", "[4, 5, 6,...]", 6
"[...]", "[...]", "[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", "[...]", "[...]", .


其中words, seq_len是input; pos, chunk, ner, seq_len是target 其中words, seq_len是input; pos, chunk, ner, seq_len是target
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+-------+-------+-------+-------+---------+
| field_names | raw_words | pos | chunk | ner | words | seq_len |
+-------------+-----------+-------+-------+-------+-------+---------+
| is_input | False | False | False | False | True | True |
| is_target | False | True | True | True | False | True |
| ignore_type | | False | False | False | False | False |
| pad_value | | 0 | 0 | 0 | 0 | 0 |
+-------------+-----------+-------+-------+-------+-------+---------+



""" """
def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False): def __init__(self, chunk_encoding_type='bioes', ner_encoding_type='bioes', lower: bool = False):
@@ -202,15 +224,26 @@ class OntoNotesNERPipe(_NERPipe):
处理OntoNotes的NER数据,处理之后DataSet中的field情况为 处理OntoNotes的NER数据,处理之后DataSet中的field情况为


.. csv-table:: .. csv-table::
:header: "raw_words", "words", "target", "seq_len"
:header: "raw_words", "target", "words", "seq_len"


"[Nadim, Ladki]", "[2, 3]", "[1, 2]", 2
"[AL-AIN, United, Arab, ...]", "[4, 5, 6,...]", "[3, 4]", 6
"[Nadim, Ladki]", "[1, 2]", "[2, 3]", 2
"[AL-AIN, United, Arab, ...]", "[3, 4]", "[4, 5, 6,...]", 6
"[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", .


raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 raw_words列为List[str], 是未转换的原始数据; words列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。 target。返回的DataSet中被设置为input有words, target, seq_len; 设置为target有target。


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_words | target | words | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def process_from_file(self, paths): def process_from_file(self, paths):
@@ -306,15 +339,26 @@ class MsraNERPipe(_CNNERPipe):
处理MSRA-NER的数据,处理之后的DataSet的field情况为 处理MSRA-NER的数据,处理之后的DataSet的field情况为


.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len"
:header: "raw_chars", "target", "chars", "seq_len"


"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
"[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", .


raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def process_from_file(self, paths=None) -> DataBundle: def process_from_file(self, paths=None) -> DataBundle:
@@ -327,14 +371,26 @@ class PeopleDailyPipe(_CNNERPipe):
处理people daily的ner的数据,处理之后的DataSet的field情况为 处理people daily的ner的数据,处理之后的DataSet的field情况为


.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len"
:header: "raw_chars", "target", "chars", "seq_len"


"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
"[相, 比, 之, 下,...]", "[0, 0, 0, 0, ...]", "[2, 3, 4, 5, ...]", 11
"[青, 岛, 海, 牛, 队, 和, ...]", "[1, 2, 3, ...]", "[10, 21, ....]", 21
"[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", .


raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。

dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def process_from_file(self, paths=None) -> DataBundle: def process_from_file(self, paths=None) -> DataBundle:
@@ -349,13 +405,24 @@ class WeiboNERPipe(_CNNERPipe):
.. csv-table:: .. csv-table::
:header: "raw_chars", "chars", "target", "seq_len" :header: "raw_chars", "chars", "target", "seq_len"


"[相, 比, 之, 下,...]", "[2, 3, 4, 5, ...]", "[0, 0, 0, 0, ...]", 11
"[青, 岛, 海, 牛, 队, 和, ...]", "[10, 21, ....]", "[1, 2, 3, ...]", 21
"['老', '百', '姓']", "[4, 3, 3]", "[38, 39, 40]", 3
"['心']", "[0]", "[41]", 1
"[...]", "[...]", "[...]", . "[...]", "[...]", "[...]", .


raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的 raw_chars列为List[str], 是未转换的原始数据; chars列为List[int],是转换为index的输入数据; target列是List[int],是转换为index的
target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。 target。返回的DataSet中被设置为input有chars, target, seq_len; 设置为target有target。


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """
def process_from_file(self, paths=None) -> DataBundle: def process_from_file(self, paths=None) -> DataBundle:


+ 21
- 9
fastNLP/io/pipe/coreference.py View File

@@ -18,9 +18,29 @@ from ...core.const import Const
class CoReferencePipe(Pipe): class CoReferencePipe(Pipe):
""" """
对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。 对Coreference resolution问题进行处理,得到文章种类/说话者/字符级信息/序列长度。

处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target:

.. csv-table::
:header: "words1", "words2","words3","words4","chars","seq_len","target"

"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]"

dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+--------+-------+---------+
| field_names | raw_chars | target | chars | seq_len |
+-------------+-----------+--------+-------+---------+
| is_input | False | True | True | True |
| is_target | False | True | False | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+--------+-------+---------+

""" """


def __init__(self,config):
def __init__(self, config):
super().__init__() super().__init__()
self.config = config self.config = config


@@ -35,14 +55,6 @@ class CoReferencePipe(Pipe):
"bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]" "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'peaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
"[...]", "[...]","[...]","[...]" "[...]", "[...]","[...]","[...]"


处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target:
.. csv-table::
:header: "words1", "words2","words3","words4","chars","seq_len","target"

"bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
"[...]", "[...]","[...]","[...]","[...]","[...]","[...]"



:param data_bundle: :param data_bundle:
:return: :return:


+ 14
- 5
fastNLP/io/pipe/cws.py View File

@@ -138,13 +138,22 @@ class CWSPipe(Pipe):
对CWS数据进行预处理, 处理之后的数据,具备以下的结构 对CWS数据进行预处理, 处理之后的数据,具备以下的结构


.. csv-table:: .. csv-table::
:header: "raw_words", "chars", "target", "bigrams", "trigrams", "seq_len"
:header: "raw_words", "chars", "target", "seq_len"


"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", "[10, 4, 1,...]","[6, 4, 1,...]", 13
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", "[11, 12, ...]","[3, 9, ...]", 20
"...", "[...]","[...]", "[...]","[...]", .
"共同 创造 美好...", "[2, 3, 4...]", "[0, 2, 0, 2,...]", 13
"2001年 新年 钟声...", "[8, 9, 9, 7, ...]", "[0, 1, 1, 1, 2...]", 20
"...", "[...]","[...]", .


其中bigrams仅当bigrams列为True的时候存在
dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+-----------+-------+--------+---------+
| field_names | raw_words | chars | target | seq_len |
+-------------+-----------+-------+--------+---------+
| is_input | False | True | True | True |
| is_target | False | False | True | True |
| ignore_type | | False | False | False |
| pad_value | | 0 | 0 | 0 |
+-------------+-----------+-------+--------+---------+


""" """


+ 43
- 8
fastNLP/io/pipe/matching.py View File

@@ -37,16 +37,27 @@ class MatchingBertPipe(Pipe):
Matching任务的Bert pipe,输出的DataSet将包含以下的field Matching任务的Bert pipe,输出的DataSet将包含以下的field


.. csv-table:: .. csv-table::
:header: "raw_words1", "raw_words2", "words", "target", "seq_len"
:header: "raw_words1", "raw_words2", "target", "words", "seq_len"


"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", 1, 10
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", 0, 5
"...", "...", "[...]", ., .
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", 10
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", 5
"...", "...", ., "[...]", .


words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。 words列是将raw_words1(即premise), raw_words2(即hypothesis)使用"[SEP]"链接起来转换为index的。
words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss, words列被设置为input,target列被设置为target和input(设置为input以方便在forward函数中计算loss,
如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参). 如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数的形参名进行传参).


dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+------------+------------+--------+-------+---------+
| field_names | raw_words1 | raw_words2 | target | words | seq_len |
+-------------+------------+------------+--------+-------+---------+
| is_input | False | False | False | True | True |
| is_target | False | False | True | False | False |
| ignore_type | | | False | False | False |
| pad_value | | | 0 | 0 | 0 |
+-------------+------------+------------+--------+-------+---------+

""" """
def __init__(self, lower=False, tokenizer: str = 'raw'): def __init__(self, lower=False, tokenizer: str = 'raw'):
@@ -75,6 +86,18 @@ class MatchingBertPipe(Pipe):
return data_bundle return data_bundle
def process(self, data_bundle): def process(self, data_bundle):
"""
输入的data_bundle中的dataset需要具有以下结构:

.. csv-table::
:header: "raw_words1", "raw_words2", "target"

"Dana Reeve, the widow of the actor...", "Christopher Reeve had an...", "not_entailment"
"...","..."

:param data_bundle:
:return:
"""
for dataset in data_bundle.datasets.values(): for dataset in data_bundle.datasets.values():
if dataset.has_field(Const.TARGET): if dataset.has_field(Const.TARGET):
dataset.drop(lambda x: x[Const.TARGET] == '-') dataset.drop(lambda x: x[Const.TARGET] == '-')
@@ -178,15 +201,27 @@ class MatchingPipe(Pipe):
Matching任务的Pipe。输出的DataSet将包含以下的field Matching任务的Pipe。输出的DataSet将包含以下的field


.. csv-table:: .. csv-table::
:header: "raw_words1", "raw_words2", "words1", "words2", "target", "seq_len1", "seq_len2"
:header: "raw_words1", "raw_words2", "target", "words1", "words2", "seq_len1", "seq_len2"


"The new rights are...", "Everyone really likes..", "[2, 3, 4, 5, ...]", "[10, 20, 6]", 1, 10, 13
"This site includes a...", "The Government Executive...", "[11, 12, 13,...]", "[2, 7, ...]", 0, 6, 7
"...", "...", "[...]", "[...]", ., ., .
"The new rights are...", "Everyone really likes..", 1, "[2, 3, 4, 5, ...]", "[10, 20, 6]", 10, 13
"This site includes a...", "The Government Executive...", 0, "[11, 12, 13,...]", "[2, 7, ...]", 6, 7
"...", "...", ., "[...]", "[...]", ., .


words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target words1是premise,words2是hypothesis。其中words1,words2,seq_len1,seq_len2被设置为input;target被设置为target
和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数 和input(设置为input以方便在forward函数中计算loss,如果不在forward函数中计算loss也不影响,fastNLP将根据forward函数
的形参名进行传参)。 的形参名进行传参)。

dataset的print_field_meta()函数输出的各个field的被设置成input和target的情况为::

+-------------+------------+------------+--------+--------+--------+----------+----------+
| field_names | raw_words1 | raw_words2 | target | words1 | words2 | seq_len1 | seq_len2 |
+-------------+------------+------------+--------+--------+--------+----------+----------+
| is_input | False | False | False | True | True | True | True |
| is_target | False | False | True | False | False | False | False |
| ignore_type | | | False | False | False | False | False |
| pad_value | | | 0 | 0 | 0 | 0 | 0 |
+-------------+------------+------------+--------+--------+--------+----------+----------+

""" """
def __init__(self, lower=False, tokenizer: str = 'raw'): def __init__(self, lower=False, tokenizer: str = 'raw'):


Loading…
Cancel
Save