From 92fa48e1ce01f1d01500f74dc647c72d358536e0 Mon Sep 17 00:00:00 2001 From: benbijituo Date: Fri, 27 Sep 2019 16:37:34 +0800 Subject: [PATCH 1/3] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86CNXNLI=E7=9A=84?= =?UTF-8?q?=5Fload()=EF=BC=8C=E5=8F=AF=E4=BB=A5=E5=A4=84=E7=90=86=E7=89=B9?= =?UTF-8?q?=E6=AE=8A=E7=9A=84instance=E6=A0=BC=E5=BC=8F=E5=A6=82=E4=B8=8B?= =?UTF-8?q?=EF=BC=9A=20=E2=80=9CXXX\t"XXX\tXXX?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/pipe/matching.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index dbe69525..016730f2 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -351,6 +351,10 @@ class MNLIPipe(MatchingPipe): class LCQMCPipe(MatchingPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) data_bundle = RenamePipe().process(data_bundle) @@ -360,6 +364,10 @@ class LCQMCPipe(MatchingPipe): class CNXNLIPipe(MatchingPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) data_bundle = GranularizePipe(task='XNLI').process(data_bundle) @@ -370,6 +378,10 @@ class CNXNLIPipe(MatchingPipe): class BQCorpusPipe(MatchingPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) data_bundle = RenamePipe().process(data_bundle) @@ -462,6 +474,10 @@ class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len class LCQMCBertPipe(MatchingBertPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) 
@@ -472,6 +488,10 @@ class LCQMCBertPipe(MatchingBertPipe): class BQCorpusBertPipe(MatchingBertPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) data_bundle = RenamePipe(task='cn-nli-bert').process(data_bundle) @@ -482,6 +502,10 @@ class BQCorpusBertPipe(MatchingBertPipe): class CNXNLIBertPipe(MatchingBertPipe): + def __init__(self): + super().__init__() + self.tokenizer = 'cn-char' + def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) data_bundle = GranularizePipe(task='XNLI').process(data_bundle) From 6b147711af95ad02938cd7ccdda95884fe9de4e1 Mon Sep 17 00:00:00 2001 From: benbijituo Date: Fri, 27 Sep 2019 16:49:26 +0800 Subject: [PATCH 2/3] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E4=BA=86CNXNLI=E7=9A=84?= =?UTF-8?q?=5Fload()=EF=BC=8C=E5=8F=AF=E4=BB=A5=E5=A4=84=E7=90=86=E7=89=B9?= =?UTF-8?q?=E6=AE=8A=E7=9A=84instance=E6=A0=BC=E5=BC=8F=E5=A6=82=E4=B8=8B?= =?UTF-8?q?=EF=BC=9A=20=E2=80=9CXXX\t"XXX\tXXX?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/io/pipe/matching.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 016730f2..6af8f5a6 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -352,8 +352,7 @@ class MNLIPipe(MatchingPipe): class LCQMCPipe(MatchingPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) @@ -365,8 +364,7 @@ class LCQMCPipe(MatchingPipe): class CNXNLIPipe(MatchingPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) @@ -379,8 +377,7 @@ class 
CNXNLIPipe(MatchingPipe): class BQCorpusPipe(MatchingPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) @@ -475,8 +472,7 @@ class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len class LCQMCBertPipe(MatchingBertPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) @@ -489,8 +485,7 @@ class LCQMCBertPipe(MatchingBertPipe): class BQCorpusBertPipe(MatchingBertPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) @@ -503,8 +498,7 @@ class BQCorpusBertPipe(MatchingBertPipe): class CNXNLIBertPipe(MatchingBertPipe): def __init__(self): - super().__init__() - self.tokenizer = 'cn-char' + super().__init__(tokenizer='cn-char') def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) From 7636ef2990b11ce15082ea71ee233c06357644e3 Mon Sep 17 00:00:00 2001 From: Yige Xu Date: Sat, 28 Sep 2019 18:51:57 +0800 Subject: [PATCH 3/3] fix bugs in Chinese Matching Pipe --- fastNLP/io/pipe/matching.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fastNLP/io/pipe/matching.py b/fastNLP/io/pipe/matching.py index 6af8f5a6..f58706fe 100644 --- a/fastNLP/io/pipe/matching.py +++ b/fastNLP/io/pipe/matching.py @@ -351,8 +351,8 @@ class MNLIPipe(MatchingPipe): class LCQMCPipe(MatchingPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) @@ -363,8 +363,8 @@ class LCQMCPipe(MatchingPipe): class 
CNXNLIPipe(MatchingPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths) @@ -376,8 +376,8 @@ class CNXNLIPipe(MatchingPipe): class BQCorpusPipe(MatchingPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) @@ -471,8 +471,8 @@ class MachingTruncatePipe(Pipe): # truncate sentence for bert, modify seq_len class LCQMCBertPipe(MatchingBertPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = LCQMCLoader().load(paths) @@ -484,8 +484,8 @@ class LCQMCBertPipe(MatchingBertPipe): class BQCorpusBertPipe(MatchingBertPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = BQCorpusLoader().load(paths) @@ -497,8 +497,8 @@ class BQCorpusBertPipe(MatchingBertPipe): class CNXNLIBertPipe(MatchingBertPipe): - def __init__(self): - super().__init__(tokenizer='cn-char') + def __init__(self, tokenizer='cn-char'): + super().__init__(tokenizer=tokenizer) def process_from_file(self, paths=None): data_bundle = CNXNLILoader().load(paths)