From 191af01b177bba8295336cf8e33995b237f4caae Mon Sep 17 00:00:00 2001 From: wyg <1505116161@qq.com> Date: Mon, 8 Jul 2019 13:43:28 +0800 Subject: [PATCH] =?UTF-8?q?[verify]=20sst2loader/IMDB=20use=20spacy=20toke?= =?UTF-8?q?nizer=20[verify]=20char=5Flevel=20=E6=97=A0=E7=A9=BA=E6=A0=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- reproduction/text_classification/data/IMDBLoader.py | 8 ++++++-- reproduction/text_classification/data/sstLoader.py | 8 +++++--- reproduction/text_classification/data/yelpLoader.py | 8 ++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py index 30daf233..0cdab15e 100644 --- a/reproduction/text_classification/data/IMDBLoader.py +++ b/reproduction/text_classification/data/IMDBLoader.py @@ -8,6 +8,7 @@ from fastNLP import Vocabulary from fastNLP import Const # from reproduction.utils import check_dataloader_paths from functools import partial +from reproduction.utils import check_dataloader_paths, get_tokenizer class IMDBLoader(DataSetLoader): @@ -22,6 +23,7 @@ class IMDBLoader(DataSetLoader): def __init__(self): super(IMDBLoader, self).__init__() + self.tokenizer = get_tokenizer() def _load(self, path): dataset = DataSet() @@ -32,7 +34,7 @@ class IMDBLoader(DataSetLoader): continue parts = line.split('\t') target = parts[0] - words = parts[1].lower().split() + words = self.tokenizer(parts[1].lower()) dataset.append(Instance(words=words, target=target)) if len(dataset)==0: @@ -52,13 +54,15 @@ class IMDBLoader(DataSetLoader): for name, path in paths.items(): dataset = self.load(path) datasets[name] = dataset - + def wordtochar(words): chars = [] for word in words: word = word.lower() for char in word: chars.append(char) + chars.append('') + chars.pop() return chars if char_level_op: diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py index e1907d8f..14524ea5 100644 --- a/reproduction/text_classification/data/sstLoader.py +++ b/reproduction/text_classification/data/sstLoader.py @@ -116,7 +116,7 @@ class sst2Loader(DataSetLoader): if idx<=skip_row: continue target = row[1] - words=self.tokenizer(words) + words=self.tokenizer(row[0]) ds.append(Instance(words=words,target=target)) all_count+=1 print("all count:", all_count) @@ -137,11 +137,13 @@ class sst2Loader(DataSetLoader): datasets[name] = dataset def wordtochar(words): - chars=[] + chars = [] for word in words: - word=word.lower() + word = word.lower() for char in word: chars.append(char) + chars.append('') + chars.pop() return chars input_name, target_name = 'words', 'target' diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py index c5c91f17..f34cfbbf 100644 --- a/reproduction/text_classification/data/yelpLoader.py +++ b/reproduction/text_classification/data/yelpLoader.py @@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader): _train_ds = [info.datasets[name] for name in train_ds] if train_ds else info.datasets.values() - def wordtochar(words): - - chars=[] + chars = [] for word in words: - word=word.lower() + word = word.lower() for char in word: chars.append(char) + chars.append('') + chars.pop() return chars input_name, target_name = 'words', 'target'