From d8bd40daf0e0c322ad13a208cb2d26becd299393 Mon Sep 17 00:00:00 2001
From: wyg <1505116161@qq.com>
Date: Mon, 8 Jul 2019 13:00:53 +0800
Subject: [PATCH 1/2] [verify] sst2loader use spacy tokenizer

---
 reproduction/text_classification/data/sstLoader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py
index d8403b7a..e1907d8f 100644
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@@ -7,6 +7,7 @@ from fastNLP import Instance
 from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
 import csv
 from typing import Union, Dict
+from reproduction.utils import check_dataloader_paths, get_tokenizer
 
 class SSTLoader(DataSetLoader):
     URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
     '''
     def __init__(self):
         super(sst2Loader, self).__init__()
+        self.tokenizer = get_tokenizer()
 
     def _load(self, path: str) -> DataSet:
         ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words = row[0].split()
+            words=self.tokenizer(words)
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)

From 191af01b177bba8295336cf8e33995b237f4caae Mon Sep 17 00:00:00 2001
From: wyg <1505116161@qq.com>
Date: Mon, 8 Jul 2019 13:43:28 +0800
Subject: [PATCH 2/2] =?UTF-8?q?[verify]=20sst2loader/IMDB=20use=20spacy=20?=
 =?UTF-8?q?tokenizer=20[verify]=20char=5Flevel=20=E6=97=A0=E7=A9=BA?=
 =?UTF-8?q?=E6=A0=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 reproduction/text_classification/data/IMDBLoader.py | 8 ++++++--
 reproduction/text_classification/data/sstLoader.py  | 8 +++++---
 reproduction/text_classification/data/yelpLoader.py | 8 ++++----
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py
index 30daf233..0cdab15e 100644
--- a/reproduction/text_classification/data/IMDBLoader.py
+++ b/reproduction/text_classification/data/IMDBLoader.py
@@ -8,6 +8,7 @@ from fastNLP import Vocabulary
 from fastNLP import Const
 # from reproduction.utils import check_dataloader_paths
 from functools import partial
+from reproduction.utils import check_dataloader_paths, get_tokenizer
 
 
 class IMDBLoader(DataSetLoader):
@@ -22,6 +23,7 @@
 
     def __init__(self):
         super(IMDBLoader, self).__init__()
+        self.tokenizer = get_tokenizer()
 
     def _load(self, path):
         dataset = DataSet()
@@ -32,7 +34,7 @@
                 continue
             parts = line.split('\t')
             target = parts[0]
-            words = parts[1].lower().split()
+            words = self.tokenizer(parts[1].lower())
             dataset.append(Instance(words=words, target=target))
 
         if len(dataset)==0:
@@ -52,13 +54,15 @@
         for name, path in paths.items():
             dataset = self.load(path)
             datasets[name] = dataset
-        
+
        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
+               chars.append('')
+           chars.pop()
            return chars
 
        if char_level_op:
diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py
index e1907d8f..14524ea5 100644
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@@ -116,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words=self.tokenizer(words)
+            words=self.tokenizer(row[0])
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)
@@ -137,11 +137,13 @@ class sst2Loader(DataSetLoader):
             datasets[name] = dataset
 
         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars
 
         input_name, target_name = 'words', 'target'
diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py
index c5c91f17..f34cfbbf 100644
--- a/reproduction/text_classification/data/yelpLoader.py
+++ b/reproduction/text_classification/data/yelpLoader.py
@@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader):
         _train_ds = [info.datasets[name]
                      for name in train_ds] if train_ds else info.datasets.values()
-
         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars
 
         input_name, target_name = 'words', 'target'
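
Notes:

Both commits route tokenization through get_tokenizer() from reproduction/utils instead of a bare str.split() on the raw text. A minimal sketch of such a helper, assuming spaCy's English tokenizer is the intended backend with a whitespace-split fallback (the real implementation in reproduction/utils.py may differ):

    def get_tokenizer():
        # Prefer spaCy's rule-based English tokenizer; fall back to plain
        # whitespace splitting when spaCy or its English model is missing.
        try:
            import spacy
            spacy_en = spacy.load('en')
            print('use spacy tokenizer')
            return lambda text: [token.text for token in spacy_en.tokenizer(text)]
        except Exception:
            print('use raw tokenizer: str.split()')
            return lambda text: text.split()

With a callable like this, sst2Loader._load tokenizes row[0] (the second commit corrects the first commit's words=self.tokenizer(words), which read the variable before it was assigned) and IMDBLoader._load tokenizes parts[1].lower().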
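The second commit also adjusts the wordtochar helper shared by the three loaders: after each word's characters it appends an empty string and then pops the trailing one, so the characters of adjacent words stay separated without inserting a literal space character (the MIME-encoded part of the subject decodes to "无空格", i.e. "no spaces"). A self-contained illustration of the patched behaviour, with the function body taken from the diff and a hypothetical toy input:

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')  # empty-string boundary after every word
        chars.pop()           # drop the boundary after the last word
        return chars

    print(wordtochar(['It', 'works']))
    # ['i', 't', '', 'w', 'o', 'r', 'k', 's']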