[bugfix] IMDB/SST/YELP dataloaders: char_level output has no spaces between words  [add] IMDB/SST use the spaCy tokenizer  (tags/v0.4.10)
@@ -8,6 +8,7 @@ from fastNLP import Vocabulary
 from fastNLP import Const
 # from reproduction.utils import check_dataloader_paths
 from functools import partial
+from reproduction.utils import check_dataloader_paths, get_tokenizer

 class IMDBLoader(DataSetLoader):
@@ -22,6 +23,7 @@ class IMDBLoader(DataSetLoader):
     def __init__(self):
         super(IMDBLoader, self).__init__()
+        self.tokenizer = get_tokenizer()

     def _load(self, path):
         dataset = DataSet()
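Only the call sites appear in this diff; the body of get_tokenizer in reproduction/utils.py is not part of the change. Going by the commit title ("IMDB/SST use spacy tokenizer"), it presumably returns a spaCy-backed callable. A minimal sketch of such a helper, where the model name and the whitespace fallback are assumptions, not the actual implementation:

import spacy

def get_tokenizer():
    # Sketch: return a callable mapping a raw string to a list of tokens.
    try:
        nlp = spacy.load('en_core_web_sm')  # assumed model name
        return lambda text: [tok.text for tok in nlp.tokenizer(text)]
    except OSError:
        # assumed fallback when no spaCy model is installed locally
        return str.split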
@@ -32,7 +34,7 @@ class IMDBLoader(DataSetLoader):
                     continue
                 parts = line.split('\t')
                 target = parts[0]
-                words = parts[1].lower().split()
+                words = self.tokenizer(parts[1].lower())
                 dataset.append(Instance(words=words, target=target))

         if len(dataset)==0:
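The point of swapping str.split for the tokenizer: plain whitespace splitting keeps punctuation glued to words, so "movie," and "movie" end up as distinct vocabulary entries, while spaCy splits off punctuation and contractions. An illustrative comparison (the spaCy output is approximate, from its default English tokenizer):

line = "A great movie, isn't it?"
line.lower().split()
# -> ['a', 'great', 'movie,', "isn't", 'it?']
# the spaCy tokenizer yields roughly:
# -> ['a', 'great', 'movie', ',', 'is', "n't", 'it', '?']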
@@ -52,13 +54,15 @@
         for name, path in paths.items():
             dataset = self.load(path)
             datasets[name] = dataset

         def wordtochar(words):
             chars = []
             for word in words:
                 word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append(' ')  # separator between words: the "no spaces" fix
+            chars.pop()  # drop the trailing separator
             return chars

         if char_level_op:
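The two added lines are the char_level bugfix from the title: previously the characters of adjacent words ran together with no boundary, since nothing was appended between words. A separator is now appended after every word and the trailing one popped. The rendered diff shows the separator as chars.append('') with nothing between the quotes, but the bug being fixed ("no spaces at char level") indicates a literal space, which is how it is written here. For example:

wordtochar(['great', 'movie'])
# without the fix: ['g', 'r', 'e', 'a', 't', 'm', 'o', 'v', 'i', 'e']
# with the fix:    ['g', 'r', 'e', 'a', 't', ' ', 'm', 'o', 'v', 'i', 'e']

The same two-line fix is repeated in sst2Loader and yelpLoader below.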
@@ -7,6 +7,7 @@ from fastNLP import Instance
 from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
 import csv
 from typing import Union, Dict
+from reproduction.utils import check_dataloader_paths, get_tokenizer

 class SSTLoader(DataSetLoader):
     URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
     '''
     def __init__(self):
         super(sst2Loader, self).__init__()
+        self.tokenizer = get_tokenizer()

     def _load(self, path: str) -> DataSet:
         ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words = row[0].split()
+            words = self.tokenizer(row[0])
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)
@@ -135,11 +137,13 @@ class sst2Loader(DataSetLoader):
             datasets[name] = dataset

         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append(' ')
+            chars.pop()
             return chars

         input_name, target_name = 'words', 'target'
@@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader):
         _train_ds = [info.datasets[name]
                      for name in train_ds] if train_ds else info.datasets.values()

         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append(' ')
+            chars.pop()
             return chars

         input_name, target_name = 'words', 'target'
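Not part of this commit, but worth noting: the identical wordtochar closure is now copy-pasted into the IMDB, SST-2 and Yelp loaders. A natural follow-up would be one shared helper, e.g. in reproduction/utils.py next to get_tokenizer (hypothetical placement), which is also a chance to guard the pop() against empty input:

def wordtochar(words):
    # Flatten words into a character sequence, with ' ' marking word boundaries.
    chars = []
    for word in words:
        word = word.lower()
        for char in word:
            chars.append(char)
        chars.append(' ')  # word boundary
    if chars:
        chars.pop()  # drop the trailing boundary; the guard avoids IndexError on []
    return chars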