Browse Source

[verify] sst2loader/IMDB use spacy tokenizer

[verify] char_level 无空格 (char_level: no spaces between words)
tags/v0.4.10
wyg 5 years ago
parent
commit
191af01b17
3 changed files with 15 additions and 9 deletions
  1. +6
    -2
      reproduction/text_classification/data/IMDBLoader.py
  2. +5
    -3
      reproduction/text_classification/data/sstLoader.py
  3. +4
    -4
      reproduction/text_classification/data/yelpLoader.py

+ 6
- 2
reproduction/text_classification/data/IMDBLoader.py View File

@@ -8,6 +8,7 @@ from fastNLP import Vocabulary
 from fastNLP import Const
 # from reproduction.utils import check_dataloader_paths
 from functools import partial
+from reproduction.utils import check_dataloader_paths, get_tokenizer




 class IMDBLoader(DataSetLoader):
@@ -22,6 +23,7 @@ class IMDBLoader(DataSetLoader):

     def __init__(self):
         super(IMDBLoader, self).__init__()
+        self.tokenizer = get_tokenizer()


     def _load(self, path):
         dataset = DataSet()
@@ -32,7 +34,7 @@ class IMDBLoader(DataSetLoader):
                 continue
             parts = line.split('\t')
             target = parts[0]
-            words = parts[1].lower().split()
+            words = self.tokenizer(parts[1].lower())
             dataset.append(Instance(words=words, target=target))


         if len(dataset)==0:
@@ -52,13 +54,15 @@ class IMDBLoader(DataSetLoader):
         for name, path in paths.items():
             dataset = self.load(path)
             datasets[name] = dataset

         def wordtochar(words):
             chars = []
             for word in words:
                 word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars

         if char_level_op:


+ 5
- 3
reproduction/text_classification/data/sstLoader.py View File

@@ -116,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words=self.tokenizer(words)
+            words=self.tokenizer(row[0])
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)
@@ -137,11 +137,13 @@ class sst2Loader(DataSetLoader):
             datasets[name] = dataset

         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars

         input_name, target_name = 'words', 'target'


+ 4
- 4
reproduction/text_classification/data/yelpLoader.py View File

@@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader):
         _train_ds = [info.datasets[name]
                      for name in train_ds] if train_ds else info.datasets.values()

         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars

         input_name, target_name = 'words', 'target'


Loading…
Cancel
Save