Browse Source

Merge pull request #178 from SrWYG/dev0.5.0

[bugfix] IMDB/SST/YELP dataloader: char_level output had no spaces between words; [add] IMDB/SST now use the spaCy tokenizer
tags/v0.4.10
Yige XU GitHub 6 years ago
parent
commit
1f4cd0c8b6
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 17 additions and 9 deletions
  1. +6
    -2
      reproduction/text_classification/data/IMDBLoader.py
  2. +7
    -3
      reproduction/text_classification/data/sstLoader.py
  3. +4
    -4
      reproduction/text_classification/data/yelpLoader.py

+ 6
- 2
reproduction/text_classification/data/IMDBLoader.py View File

@@ -8,6 +8,7 @@ from fastNLP import Vocabulary
from fastNLP import Const
# from reproduction.utils import check_dataloader_paths
from functools import partial
from reproduction.utils import check_dataloader_paths, get_tokenizer




class IMDBLoader(DataSetLoader):
@@ -22,6 +23,7 @@ class IMDBLoader(DataSetLoader):


def __init__(self):
super(IMDBLoader, self).__init__()
self.tokenizer = get_tokenizer()


def _load(self, path):
dataset = DataSet()
@@ -32,7 +34,7 @@ class IMDBLoader(DataSetLoader):
continue
parts = line.split('\t')
target = parts[0]
words = parts[1].lower().split()
words = self.tokenizer(parts[1].lower())
dataset.append(Instance(words=words, target=target))


if len(dataset)==0:
@@ -52,13 +54,15 @@ class IMDBLoader(DataSetLoader):
for name, path in paths.items():
dataset = self.load(path)
datasets[name] = dataset
def wordtochar(words):
chars = []
for word in words:
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars


if char_level_op:


+ 7
- 3
reproduction/text_classification/data/sstLoader.py View File

@@ -7,6 +7,7 @@ from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict
from reproduction.utils import check_dataloader_paths, get_tokenizer


class SSTLoader(DataSetLoader):
URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
'''
def __init__(self):
super(sst2Loader, self).__init__()
self.tokenizer = get_tokenizer()


def _load(self, path: str) -> DataSet:
ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
words=self.tokenizer(row[0])
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)
@@ -135,11 +137,13 @@ class sst2Loader(DataSetLoader):
datasets[name] = dataset


def wordtochar(words):
chars=[]
chars = []
for word in words:
word=word.lower()
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars


input_name, target_name = 'words', 'target'


+ 4
- 4
reproduction/text_classification/data/yelpLoader.py View File

@@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader):
_train_ds = [info.datasets[name]
for name in train_ds] if train_ds else info.datasets.values()



def wordtochar(words):

chars=[]
chars = []
for word in words:
word=word.lower()
word = word.lower()
for char in word:
chars.append(char)
chars.append('')
chars.pop()
return chars


input_name, target_name = 'words', 'target'


Loading…
Cancel
Save