Browse Source

[verify] sst2loader use spacy tokenizer

tags/v0.4.10
wyg 5 years ago
parent
commit
d8bd40daf0
1 changed file with 3 additions and 1 deletion
  1. +3
    -1
      reproduction/text_classification/data/sstLoader.py

+ 3
- 1
reproduction/text_classification/data/sstLoader.py View File

@@ -7,6 +7,7 @@ from fastNLP import Instance
from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
import csv
from typing import Union, Dict
from reproduction.utils import check_dataloader_paths, get_tokenizer

class SSTLoader(DataSetLoader):
URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
'''
def __init__(self):
    """Set up the loader and attach a tokenizer obtained from ``get_tokenizer()``."""
    super().__init__()
    # Kept on the instance so ``_load`` can call it on each row it reads.
    self.tokenizer = get_tokenizer()

def _load(self, path: str) -> DataSet:
ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
if idx<=skip_row:
continue
target = row[1]
words = row[0].split()
words=self.tokenizer(words)
ds.append(Instance(words=words,target=target))
all_count+=1
print("all count:", all_count)


Loading…
Cancel
Save