From d8bd40daf0e0c322ad13a208cb2d26becd299393 Mon Sep 17 00:00:00 2001
From: wyg <1505116161@qq.com>
Date: Mon, 8 Jul 2019 13:00:53 +0800
Subject: [PATCH] [verify] sst2loader use spacy tokenizer

---
 reproduction/text_classification/data/sstLoader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py
index d8403b7a..e1907d8f 100644
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@@ -7,6 +7,7 @@ from fastNLP import Instance
 from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
 import csv
 from typing import Union, Dict
+from reproduction.utils import check_dataloader_paths, get_tokenizer
 
 class SSTLoader(DataSetLoader):
     URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
     '''
     def __init__(self):
         super(sst2Loader, self).__init__()
+        self.tokenizer = get_tokenizer()
 
     def _load(self, path: str) -> DataSet:
         ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words = row[0].split()
+            words = self.tokenizer(row[0])
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)