From d8bd40daf0e0c322ad13a208cb2d26becd299393 Mon Sep 17 00:00:00 2001
From: wyg <1505116161@qq.com>
Date: Mon, 8 Jul 2019 13:00:53 +0800
Subject: [PATCH 1/2] [verify] sst2loader use spacy tokenizer

---
 reproduction/text_classification/data/sstLoader.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py
index d8403b7a..e1907d8f 100644
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@@ -7,6 +7,7 @@ from fastNLP import Instance
 from fastNLP.io.embed_loader import EmbeddingOption, EmbedLoader
 import csv
 from typing import Union, Dict
+from reproduction.utils import check_dataloader_paths, get_tokenizer
 
 class SSTLoader(DataSetLoader):
     URL = 'https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip'
@@ -104,6 +105,7 @@ class sst2Loader(DataSetLoader):
     '''
     def __init__(self):
         super(sst2Loader, self).__init__()
+        self.tokenizer = get_tokenizer()
 
     def _load(self, path: str) -> DataSet:
         ds = DataSet()
@@ -114,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words = row[0].split()
+            words=self.tokenizer(words)
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)

From 191af01b177bba8295336cf8e33995b237f4caae Mon Sep 17 00:00:00 2001
From: wyg <1505116161@qq.com>
Date: Mon, 8 Jul 2019 13:43:28 +0800
Subject: [PATCH 2/2] =?UTF-8?q?[verify]=20sst2loader/IMDB=20use=20spacy=20?=
 =?UTF-8?q?tokenizer=20[verify]=20char=5Flevel=20=E6=97=A0=E7=A9=BA?=
 =?UTF-8?q?=E6=A0=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 reproduction/text_classification/data/IMDBLoader.py | 8 ++++++--
 reproduction/text_classification/data/sstLoader.py  | 8 +++++---
 reproduction/text_classification/data/yelpLoader.py | 8 ++++----
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/reproduction/text_classification/data/IMDBLoader.py b/reproduction/text_classification/data/IMDBLoader.py
index 30daf233..0cdab15e 100644
--- a/reproduction/text_classification/data/IMDBLoader.py
+++ b/reproduction/text_classification/data/IMDBLoader.py
@@ -8,6 +8,7 @@ from fastNLP import Vocabulary
 from fastNLP import Const
 # from reproduction.utils import check_dataloader_paths
 from functools import partial
+from reproduction.utils import check_dataloader_paths, get_tokenizer
 
 
 class IMDBLoader(DataSetLoader):
@@ -22,6 +23,7 @@
 
     def __init__(self):
         super(IMDBLoader, self).__init__()
+        self.tokenizer = get_tokenizer()
 
     def _load(self, path):
         dataset = DataSet()
@@ -32,7 +34,7 @@
                 continue
             parts = line.split('\t')
             target = parts[0]
-            words = parts[1].lower().split()
+            words = self.tokenizer(parts[1].lower())
             dataset.append(Instance(words=words, target=target))
 
         if len(dataset)==0:
@@ -52,13 +54,15 @@
         for name, path in paths.items():
             dataset = self.load(path)
             datasets[name] = dataset
-        
+
        def wordtochar(words):
            chars = []
            for word in words:
                word = word.lower()
                for char in word:
                    chars.append(char)
+               chars.append('')
+           chars.pop()
            return chars
 
        if char_level_op:
diff --git a/reproduction/text_classification/data/sstLoader.py b/reproduction/text_classification/data/sstLoader.py
index e1907d8f..14524ea5 100644
--- a/reproduction/text_classification/data/sstLoader.py
+++ b/reproduction/text_classification/data/sstLoader.py
@@ -116,7 +116,7 @@ class sst2Loader(DataSetLoader):
             if idx<=skip_row:
                 continue
             target = row[1]
-            words=self.tokenizer(words)
+            words=self.tokenizer(row[0])
             ds.append(Instance(words=words,target=target))
             all_count+=1
         print("all count:", all_count)
@@ -137,11 +137,13 @@ class sst2Loader(DataSetLoader):
             datasets[name] = dataset
 
         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars
 
         input_name, target_name = 'words', 'target'
diff --git a/reproduction/text_classification/data/yelpLoader.py b/reproduction/text_classification/data/yelpLoader.py
index c5c91f17..f34cfbbf 100644
--- a/reproduction/text_classification/data/yelpLoader.py
+++ b/reproduction/text_classification/data/yelpLoader.py
@@ -141,14 +141,14 @@ class yelpLoader(DataSetLoader):
         _train_ds = [info.datasets[name]
                      for name in train_ds] if train_ds else info.datasets.values()
-
         def wordtochar(words):
-            chars=[]
+            chars = []
             for word in words:
-                word=word.lower()
+                word = word.lower()
                 for char in word:
                     chars.append(char)
+                chars.append('')
+            chars.pop()
             return chars
 
         input_name, target_name = 'words', 'target'
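
Notes:

Both commits route tokenization through get_tokenizer() from reproduction/utils instead of a bare str.split() on the raw text. A minimal sketch of such a helper, assuming spaCy's English tokenizer is the intended backend with a whitespace-split fallback (the real implementation in reproduction/utils.py may differ):

    def get_tokenizer():
        # Prefer spaCy's rule-based English tokenizer; fall back to plain
        # whitespace splitting when spaCy or its English model is missing.
        try:
            import spacy
            spacy_en = spacy.load('en')
            print('use spacy tokenizer')
            return lambda text: [token.text for token in spacy_en.tokenizer(text)]
        except Exception:
            print('use raw tokenizer: str.split()')
            return lambda text: text.split()

With a callable like this, sst2Loader._load tokenizes row[0] (the second commit corrects the first commit's words=self.tokenizer(words), which read the variable before it was assigned) and IMDBLoader._load tokenizes parts[1].lower().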
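The second commit also adjusts the wordtochar helper shared by the three loaders: after each word's characters it appends an empty string and then pops the trailing one, so the characters of adjacent words stay separated without inserting a literal space character (the MIME-encoded part of the subject decodes to "无空格", i.e. "no spaces"). A self-contained illustration of the patched behaviour, with the function body taken from the diff and a hypothetical toy input:

    def wordtochar(words):
        chars = []
        for word in words:
            word = word.lower()
            for char in word:
                chars.append(char)
            chars.append('')  # empty-string boundary after every word
        chars.pop()           # drop the boundary after the last word
        return chars

    print(wordtochar(['It', 'works']))
    # ['i', 't', '', 'w', 'o', 'r', 'k', 's']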