From 9d92c9506366c7741d1b28c012e442d6b3edb077 Mon Sep 17 00:00:00 2001
From: Yige XU
Date: Tue, 3 Jul 2018 16:36:41 +0800
Subject: [PATCH] Add files via upload

---
 fastNLP/loader/base_preprocess.py |  35 +++++++
 fastNLP/loader/dataset_loader.py  |  26 ++-
 fastNLP/loader/preprocess.py      | 164 ++++++++++++++++++++++++++++++
 3 files changed, 206 insertions(+), 19 deletions(-)
 create mode 100644 fastNLP/loader/base_preprocess.py
 create mode 100644 fastNLP/loader/preprocess.py

diff --git a/fastNLP/loader/base_preprocess.py b/fastNLP/loader/base_preprocess.py
new file mode 100644
index 00000000..988c0bba
--- /dev/null
+++ b/fastNLP/loader/base_preprocess.py
@@ -0,0 +1,35 @@
+
+
+class BasePreprocess(object):
+
+
+    def __init__(self, data, pickle_path):
+        super(BasePreprocess, self).__init__()
+        self.data = data
+        self.pickle_path = pickle_path
+        if not self.pickle_path.endswith('/'):
+            self.pickle_path = self.pickle_path + '/'
+
+    def word2id(self):
+        pass
+
+    def id2word(self):
+        pass
+
+    def class2id(self):
+        pass
+
+    def id2class(self):
+        pass
+
+    def embedding(self):
+        pass
+
+    def data_train(self):
+        pass
+
+    def data_dev(self):
+        pass
+
+    def data_test(self):
+        pass
\ No newline at end of file

diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 0cec50e5..7e0770bd 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -18,29 +18,17 @@ class POSDatasetLoader(DatasetLoader):
 
     def load(self):
+        assert os.path.exists(self.data_path)
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            line = f.read()
+        return line
+
+    def load_lines(self):
         assert os.path.exists(self.data_path)
         with open(self.data_path, "r", encoding="utf-8") as f:
             lines = f.readlines()
-        return self.parse(lines)
+        return lines
 
-    @staticmethod
-    def parse(lines):
-        """
-        :param lines: lines from dataset
-        :return: list(list(list())): the three level of lists are
-            token, sentence, and dataset
-        """
-        dataset = list()
-        for line in lines:
-            sentence = list()
-            words = line.split(" ")
-            for w in words:
-                tokens = list()
-                tokens.append(w.split('/')[0])
-                tokens.append(w.split('/')[1])
-                sentence.append(tokens)
-            dataset.append(sentence)
-        return dataset
 
 
 class ClassficationDatasetLoader(DatasetLoader):
     """loader for classfication data sets"""

diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
new file mode 100644
index 00000000..8e880107
--- /dev/null
+++ b/fastNLP/loader/preprocess.py
@@ -0,0 +1,164 @@
+import _pickle
+import os
+
+from fastNLP.loader.base_preprocess import BasePreprocess
+
+DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
+DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
+DEFAULT_RESERVED_LABEL = ['<reserved-2>',
+                          '<reserved-3>',
+                          '<reserved-4>']  # dict index = 2~4
+# the first word added to the dict gets index 5
+
+
+class POSPreprocess(BasePreprocess):
+
+    """
+    This class is used to preprocess POS datasets.
+    In these datasets, each line is split by '\t':
+    the first column is the word and the second
+    column is its label.
+    Sentences are separated by an empty line.
+    e.g.:
+    Tom     label1
+    and     label2
+    Jerry   label1
+    .       label3
+
+    Hello   label4
+    world   label5
+    !       label3
+    This file contains two sentences, "Tom and Jerry ."
+    and "Hello world !". Each word has its own label, from label1
+    to label5.
+ """ + + def __init__(self, data, pickle_path): + super(POSPreprocess, self).__init(data, pickle_path) + self.build_dict() + self.word2id() + self.id2word() + self.class2id() + self.id2class() + self.embedding() + self.data_train() + self.data_dev() + self.data_test() + #... + + + def build_dict(self): + self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1, + DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3, + DEFAULT_RESERVED_LABEL[2]: 4} + self.label_dict = {} + for w in self.data: + if len(w) == 0: + continue + word = w.split('\t') + + if word[0] not in self.word_dict: + index = len(self.word_dict) + self.word_dict[word[0]] = index + + for label in word[1: ]: + if label not in self.label_dict: + index = len(self.label_dict) + self.label_dict[label] = index + + + def pickle_exist(self, pickle_name): + """ + :param pickle_name: the filename of target pickle file + :return: True if file exists else False + """ + if not os.path.exists(self.pickle_path): + os.makedirs(self.pickle_path) + file_name = self.pickle_path + pickle_name + if os.path.exists(file_name): + return True + else: + return False + + + def word2id(self): + if self.pickle_exist("word2id.pkl"): + return + # nothing will be done if word2id.pkl exists + + file_name = self.pickle_path + "word2id.pkl" + with open(file_name, "wb", encoding='utf-8') as f: + _pickle.dump(self.word_dict, f) + + + def id2word(self): + if self.pickle_exist("id2word.pkl"): + return + #nothing will be done if id2word.pkl exists + + id2word_dict = {} + for word in self.word_dict: + id2word_dict[self.word_dict[word]] = word + file_name = self.pickle_path + "id2word.pkl" + with open(file_name, "wb", encoding='utf-8') as f: + _pickle.dump(id2word_dict, f) + + + def class2id(self): + if self.pickle_exist("class2id.pkl"): + return + # nothing will be done if class2id.pkl exists + + file_name = self.pickle_path + "class2id.pkl" + with open(file_name, "wb", encoding='utf-8') as f: + _pickle.dump(self.label_dict, f) + + + def id2class(self): + if self.pickle_exist("id2class.pkl"): + return + #nothing will be done if id2class.pkl exists + + id2class_dict = {} + for label in self.label_dict: + id2class_dict[self.label_dict[label]] = label + file_name = self.pickle_path + "id2class.pkl" + with open(file_name, "wb", encoding='utf-8') as f: + _pickle.dump(id2class_dict, f) + + + def embedding(self): + if self.pickle_exist("embedding.pkl"): + return + #nothing will be done if embedding.pkl exists + + + def data_train(self): + if self.pickle_exist("data_train.pkl"): + return + #nothing will be done if data_train.pkl exists + + data_train = [] + sentence = [] + for w in self.data: + if len(w) == 0: + wid = [] + lid = [] + for i in range(len(sentence)): + wid.append(self.word_dict[sentence[i][0]]) + lid.append(self.label_dict[sentence[i][1]]) + data_train.append((wid, lid)) + sentence = [] + sentence.append(w.split('\t')) + + file_name = self.pickle_path + "data_train.pkl" + with open(file_name, "wb", encoding='utf-8') as f: + _pickle.dump(data_train, f) + + def data_dev(self): + pass + + def data_test(self): + pass