
Merge pull request #9 from xuyige/master

Add files via upload
tags/v0.1.0
Coet (via GitHub) committed 6 years ago
commit 67557222c0
3 changed files with 206 additions and 19 deletions
  1. fastNLP/loader/base_preprocess.py   +35   -0
  2. fastNLP/loader/dataset_loader.py    +7    -19
  3. fastNLP/loader/preprocess.py        +164  -0

+ 35  - 0   fastNLP/loader/base_preprocess.py

@@ -0,0 +1,35 @@
class BasePreprocess(object):

    def __init__(self, data, pickle_path):
        super(BasePreprocess, self).__init__()
        self.data = data
        self.pickle_path = pickle_path
        if not self.pickle_path.endswith('/'):
            self.pickle_path = self.pickle_path + '/'

    def word2id(self):
        raise NotImplementedError

    def id2word(self):
        raise NotImplementedError

    def class2id(self):
        raise NotImplementedError

    def id2class(self):
        raise NotImplementedError

    def embedding(self):
        raise NotImplementedError

    def data_train(self):
        raise NotImplementedError

    def data_dev(self):
        raise NotImplementedError

    def data_test(self):
        raise NotImplementedError
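
For reference, a minimal sketch of how the path normalization in __init__ behaves (the data argument here is illustrative; the class itself is abstract and every hook method raises NotImplementedError):

    from fastNLP.loader.base_preprocess import BasePreprocess

    p = BasePreprocess(data=["Tom\tlabel1"], pickle_path="./cache")
    print(p.pickle_path)  # "./cache/" -- a trailing '/' is appended if missing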

+ 7  - 19   fastNLP/loader/dataset_loader.py

@@ -18,29 +18,17 @@ class POSDatasetLoader(DatasetLoader):
      # self.data_set = self.load()

      def load(self):
          assert os.path.exists(self.data_path)
          with open(self.data_path, "r", encoding="utf-8") as f:
+             line = f.read()
+         return line
+
+     def load_lines(self):
+         assert os.path.exists(self.data_path)
+         with open(self.data_path, "r", encoding="utf-8") as f:
              lines = f.readlines()
-         return self.parse(lines)
+         return lines
-
-     @staticmethod
-     def parse(lines):
-         """
-         :param lines: lines from dataset
-         :return: list(list(list())): the three levels of lists are
-                 token, sentence, and dataset
-         """
-         dataset = list()
-         for line in lines:
-             sentence = list()
-             words = line.split(" ")
-             for w in words:
-                 tokens = list()
-                 tokens.append(w.split('/')[0])
-                 tokens.append(w.split('/')[1])
-                 sentence.append(tokens)
-             dataset.append(sentence)
-         return dataset

  class ClassificationDatasetLoader(DatasetLoader):
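
A hedged sketch of how the reshaped loader might be driven (the constructor arguments and file path are assumptions for illustration, not taken from this diff):

    from fastNLP.loader.dataset_loader import POSDatasetLoader

    loader = POSDatasetLoader("pos", "./data/pos_sample.txt")  # hypothetical args
    text = loader.load()          # the whole file as one string
    lines = loader.load_lines()   # the raw lines, one per "word\tlabel" entry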


+ 164  - 0   fastNLP/loader/preprocess.py

@@ -0,0 +1,164 @@
import os
import _pickle

from fastNLP.loader.base_preprocess import BasePreprocess


DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
                          '<reserved-3>',
                          '<reserved-4>']  # dict index = 2~4
# the first word in the vocabulary gets index 5
class POSPreprocess(BasePreprocess):
    """
    This class is used to preprocess POS datasets.
    In these datasets, each line is split by '\t':
    the first column is the word and the second
    column is its label. Sentences are separated
    by an empty line, e.g.:

        Tom label1
        and label2
        Jerry label1
        . label3

        Hello label4
        world label5
        ! label3

    This file contains two sentences, "Tom and Jerry ."
    and "Hello world !". Each word has its own label,
    label1 through label5.
    """
    def __init__(self, data, pickle_path):
        super(POSPreprocess, self).__init__(data, pickle_path)
        self.build_dict()
        self.word2id()
        self.id2word()
        self.class2id()
        self.id2class()
        self.embedding()
        self.data_train()
        self.data_dev()
        self.data_test()
        # ...
    def build_dict(self):
        self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
                          DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
                          DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}
        for w in self.data:
            if len(w) == 0:
                continue
            word = w.split('\t')
            if word[0] not in self.word_dict:
                index = len(self.word_dict)
                self.word_dict[word[0]] = index
            for label in word[1:]:
                if label not in self.label_dict:
                    index = len(self.label_dict)
                    self.label_dict[label] = index
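
    # Illustration (not in the original diff): for the docstring's sample
    # data, build_dict would produce
    #   word_dict  = {'<pad>': 0, '<unk>': 1, '<reserved-2>': 2,
    #                 '<reserved-3>': 3, '<reserved-4>': 4,
    #                 'Tom': 5, 'and': 6, 'Jerry': 7, '.': 8,
    #                 'Hello': 9, 'world': 10, '!': 11}
    #   label_dict = {'label1': 0, 'label2': 1, 'label3': 2,
    #                 'label4': 3, 'label5': 4}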
    def pickle_exist(self, pickle_name):
        """
        :param pickle_name: the filename of the target pickle file
        :return: True if the file exists, else False
        """
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        file_name = self.pickle_path + pickle_name
        if os.path.exists(file_name):
            return True
        else:
            return False
    def word2id(self):
        if self.pickle_exist("word2id.pkl"):
            return
        # nothing will be done if word2id.pkl exists

        file_name = self.pickle_path + "word2id.pkl"
        with open(file_name, "wb") as f:  # binary mode; 'encoding' is invalid here
            _pickle.dump(self.word_dict, f)
    def id2word(self):
        if self.pickle_exist("id2word.pkl"):
            return
        # nothing will be done if id2word.pkl exists

        id2word_dict = {}
        for word in self.word_dict:
            id2word_dict[self.word_dict[word]] = word
        file_name = self.pickle_path + "id2word.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(id2word_dict, f)
    def class2id(self):
        if self.pickle_exist("class2id.pkl"):
            return
        # nothing will be done if class2id.pkl exists

        file_name = self.pickle_path + "class2id.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(self.label_dict, f)
    def id2class(self):
        if self.pickle_exist("id2class.pkl"):
            return
        # nothing will be done if id2class.pkl exists

        id2class_dict = {}
        for label in self.label_dict:
            id2class_dict[self.label_dict[label]] = label
        file_name = self.pickle_path + "id2class.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(id2class_dict, f)
    def embedding(self):
        if self.pickle_exist("embedding.pkl"):
            return
        # nothing will be done if embedding.pkl exists
        # (embedding pickling is not implemented yet in this commit)
    def data_train(self):
        if self.pickle_exist("data_train.pkl"):
            return
        # nothing will be done if data_train.pkl exists

        data_train = []
        sentence = []
        for w in self.data:
            if len(w) == 0:
                # an empty line closes the current sentence
                wid = []
                lid = []
                for i in range(len(sentence)):
                    wid.append(self.word_dict[sentence[i][0]])
                    lid.append(self.label_dict[sentence[i][1]])
                data_train.append((wid, lid))
                sentence = []
                continue  # do not append the empty line itself
            sentence.append(w.split('\t'))
        file_name = self.pickle_path + "data_train.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(data_train, f)
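
    # Illustration (not in the original diff): with the docstring's sample
    # data and a trailing empty line, data_train would hold
    #   [([5, 6, 7, 8], [0, 1, 0, 2]),   # "Tom and Jerry ."
    #    ([9, 10, 11], [3, 4, 2])]       # "Hello world !"
    # Note: a sentence is only flushed on an empty line, so the data must
    # end with one or the final sentence is dropped.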
    def data_dev(self):
        pass

    def data_test(self):
        pass
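
A hedged end-to-end sketch of driving POSPreprocess (the sample lines and cache directory are illustrative; self.data is assumed to be an iterable of raw "word\tlabel" strings, which is how build_dict and data_train read it):

    from fastNLP.loader.preprocess import POSPreprocess

    lines = ["Tom\tlabel1", "and\tlabel2", "Jerry\tlabel1", ".\tlabel3",
             "",  # the empty line terminates the first sentence
             "Hello\tlabel4", "world\tlabel5", "!\tlabel3",
             ""]
    p = POSPreprocess(lines, pickle_path="./pickles")
    # side effects: word2id.pkl, id2word.pkl, class2id.pkl, id2class.pkl,
    # and data_train.pkl are written under ./pickles/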
