diff --git a/fastNLP/loader/base_preprocess.py b/fastNLP/loader/base_preprocess.py
new file mode 100644
index 00000000..988c0bba
--- /dev/null
+++ b/fastNLP/loader/base_preprocess.py
@@ -0,0 +1,35 @@
+
+
+class BasePreprocess(object):
+
+
+    def __init__(self, data, pickle_path):
+        super(BasePreprocess, self).__init__()
+        self.data = data
+        self.pickle_path = pickle_path
+        if not self.pickle_path.endswith('/'):
+            self.pickle_path = self.pickle_path + '/'
+
+    def word2id(self):
+        pass
+
+    def id2word(self):
+        pass
+
+    def class2id(self):
+        pass
+
+    def id2class(self):
+        pass
+
+    def embedding(self):
+        pass
+
+    def data_train(self):
+        pass
+
+    def data_dev(self):
+        pass
+
+    def data_test(self):
+        pass
\ No newline at end of file
diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py
index 0cec50e5..7e0770bd 100644
--- a/fastNLP/loader/dataset_loader.py
+++ b/fastNLP/loader/dataset_loader.py
@@ -18,29 +18,17 @@ class POSDatasetLoader(DatasetLoader):
 
     def load(self):
+        assert os.path.exists(self.data_path)
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            line = f.read()
+        return line
+
+    def load_lines(self):
         assert os.path.exists(self.data_path)
         with open(self.data_path, "r", encoding="utf-8") as f:
             lines = f.readlines()
-        return self.parse(lines)
+        return lines
 
-    @staticmethod
-    def parse(lines):
-        """
-        :param lines: lines from dataset
-        :return: list(list(list())): the three level of lists are
-            token, sentence, and dataset
-        """
-        dataset = list()
-        for line in lines:
-            sentence = list()
-            words = line.split(" ")
-            for w in words:
-                tokens = list()
-                tokens.append(w.split('/')[0])
-                tokens.append(w.split('/')[1])
-                sentence.append(tokens)
-            dataset.append(sentence)
-        return dataset
 
 
 class ClassficationDatasetLoader(DatasetLoader):
     """loader for classfication data sets"""
diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py
new file mode 100644
index 00000000..8e880107
--- /dev/null
+++ b/fastNLP/loader/preprocess.py
@@ -0,0 +1,164 @@
+import _pickle
+import os
+
+from fastNLP.loader.base_preprocess import BasePreprocess
+
+DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
+DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
+DEFAULT_RESERVED_LABEL = ['<reserved-2>',
+                          '<reserved-3>',
+                          '<reserved-4>']  # dict index = 2~4
+# the first real vocabulary entry gets index 5
+
+
+class POSPreprocess(BasePreprocess):
+    """
+    This class is used to preprocess POS datasets.
+    In these datasets, each line is divided by '\t': the first
+    column is the word and the second column is its label.
+    Different sentences are divided by an empty line, e.g.:
+
+        Tom     label1
+        and     label2
+        Jerry   label1
+        .       label3
+
+        Hello   label4
+        world   label5
+        !       label3
+
+    This file contains two sentences, "Tom and Jerry ." and
+    "Hello world !", and each word has its own label, from
+    label1 to label5.
+    """
+
+    def __init__(self, data, pickle_path):
+        super(POSPreprocess, self).__init__(data, pickle_path)
+        self.build_dict()
+        self.word2id()
+        self.id2word()
+        self.class2id()
+        self.id2class()
+        self.embedding()
+        self.data_train()
+        self.data_dev()
+        self.data_test()
+        # ...
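+        # Note: each builder above is idempotent -- every step first
+        # checks pickle_exist() and returns immediately if its pickle
+        # file is already present under pickle_path, so re-running the
+        # preprocessing only rebuilds missing artifacts.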
+
+    def build_dict(self):
+        self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
+                          DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
+                          DEFAULT_RESERVED_LABEL[2]: 4}
+        self.label_dict = {}
+        for w in self.data:
+            if len(w) == 0:
+                continue
+            word = w.split('\t')
+
+            if word[0] not in self.word_dict:
+                index = len(self.word_dict)
+                self.word_dict[word[0]] = index
+
+            for label in word[1:]:
+                if label not in self.label_dict:
+                    index = len(self.label_dict)
+                    self.label_dict[label] = index
+
+    def pickle_exist(self, pickle_name):
+        """
+        :param pickle_name: the filename of the target pickle file
+        :return: True if the pickle file exists else False
+        """
+        if not os.path.exists(self.pickle_path):
+            os.makedirs(self.pickle_path)
+        file_name = self.pickle_path + pickle_name
+        return os.path.exists(file_name)
+
+    def word2id(self):
+        # nothing will be done if word2id.pkl already exists
+        if self.pickle_exist("word2id.pkl"):
+            return
+
+        file_name = self.pickle_path + "word2id.pkl"
+        with open(file_name, "wb") as f:
+            _pickle.dump(self.word_dict, f)
+
+    def id2word(self):
+        # nothing will be done if id2word.pkl already exists
+        if self.pickle_exist("id2word.pkl"):
+            return
+
+        id2word_dict = {}
+        for word in self.word_dict:
+            id2word_dict[self.word_dict[word]] = word
+        file_name = self.pickle_path + "id2word.pkl"
+        with open(file_name, "wb") as f:
+            _pickle.dump(id2word_dict, f)
+
+    def class2id(self):
+        # nothing will be done if class2id.pkl already exists
+        if self.pickle_exist("class2id.pkl"):
+            return
+
+        file_name = self.pickle_path + "class2id.pkl"
+        with open(file_name, "wb") as f:
+            _pickle.dump(self.label_dict, f)
+
+    def id2class(self):
+        # nothing will be done if id2class.pkl already exists
+        if self.pickle_exist("id2class.pkl"):
+            return
+
+        id2class_dict = {}
+        for label in self.label_dict:
+            id2class_dict[self.label_dict[label]] = label
+        file_name = self.pickle_path + "id2class.pkl"
+        with open(file_name, "wb") as f:
+            _pickle.dump(id2class_dict, f)
+
+    def embedding(self):
+        # nothing will be done if embedding.pkl already exists
+        if self.pickle_exist("embedding.pkl"):
+            return
+
+    def data_train(self):
+        # nothing will be done if data_train.pkl already exists
+        if self.pickle_exist("data_train.pkl"):
+            return
+
+        data_train = []
+        sentence = []
+        for w in self.data:
+            if len(w) == 0:
+                # an empty line closes the current sentence
+                wid = []
+                lid = []
+                for i in range(len(sentence)):
+                    wid.append(self.word_dict[sentence[i][0]])
+                    lid.append(self.label_dict[sentence[i][1]])
+                data_train.append((wid, lid))
+                sentence = []
+                continue  # do not treat the empty line as a token
+            sentence.append(w.split('\t'))
+
+        file_name = self.pickle_path + "data_train.pkl"
+        with open(file_name, "wb") as f:
+            _pickle.dump(data_train, f)
+
+    def data_dev(self):
+        pass
+
+    def data_test(self):
+        pass
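For reference, a minimal usage sketch of the new preprocessing pipeline. This is an illustration under assumptions, not part of the patch: the file paths below are hypothetical, and the loader is assumed to take a dataset name and path like the existing DatasetLoader classes.

    from fastNLP.loader.dataset_loader import POSDatasetLoader
    from fastNLP.loader.preprocess import POSPreprocess

    # hypothetical paths -- adjust to the local layout
    loader = POSDatasetLoader("pos", "./data/pos_train.txt")

    # POSPreprocess expects stripped lines: build_dict() and data_train()
    # detect sentence boundaries via len(line) == 0, which never holds if
    # the trailing '\n' from readlines() is kept
    lines = [line.strip() for line in loader.load_lines()]

    # writes word2id.pkl, id2word.pkl, class2id.pkl, id2class.pkl,
    # data_train.pkl, ... into the given pickle directory
    POSPreprocess(lines, "./pickles/")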