
Merge pull request #9 from xuyige/master

Add files via upload
tags/v0.1.0
Coet (via GitHub) committed 6 years ago
commit 67557222c0
3 changed files with 206 additions and 19 deletions
  1. fastNLP/loader/base_preprocess.py   +35   -0
  2. fastNLP/loader/dataset_loader.py    +7    -19
  3. fastNLP/loader/preprocess.py        +164  -0

+ 35  - 0   fastNLP/loader/base_preprocess.py

@@ -0,0 +1,35 @@
class BasePreprocess(object):

    def __init__(self, data, pickle_path):
        super(BasePreprocess, self).__init__()
        self.data = data
        self.pickle_path = pickle_path
        if not self.pickle_path.endswith('/'):
            self.pickle_path = self.pickle_path + '/'

    def word2id(self):
        raise NotImplementedError

    def id2word(self):
        raise NotImplementedError

    def class2id(self):
        raise NotImplementedError

    def id2class(self):
        raise NotImplementedError

    def embedding(self):
        raise NotImplementedError

    def data_train(self):
        raise NotImplementedError

    def data_dev(self):
        raise NotImplementedError

    def data_test(self):
        raise NotImplementedError
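
For reference, a minimal sketch of how the path normalization in __init__ behaves (the data argument here is illustrative; the class itself is abstract and every hook method raises NotImplementedError):

    from fastNLP.loader.base_preprocess import BasePreprocess

    p = BasePreprocess(data=["Tom\tlabel1"], pickle_path="./cache")
    print(p.pickle_path)  # "./cache/" -- a trailing '/' is appended if missing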

+ 7  - 19   fastNLP/loader/dataset_loader.py

@@ -18,29 +18,17 @@ class POSDatasetLoader(DatasetLoader):
      # self.data_set = self.load()

      def load(self):
          assert os.path.exists(self.data_path)
          with open(self.data_path, "r", encoding="utf-8") as f:
+             line = f.read()
+         return line
+
+     def load_lines(self):
+         assert os.path.exists(self.data_path)
+         with open(self.data_path, "r", encoding="utf-8") as f:
              lines = f.readlines()
-         return self.parse(lines)
+         return lines
-
-     @staticmethod
-     def parse(lines):
-         """
-         :param lines: lines from dataset
-         :return: list(list(list())): the three levels of lists are
-                 token, sentence, and dataset
-         """
-         dataset = list()
-         for line in lines:
-             sentence = list()
-             words = line.split(" ")
-             for w in words:
-                 tokens = list()
-                 tokens.append(w.split('/')[0])
-                 tokens.append(w.split('/')[1])
-                 sentence.append(tokens)
-             dataset.append(sentence)
-         return dataset

  class ClassificationDatasetLoader(DatasetLoader):
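
A hedged sketch of how the reshaped loader might be driven (the constructor arguments and file path are assumptions for illustration, not taken from this diff):

    from fastNLP.loader.dataset_loader import POSDatasetLoader

    loader = POSDatasetLoader("pos", "./data/pos_sample.txt")  # hypothetical args
    text = loader.load()          # the whole file as one string
    lines = loader.load_lines()   # the raw lines, one per "word\tlabel" entry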


+ 164  - 0   fastNLP/loader/preprocess.py

@@ -0,0 +1,164 @@
import os
import _pickle

from fastNLP.loader.base_preprocess import BasePreprocess


DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = ['<reserved-2>',
                          '<reserved-3>',
                          '<reserved-4>']  # dict index = 2~4
# the first word in the vocabulary gets index 5
class POSPreprocess(BasePreprocess):
    """
    This class is used to preprocess POS datasets.
    In these datasets, each line is split by '\t':
    the first column is the word and the second
    column is its label. Sentences are separated
    by an empty line, e.g.:

        Tom label1
        and label2
        Jerry label1
        . label3

        Hello label4
        world label5
        ! label3

    This file contains two sentences, "Tom and Jerry ."
    and "Hello world !". Each word has its own label,
    label1 through label5.
    """
    def __init__(self, data, pickle_path):
        super(POSPreprocess, self).__init__(data, pickle_path)
        self.build_dict()
        self.word2id()
        self.id2word()
        self.class2id()
        self.id2class()
        self.embedding()
        self.data_train()
        self.data_dev()
        self.data_test()
        # ...
    def build_dict(self):
        self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
                          DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
                          DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}
        for w in self.data:
            if len(w) == 0:
                continue
            word = w.split('\t')
            if word[0] not in self.word_dict:
                index = len(self.word_dict)
                self.word_dict[word[0]] = index
            for label in word[1:]:
                if label not in self.label_dict:
                    index = len(self.label_dict)
                    self.label_dict[label] = index
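
    # Illustration (not in the original diff): for the docstring's sample
    # data, build_dict would produce
    #   word_dict  = {'<pad>': 0, '<unk>': 1, '<reserved-2>': 2,
    #                 '<reserved-3>': 3, '<reserved-4>': 4,
    #                 'Tom': 5, 'and': 6, 'Jerry': 7, '.': 8,
    #                 'Hello': 9, 'world': 10, '!': 11}
    #   label_dict = {'label1': 0, 'label2': 1, 'label3': 2,
    #                 'label4': 3, 'label5': 4}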
    def pickle_exist(self, pickle_name):
        """
        :param pickle_name: the filename of the target pickle file
        :return: True if the file exists, else False
        """
        if not os.path.exists(self.pickle_path):
            os.makedirs(self.pickle_path)
        file_name = self.pickle_path + pickle_name
        if os.path.exists(file_name):
            return True
        else:
            return False
    def word2id(self):
        if self.pickle_exist("word2id.pkl"):
            return
        # nothing will be done if word2id.pkl exists

        file_name = self.pickle_path + "word2id.pkl"
        with open(file_name, "wb") as f:  # binary mode; 'encoding' is invalid here
            _pickle.dump(self.word_dict, f)
    def id2word(self):
        if self.pickle_exist("id2word.pkl"):
            return
        # nothing will be done if id2word.pkl exists

        id2word_dict = {}
        for word in self.word_dict:
            id2word_dict[self.word_dict[word]] = word
        file_name = self.pickle_path + "id2word.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(id2word_dict, f)
    def class2id(self):
        if self.pickle_exist("class2id.pkl"):
            return
        # nothing will be done if class2id.pkl exists

        file_name = self.pickle_path + "class2id.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(self.label_dict, f)
    def id2class(self):
        if self.pickle_exist("id2class.pkl"):
            return
        # nothing will be done if id2class.pkl exists

        id2class_dict = {}
        for label in self.label_dict:
            id2class_dict[self.label_dict[label]] = label
        file_name = self.pickle_path + "id2class.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(id2class_dict, f)
    def embedding(self):
        if self.pickle_exist("embedding.pkl"):
            return
        # nothing will be done if embedding.pkl exists
        # (embedding pickling is not implemented yet in this commit)
    def data_train(self):
        if self.pickle_exist("data_train.pkl"):
            return
        # nothing will be done if data_train.pkl exists

        data_train = []
        sentence = []
        for w in self.data:
            if len(w) == 0:
                # an empty line closes the current sentence
                wid = []
                lid = []
                for i in range(len(sentence)):
                    wid.append(self.word_dict[sentence[i][0]])
                    lid.append(self.label_dict[sentence[i][1]])
                data_train.append((wid, lid))
                sentence = []
                continue  # do not append the empty line itself
            sentence.append(w.split('\t'))
        file_name = self.pickle_path + "data_train.pkl"
        with open(file_name, "wb") as f:
            _pickle.dump(data_train, f)
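
    # Illustration (not in the original diff): with the docstring's sample
    # data and a trailing empty line, data_train would hold
    #   [([5, 6, 7, 8], [0, 1, 0, 2]),   # "Tom and Jerry ."
    #    ([9, 10, 11], [3, 4, 2])]       # "Hello world !"
    # Note: a sentence is only flushed on an empty line, so the data must
    # end with one or the final sentence is dropped.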
    def data_dev(self):
        pass

    def data_test(self):
        pass
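
A hedged end-to-end sketch of driving POSPreprocess (the sample lines and cache directory are illustrative; self.data is assumed to be an iterable of raw "word\tlabel" strings, which is how build_dict and data_train read it):

    from fastNLP.loader.preprocess import POSPreprocess

    lines = ["Tom\tlabel1", "and\tlabel2", "Jerry\tlabel1", ".\tlabel3",
             "",  # the empty line terminates the first sentence
             "Hello\tlabel4", "world\tlabel5", "!\tlabel3",
             ""]
    p = POSPreprocess(lines, pickle_path="./pickles")
    # side effects: word2id.pkl, id2word.pkl, class2id.pkl, id2class.pkl,
    # and data_train.pkl are written under ./pickles/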
