hummingbird
/
fastNLP

 
			
							import _pickle
import os

# Special tokens that occupy the first five slots of the word dictionary;
# the first real vocabulary word therefore receives index 5.
DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = [  # dict index = 2~4
    '<reserved-2>',
    '<reserved-3>',
    '<reserved-4>',
]


class BasePreprocess(object):
    """
    Abstract base for dataset preprocessors.

    Stores the raw data together with the directory used for pickle
    files. Every processing step is a hook that raises
    NotImplementedError until a subclass overrides it.
    """

    def __init__(self, data, pickle_path):
        """
        :param data: the raw dataset handed to the preprocessor
        :param pickle_path: directory for pickle files; a trailing '/'
            is appended when it is missing
        """
        super(BasePreprocess, self).__init__()
        self.data = data
        # Keep the directory slash-terminated so that file names can be
        # appended to it directly.
        self.pickle_path = (
            pickle_path if pickle_path.endswith('/') else pickle_path + '/'
        )

    def word2id(self):
        """Hook for the word -> id mapping step."""
        raise NotImplementedError

    def id2word(self):
        """Hook for the id -> word mapping step."""
        raise NotImplementedError

    def class2id(self):
        """Hook for the class (label) -> id mapping step."""
        raise NotImplementedError

    def id2class(self):
        """Hook for the id -> class (label) mapping step."""
        raise NotImplementedError

    def embedding(self):
        """Hook for the embedding step."""
        raise NotImplementedError

    def data_train(self):
        """Hook for the training-data step."""
        raise NotImplementedError

    def data_dev(self):
        """Hook for the development-data step."""
        raise NotImplementedError

    def data_test(self):
        """Hook for the test-data step."""
        raise NotImplementedError


class POSPreprocess(BasePreprocess):

    """
        This class is used to preprocess POS datasets.
        The data is a sequence of lines. In each non-empty line the
        columns are separated by a tab character.
        The first column is the word.
        The following column(s) are the labels.
        Different sentences are separated by an empty line.
        e.g:
        Tom label1
        and label2
        Jerry   label1
        .   label3

        Hello   label4
        world   label5
        !   label3
        In this data there are two sentences, "Tom and Jerry ."
    and "Hello world !". Each word has its own label, from label1
    to label5.

        Constructing an instance builds the dictionaries and writes the
    pickle files into pickle_path. A pickle file that already exists is
    left untouched.
    """

    def __init__(self, data, pickle_path):
        """
        :param data: iterable of lines in the format described on the class
        :param pickle_path: directory in which the pickle files are stored
        """
        super(POSPreprocess, self).__init__(data, pickle_path)
        self.word_dict = None
        self.label_dict = None
        self.build_dict()
        self.word2id()
        self.id2word()
        self.class2id()
        self.id2class()
        self.embedding()
        self.data_train()
        self.data_dev()
        self.data_test()

    def build_dict(self):
        """
        Build self.word_dict (word -> id) and self.label_dict (label -> id).

        The word dictionary starts with the five special tokens, so the
        first word found in the data receives index 5. Labels are numbered
        from 0 in order of first appearance.
        """
        self.word_dict = {DEFAULT_PADDING_LABEL: 0, DEFAULT_UNKNOWN_LABEL: 1,
                          DEFAULT_RESERVED_LABEL[0]: 2, DEFAULT_RESERVED_LABEL[1]: 3,
                          DEFAULT_RESERVED_LABEL[2]: 4}
        self.label_dict = {}
        for line in self.data:
            if not line:
                # empty lines only separate sentences
                continue
            fields = line.split('\t')

            if fields[0] not in self.word_dict:
                self.word_dict[fields[0]] = len(self.word_dict)

            # every column after the first one is treated as a label
            for label in fields[1:]:
                if label not in self.label_dict:
                    self.label_dict[label] = len(self.label_dict)

    def pickle_exist(self, pickle_name):
        """
        Check for a pickle file, creating the pickle directory if needed.

        :param pickle_name: the filename of target pickle file
        :return: True if file exists else False
        """
        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation call.
        os.makedirs(self.pickle_path, exist_ok=True)
        return os.path.exists(self.pickle_path + pickle_name)

    def _save_pickle(self, obj, pickle_name):
        """Serialize obj into the file pickle_name inside the pickle directory."""
        # Pickle data is binary: the file is opened in "wb" mode without an
        # encoding argument, which open() rejects for binary modes.
        with open(self.pickle_path + pickle_name, "wb") as f:
            _pickle.dump(obj, f)

    def _encode_sentence(self, sentence):
        """Turn a list of [word, label, ...] rows into a (word ids, label ids) tuple."""
        word_ids = [self.word_dict[fields[0]] for fields in sentence]
        # only the first label column is used for the training data
        label_ids = [self.label_dict[fields[1]] for fields in sentence]
        return word_ids, label_ids

    def word2id(self):
        """Write the word -> id dictionary to word2id.pkl."""
        if self.pickle_exist("word2id.pkl"):
            return
        # nothing will be done if word2id.pkl exists

        self._save_pickle(self.word_dict, "word2id.pkl")

    def id2word(self):
        """Write the id -> word dictionary to id2word.pkl."""
        if self.pickle_exist("id2word.pkl"):
            return
        # nothing will be done if id2word.pkl exists

        id2word_dict = {index: word for word, index in self.word_dict.items()}
        self._save_pickle(id2word_dict, "id2word.pkl")

    def class2id(self):
        """Write the label -> id dictionary to class2id.pkl."""
        if self.pickle_exist("class2id.pkl"):
            return
        # nothing will be done if class2id.pkl exists

        self._save_pickle(self.label_dict, "class2id.pkl")

    def id2class(self):
        """Write the id -> label dictionary to id2class.pkl."""
        if self.pickle_exist("id2class.pkl"):
            return
        # nothing will be done if id2class.pkl exists

        id2class_dict = {index: label for label, index in self.label_dict.items()}
        self._save_pickle(id2class_dict, "id2class.pkl")

    def embedding(self):
        """Placeholder: no embedding is produced yet."""
        if self.pickle_exist("embedding.pkl"):
            return
        # nothing will be done if embedding.pkl exists

    def data_train(self):
        """
        Write the id-encoded sentences to data_train.pkl.

        The pickled object is a list with one (word ids, label ids) tuple
        per sentence.
        """
        if self.pickle_exist("data_train.pkl"):
            return
        # nothing will be done if data_train.pkl exists

        data_train = []
        sentence = []
        for line in self.data:
            if not line:
                # An empty line closes the current sentence. It must not be
                # added to the next one, and repeated or leading empty
                # lines must not produce empty sentences.
                if sentence:
                    data_train.append(self._encode_sentence(sentence))
                    sentence = []
                continue
            sentence.append(line.split('\t'))
        # flush the last sentence when the data does not end with an empty line
        if sentence:
            data_train.append(self._encode_sentence(sentence))

        self._save_pickle(data_train, "data_train.pkl")

    def data_dev(self):
        """Placeholder: no development data is produced yet."""
        pass

    def data_test(self):
        """Placeholder: no test data is produced yet."""
        pass