|
|
@@ -0,0 +1,99 @@ |
|
|
|
from copy import deepcopy |
|
|
|
|
|
|
|
# Special tokens present in every vocabulary. Their order in the sequence
# used to build DEFAULT_WORD_TO_INDEX below determines their index.
DEFAULT_PADDING_LABEL = '<pad>'  # index 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # index 1
DEFAULT_RESERVED_LABEL = [
    '<reserved-2>',
    '<reserved-3>',
    '<reserved-4>',
]  # indices 2 through 4

# Initial word -> index table: padding, unknown, then the reserved slots.
DEFAULT_WORD_TO_INDEX = {
    label: index
    for index, label in enumerate(
        [DEFAULT_PADDING_LABEL, DEFAULT_UNKNOWN_LABEL] + DEFAULT_RESERVED_LABEL
    )
}
|
|
|
|
|
|
|
def isiterable(p_object):
    """Return True if ``iter(p_object)`` succeeds, False otherwise.

    Only ``TypeError`` is treated as "not iterable"; any other exception
    raised while obtaining the iterator propagates to the caller.
    """
    try:
        # Only the success or failure of the call matters; the iterator
        # itself is discarded.
        iter(p_object)
    except TypeError:
        return False
    return True
|
|
|
|
|
|
|
class Vocabulary(object):
    """Two-way mapping between words and integer indices.

    A new instance starts from its own copy of ``DEFAULT_WORD_TO_INDEX``.
    Words added afterwards receive the next free index (the current size
    of the vocabulary). The reverse mapping ``idx2word`` is built lazily
    and discarded whenever ``word2idx`` changes.
    """

    def __init__(self):
        # Copy so that instances never share or mutate the module default.
        self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
        self.padding_label = DEFAULT_PADDING_LABEL
        self.unknown_label = DEFAULT_UNKNOWN_LABEL
        # index -> word cache; None means "not built yet / stale".
        self.idx2word = None

    def __len__(self):
        """Return the number of words currently in the vocabulary."""
        return len(self.word2idx)

    def update(self, word):
        """Add a word, or every word of an arbitrarily nested iterable.

        A ``str`` is always treated as a single word. Any other iterable
        is walked recursively and each element is added in turn. A word
        that is already present keeps its existing index.
        """
        if not isinstance(word, str) and isiterable(word):
            # A (possibly nested) collection of words.
            for w in word:
                self.update(w)
        elif word not in self.word2idx:
            # Only unseen words get a new index. Re-assigning ``len(self)``
            # to an existing word would leave a hole at its old index and
            # hand the same index to the next new word.
            self.word2idx[word] = len(self)
            # The reverse mapping no longer matches; rebuild it on demand.
            self.idx2word = None

    def __getitem__(self, w):
        """Return the index of ``w``, or the unknown-label index if absent.

        Behaves like a ``to_index(w)`` lookup.
        """
        try:
            return self.word2idx[w]
        except KeyError:
            # Use the instance's unknown label so this agrees with
            # unknown_idx().
            return self.unknown_idx()

    def unknown_idx(self):
        """Return the index assigned to ``self.unknown_label``."""
        return self.word2idx[self.unknown_label]

    def padding_idx(self):
        """Return the index assigned to ``self.padding_label``."""
        return self.word2idx[self.padding_label]

    def build_reverse_vocab(self):
        """Rebuild ``idx2word`` (index -> word) from ``word2idx``."""
        self.idx2word = {idx: w for w, idx in self.word2idx.items()}

    def to_word(self, idx):
        """Return the word stored at index ``idx``.

        Builds the reverse mapping first if it is missing. Raises
        ``KeyError`` when no word has that index.
        """
        if self.idx2word is None:
            self.build_reverse_vocab()
        return self.idx2word[idx]

    def __getstate__(self):
        """Return the instance state to pickle, without ``idx2word``."""
        state = self.__dict__.copy()
        # idx2word can be reconstructed from word2idx, so it is not stored.
        # pop() with a default tolerates an instance that lacks it.
        state.pop('idx2word', None)
        return state

    def __setstate__(self, state):
        """Restore pickled state; ``idx2word`` is rebuilt lazily later."""
        self.__dict__.update(state)
        self.idx2word = None
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: add a few words (flat and nested), look one up in
    # both directions, then pickle the vocabulary to a file named 'vocab'
    # in the current directory, load it back and repeat the lookup.
    #
    # Use the public ``pickle`` module; ``_pickle`` is a private
    # implementation module and is not guaranteed to exist everywhere.
    import pickle

    vocab = Vocabulary()
    filename = 'vocab'  # doubles as a sample word and as the pickle path
    vocab.update(filename)
    vocab.update([filename, ['a'], [['b']], ['c']])
    idx = vocab[filename]
    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))

    with open(filename, 'wb') as f:
        pickle.dump(vocab, f)
    with open(filename, 'rb') as f:
        # The file was written just above by this script; do not reuse
        # this pattern for data from an untrusted source.
        vocab = pickle.load(f)

    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))
    print(vocab.word2idx)
|
|
|
|