| @@ -0,0 +1,99 @@ | |||
| from copy import deepcopy | |||
# Special tokens that occupy the first five vocabulary slots.
DEFAULT_PADDING_LABEL = '<pad>'  # dict index = 0
DEFAULT_UNKNOWN_LABEL = '<unk>'  # dict index = 1
DEFAULT_RESERVED_LABEL = [
    '<reserved-2>',
    '<reserved-3>',
    '<reserved-4>',
]  # dict index = 2~4

# Indices follow the listing order: padding, unknown, then the reserved labels.
DEFAULT_WORD_TO_INDEX = {
    label: index
    for index, label in enumerate(
        [DEFAULT_PADDING_LABEL, DEFAULT_UNKNOWN_LABEL] + DEFAULT_RESERVED_LABEL
    )
}
def isiterable(p_object):
    """Tell whether ``iter()`` accepts ``p_object``.

    Returns True when ``iter(p_object)`` succeeds and False when it raises
    ``TypeError``. Strings count as iterable here; callers that want to treat
    them as scalars must check for ``str`` themselves.
    """
    try:
        # Only the success/failure of the call matters; the iterator itself
        # is discarded.
        iter(p_object)
    except TypeError:
        return False
    return True
class Vocabulary(object):
    """Mapping between words and integer indices.

    A new instance starts from a copy of ``DEFAULT_WORD_TO_INDEX``. Words
    added through :meth:`update` receive the next free index. The reverse
    mapping ``idx2word`` is built on demand by :meth:`to_word` and discarded
    whenever a new word is added.
    """

    def __init__(self):
        # Copy so that instances never share (or mutate) the module default.
        self.word2idx = deepcopy(DEFAULT_WORD_TO_INDEX)
        self.padding_label = DEFAULT_PADDING_LABEL
        self.unknown_label = DEFAULT_UNKNOWN_LABEL
        # Reverse lookup cache; None means "not built or out of date".
        self.idx2word = None

    def __len__(self):
        """Return the number of entries, special labels included."""
        return len(self.word2idx)

    def update(self, word):
        """Add a word, or every word in a (possibly nested) iterable.

        A ``str`` is always treated as a single word. Any other iterable is
        walked recursively. Anything else is stored as a single entry.
        Words that are already present keep their existing index.
        """
        if not isinstance(word, str) and isiterable(word):
            # It's a (nested) collection of words.
            for w in word:
                self.update(w)
        elif word not in self.word2idx:
            # New word: it takes the next free index. Re-assigning an
            # existing word would move it to len(self) and make the next
            # new word collide with it.
            self.word2idx[word] = len(self)
            # The cached reverse mapping no longer covers every word.
            self.idx2word = None

    def __getitem__(self, w):
        """Return the index of ``w``, like a ``to_index(w)`` function.

        If ``w`` is not in the vocabulary, the index of this instance's
        unknown label is returned instead.
        """
        if w in self.word2idx:
            return self.word2idx[w]
        # Use the instance attribute so this agrees with unknown_idx().
        return self.word2idx[self.unknown_label]

    def unknown_idx(self):
        """Return the index of the unknown label."""
        return self.word2idx[self.unknown_label]

    def padding_idx(self):
        """Return the index of the padding label."""
        return self.word2idx[self.padding_label]

    def build_reverse_vocab(self):
        """(Re)build ``idx2word`` from the current ``word2idx``."""
        self.idx2word = {idx: w for w, idx in self.word2idx.items()}

    def to_word(self, idx):
        """Given a word's index, return the word itself.

        Builds the reverse mapping first if it is not available. Raises
        ``KeyError`` when ``idx`` is not assigned to any word.
        """
        if self.idx2word is None:
            self.build_reverse_vocab()
        return self.idx2word[idx]

    def __getstate__(self):
        """Return the state to pickle, without the reverse mapping."""
        state = self.__dict__.copy()
        # No need to pickle idx2word as it can be rebuilt from word2idx.
        # pop() tolerates an instance on which the attribute was never set.
        state.pop('idx2word', None)
        return state

    def __setstate__(self, state):
        """Restore pickled state; the reverse mapping is rebuilt on demand."""
        self.__dict__.update(state)
        self.idx2word = None
if __name__ == '__main__':
    # Demo: fill a vocabulary, round-trip it through pickle and show that
    # lookups in both directions still work afterwards.
    #
    # The public ``pickle`` module uses the C accelerator on its own when it
    # is available, so the private ``_pickle`` module is not imported directly.
    import pickle

    vocab = Vocabulary()
    filename = 'vocab'  # doubles as a sample word and as the pickle file path
    vocab.update(filename)
    vocab.update([filename, ['a'], [['b']], ['c']])
    idx = vocab[filename]
    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))

    with open(filename, 'wb') as f:
        pickle.dump(vocab, f)
    # NOTE: loading is acceptable here only because this script wrote the
    # file just above; never unpickle data from an untrusted source.
    with open(filename, 'rb') as f:
        vocab = pickle.load(f)

    print('{} {}'.format(vocab.to_word(idx), vocab[filename]))
    print(vocab.word2idx)