- import codecs
- import random
- import re
- import gensim
- import numpy as np
- from gensim import corpora
- from torch.utils.data import Dataset
# Ordered (compiled pattern, replacement) rules applied by clean_str.
# Order matters: the character filter runs first, the clitic splits run
# before the punctuation spacing, and whitespace is collapsed last.
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        # Plain replacements: the previous " \( " style strings were invalid
        # escape sequences and left a literal backslash in the output tokens.
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
)


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Every character outside ``A-Za-z0-9(),!?'`` and the backtick is replaced
    by a space, the clitics ``'s 've n't 're 'd 'll`` are split off with a
    leading space, the punctuation marks ``, ! ( ) ?`` are surrounded by
    spaces, and runs of whitespace are collapsed to a single space.

    Args:
        string: raw text of one sentence.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
        Case is left unchanged.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip()
def pad_sentences(sentence, padding_word=" <PAD/>", sequence_length=64):
    """Right-pad a whitespace-tokenised sentence up to a fixed token count.

    Args:
        sentence: sentence text; tokens are whatever ``str.split()`` yields.
        padding_word: text appended once per missing token. The default
            carries its own leading space so the result splits cleanly.
        sequence_length: target number of tokens (previously fixed at 64).

    Returns:
        ``sentence`` followed by one ``padding_word`` per missing token.
        A sentence that already has ``sequence_length`` tokens or more is
        returned unchanged; it is NOT truncated.
    """
    missing = sequence_length - len(sentence.split())
    # Multiplying a str by a non-positive count yields "", which is what lets
    # over-long sentences pass through untouched.
    return sentence + padding_word * missing
- # data loader
class MRDataset(Dataset):
    """Sentence-polarity dataset whose sentences are encoded as word-id arrays.

    Two text files (one sentence per line) are read, cleaned with
    ``clean_str`` and padded with ``pad_sentences``. A gensim ``Dictionary``
    built over all padded sentences provides the word -> id mapping.

    Attributes set by ``__init__``:
        examples: padded sentence strings, positives first, then negatives.
        sentences_texts: ``examples`` split into token lists.
        word2id_dict: token -> integer id mapping (``Dictionary.token2id``).
        lables: labels aligned with ``examples`` (1 positive, 0 negative).
            The spelling is kept because the attribute name is public.
        MRDataset_frame: shuffled list of ``(sentence string, label)`` pairs.
        MRDataset_wordid: ``(int64 id array, label)`` pairs in the same
            shuffled order as ``MRDataset_frame``.
    """

    def __init__(self,
                 pos_path="./rt-polaritydata/rt-polarity.pos",
                 neg_path="./rt-polaritydata/rt-polarity.neg"):
        """Load, clean, pad, label, shuffle and id-encode the sentences.

        Args:
            pos_path: file with one positive sentence per line.
            neg_path: file with one negative sentence per line.
        """
        # Load positive and negative sentences from the files.
        positive_examples = self._load_examples(pos_path)
        negative_examples = self._load_examples(neg_path)
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # Word dictionary, e.g. {"human": 0, "a": 1, ...}.
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id

        # Set labels: positive is 1, negative is 0.
        positive_labels = [1] * len(positive_examples)
        negative_labels = [0] * len(negative_examples)
        self.lables = positive_labels + negative_labels

        # Shuffle sentences together with their labels. This uses the
        # module-level ``random`` state, so seed it beforehand if a
        # reproducible order is required.
        examples_lables = list(zip(self.examples, self.lables))
        random.shuffle(examples_lables)
        self.MRDataset_frame = examples_lables

        # Transform every word to its id.
        word2id = self.word2id_dict
        self.MRDataset_wordid = [
            (np.array([word2id[word] for word in text.split()], dtype=np.int64), label)
            for text, label in self.MRDataset_frame
        ]

    @staticmethod
    def _load_examples(path):
        """Read one sentence per line from ``path``, then clean and pad each."""
        # codecs.open is kept (rather than the builtin open) so line splitting
        # behaves exactly as before.
        with codecs.open(path, encoding='ISO-8859-1') as f:
            lines = f.readlines()
        # strip() drops the trailing newline before cleaning and padding.
        return [pad_sentences(clean_str(line.strip())) for line in lines]

    def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
        """Build an embedding matrix for the vocabulary from word2vec vectors.

        Args:
            path: binary word2vec file readable by gensim's
                ``KeyedVectors.load_word2vec_format``.

        Returns:
            A float array of shape ``(len(word2id_dict), vector_size)``. The
            row for a word found in the pretrained model holds its pretrained
            vector; every other row keeps its uniform(-0.25, 0.25) init.
        """
        # Announce BEFORE loading: the load is the slow part.
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        # Take the width from the loaded vectors instead of assuming 300.
        embedding_weights = np.random.uniform(
            -0.25, 0.25, (len(self.word2id_dict), model.vector_size))
        for word, word_id in self.word2id_dict.items():
            # ``in`` on KeyedVectors replaces ``model.wv.vocab``, which relies
            # on attributes that gensim deprecated and later removed.
            if word in model:
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        """Return the number of (sentence, label) samples."""
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        """Return the ``(word-id array, label)`` pair at position ``idx``."""
        return self.MRDataset_wordid[idx]

    def getsent(self, idx):
        """Return only the word-id array of sample ``idx``."""
        return self.MRDataset_wordid[idx][0]

    def getlabel(self, idx):
        """Return only the label of sample ``idx``."""
        return self.MRDataset_wordid[idx][1]

    def word2id(self):
        """Return the word -> id dictionary (the stored object, not a copy)."""
        return self.word2id_dict

    def id2word(self):
        """Return a new id -> word dictionary, the inverse of ``word2id_dict``."""
        return {word_id: word for word, word_id in self.word2id_dict.items()}
class train_set(Dataset):
    """Thin ``Dataset`` view over an already prepared sequence of training samples."""

    def __init__(self, samples):
        """Keep a reference to ``samples``; nothing is copied or transformed."""
        self.train_frame = samples

    def __len__(self):
        """Return how many samples are held."""
        frame = self.train_frame
        return len(frame)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        frame = self.train_frame
        item = frame[idx]
        return item
class test_set(Dataset):
    """Thin ``Dataset`` view over an already prepared sequence of test samples."""

    def __init__(self, samples):
        """Keep a reference to ``samples``; nothing is copied or transformed."""
        self.test_frame = samples

    def __len__(self):
        """Return how many samples are held."""
        frame = self.test_frame
        return len(frame)

    def __getitem__(self, idx):
        """Return the sample stored at position ``idx``."""
        frame = self.test_frame
        item = frame[idx]
        return item