CNN for sentence classification (tag v0.1.0)
.gitignore
@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

# custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp
README.md
@@ -0,0 +1,77 @@
## Introduction
This is a PyTorch implementation of the paper [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper

(by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!)
## Requirements
* python 3.6
* pytorch > 0.1
* numpy
* gensim
## Run
STEP 1

Install the required packages, e.g. gensim (the other dependencies are installed the same way):
```
pip install gensim
```
STEP 2

Download the MR dataset and the word2vec resources:
* MR dataset: you can download it from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: you can download the pre-trained vectors from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

Since this file is larger than 1.5 GB, it is not included in the repository. After downloading it, remember to modify the path in the function `def word_embeddings(path='./GoogleNews-vectors-negative300.bin/')`.
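For reference, the pre-trained vectors are loaded with gensim's `KeyedVectors`. A minimal sketch follows; the path below is just the default assumed by this repository, so point it at wherever you unpacked the file:
```
# Minimal sketch: load the GoogleNews word2vec binary with gensim.
# The path is only the default assumed in dataset.py; adjust it to your own location.
import gensim

path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
print('Loading {} ... (this can take a few minutes)'.format(path))
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

print(model['good'].shape)  # (300,) -- every word maps to a 300-dimensional vector
```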
STEP 3

Train the model:
```
python train.py
```
The training progress is printed to the screen, for example:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %
......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```
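The best weights are saved to `models/cnn.pkl`. If you want to reuse them later, here is a minimal sketch for reloading them, assuming the dataset (and therefore the vocabulary) is rebuilt the same way as in `train.py`:
```
# Minimal sketch: reload the saved weights for evaluation.
# Rebuilding MRDataset (and reloading word2vec) takes a while, just as in training.
import torch
import dataset as dst
from model import CNN_text

data = dst.MRDataset()
cnn = CNN_text(embed_num=len(data.word2id()), pretrained_embeddings=data.word_embeddings())
cnn.load_state_dict(torch.load('models/cnn.pkl'))
cnn.eval()  # switch off dropout before running inference
```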
## Hyperparameters
According to the paper and my experiments, I set:

|Epoch|Kernel Size|Dropout|Learning Rate|Batch Size|
|---|---|---|---|---|
|20|(h, 300) with 100 feature maps|0.5|0.0001|50|

where h = [3, 4, 5].

If the test accuracy does not improve after an epoch, the learning rate is multiplied by 0.8, as sketched below.
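In code this corresponds to scaling the learning rate of the Adam optimizer's parameter groups in place. A minimal sketch (the `nn.Linear` model here is just a stand-in so the snippet runs on its own):
```
# Minimal sketch: multiply the learning rate by 0.8 when the test accuracy stops improving.
import torch
import torch.nn as nn

model = nn.Linear(300, 2)  # stand-in for CNN_text, only used to build an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def decay_learning_rate(optimizer, factor=0.8):
    # scale the learning rate of every parameter group in place
    for param_group in optimizer.param_groups:
        param_group['lr'] *= factor

decay_learning_rate(optimizer)
print(optimizer.param_groups[0]['lr'])  # 8e-05
```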
## Result
I only tried one dataset: MR. (The other datasets from the paper, such as SST-1, SST-2, TREC, CR, and MPQA, are not covered.)

The paper describes four models: CNN-rand, CNN-static, CNN-non-static, and CNN-multichannel. I implemented CNN-non-static: a model initialized with pre-trained word2vec vectors in which all words, including the unknown ones that are randomly initialized, are fine-tuned for each task. Among the four models it has nearly the best performance and is the most difficult to implement. The embedding setup is sketched after the table.

|Dataset|Class Size|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|
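For the non-static setting, the pre-trained vectors are copied into an `nn.Embedding` layer that remains trainable, so they are updated by backpropagation together with the rest of the network. A minimal sketch mirroring `model.py`; the vocabulary size and the random matrix are placeholders for the real word2vec weights:
```
# Minimal sketch of the CNN-non-static embedding setup:
# copy pre-trained vectors into a trainable nn.Embedding layer.
import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 300  # placeholder sizes, not the real vocabulary
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, embed_dim))  # stand-in for word2vec weights

embedding = nn.Embedding(vocab_size, embed_dim)
embedding.weight.data.copy_(torch.from_numpy(pretrained))

# requires_grad is True by default, so the copied vectors are fine-tuned during training
print(embedding.weight.requires_grad)  # True
```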
## References
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py
dataset.py
@@ -0,0 +1,149 @@
import re
import sys
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import os
import pickle
import codecs
from gensim import corpora
import gensim

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def pad_sentences(sentence, padding_word=" <PAD/>"):
    # pad every sentence to a fixed length of 64 tokens
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence

# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from the raw files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; clean_str(); pad to fixed length
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a plain dict such as {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1]
            ) for sent in self.MRDataset_frame]

    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # build the embedding matrix from the pre-trained Google News vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        # words not found in word2vec keep a random initialization in [-0.25, 0.25]
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word in word_dict:
            word_id = word_dict[word]
            if word in model.wv.vocab:  # for gensim < 4.0; newer gensim uses model.key_to_index
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = dict([val, key] for key, val in self.word2id_dict.items())
        return id2word_dict

class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
model.py
@@ -0,0 +1,43 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import dataset


class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300,
                 dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
        super(CNN_text, self).__init__()
        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        # the network structure
        # Conv2d: input (N, C, H, W), output (50, 100, 62, 1) for K=3
        self.conv1 = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        # one kernel_num-dim feature vector per kernel size -> 3 * 100 = 300 inputs
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, 2)

    def max_pooling(self, x, conv):
        # helper (unused by forward, which inlines the same ops):
        # convolution + ReLU + max-over-time pooling for one kernel size
        x = F.relu(conv(x)).squeeze(3)             # (N, C, L), e.g. (50, 100, 62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # (50, 100, 1) -> (50, 100)
        return x

    def forward(self, x):
        x = self.embedding(x)  # (N, H, W) = (50, 64, 300)
        x = x.unsqueeze(1)     # (N, C, H, W) = (50, 1, 64, 300)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1]  # [(50,100,62), (50,100,61), (50,100,60)]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(50,100), (50,100), (50,100)]
        x = torch.cat(x, 1)    # (50, 300)
        x = self.dropout(x)
        x = self.fc1(x)
        return x
train.py
@@ -0,0 +1,102 @@
import os
import torch
import torch.nn as nn
import dataset as dst
from model import CNN_text
from torch.autograd import Variable

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True

# split the dataset into 90% train / 10% test
dataset = dst.MRDataset()
length = len(dataset)
train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]
train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

# cnn
cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

# train and test
best_acc = None
for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))

    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % (acc))

    if best_acc is None or acc > best_acc:
        best_acc = acc
        if os.path.exists("models") is False:
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # decay the learning rate when the accuracy stops improving,
        # and push the new value into the optimizer's parameter groups
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")