CNN for sentence classification (tag v0.1.0)
.gitignore
@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

# custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp
README.md
@@ -0,0 +1,77 @@
## Introduction
This is a PyTorch implementation of the paper [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper

(by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!)
## Requirements
* python 3.6
* pytorch > 0.1
* numpy
* gensim
## Run
STEP 1

Install the required packages, e.g. gensim (the other dependencies are installed the same way):
```
pip install gensim
```
STEP 2

Download the MR dataset and the word2vec resources:
* MR dataset: you can download it from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: you can download the pre-trained vectors from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

Since this file is larger than 1.5 GB, it is not included in the repository. After downloading it, remember to modify the path in the function `def word_embeddings(path='./GoogleNews-vectors-negative300.bin/')`.
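For reference, the pre-trained vectors are loaded with gensim's `KeyedVectors`. A minimal sketch follows; the path below is just the default assumed by this repository, so point it at wherever you unpacked the file:
```
# Minimal sketch: load the GoogleNews word2vec binary with gensim.
# The path is only the default assumed in dataset.py; adjust it to your own location.
import gensim

path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
print('Loading {} ... (this can take a few minutes)'.format(path))
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

print(model['good'].shape)  # (300,) -- every word maps to a 300-dimensional vector
```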
STEP 3

Train the model:
```
python train.py
```
The training progress is printed to the screen, for example:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %
......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```
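The best weights are saved to `models/cnn.pkl`. If you want to reuse them later, here is a minimal sketch for reloading them, assuming the dataset (and therefore the vocabulary) is rebuilt the same way as in `train.py`:
```
# Minimal sketch: reload the saved weights for evaluation.
# Rebuilding MRDataset (and reloading word2vec) takes a while, just as in training.
import torch
import dataset as dst
from model import CNN_text

data = dst.MRDataset()
cnn = CNN_text(embed_num=len(data.word2id()), pretrained_embeddings=data.word_embeddings())
cnn.load_state_dict(torch.load('models/cnn.pkl'))
cnn.eval()  # switch off dropout before running inference
```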
## Hyperparameters
According to the paper and my experiments, I set:

|Epoch|Kernel Size|Dropout|Learning Rate|Batch Size|
|---|---|---|---|---|
|20|(h, 300) with 100 feature maps|0.5|0.0001|50|

where h = [3, 4, 5].

If the test accuracy does not improve after an epoch, the learning rate is multiplied by 0.8, as sketched below.
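In code this corresponds to scaling the learning rate of the Adam optimizer's parameter groups in place. A minimal sketch (the `nn.Linear` model here is just a stand-in so the snippet runs on its own):
```
# Minimal sketch: multiply the learning rate by 0.8 when the test accuracy stops improving.
import torch
import torch.nn as nn

model = nn.Linear(300, 2)  # stand-in for CNN_text, only used to build an optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

def decay_learning_rate(optimizer, factor=0.8):
    # scale the learning rate of every parameter group in place
    for param_group in optimizer.param_groups:
        param_group['lr'] *= factor

decay_learning_rate(optimizer)
print(optimizer.param_groups[0]['lr'])  # 8e-05
```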
## Result
I only tried one dataset: MR. (The other datasets from the paper, such as SST-1, SST-2, TREC, CR, and MPQA, are not covered.)

The paper describes four models: CNN-rand, CNN-static, CNN-non-static, and CNN-multichannel. I implemented CNN-non-static: a model initialized with pre-trained word2vec vectors in which all words, including the unknown ones that are randomly initialized, are fine-tuned for each task. Among the four models it has nearly the best performance and is the most difficult to implement. The embedding setup is sketched after the table.

|Dataset|Class Size|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|
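For the non-static setting, the pre-trained vectors are copied into an `nn.Embedding` layer that remains trainable, so they are updated by backpropagation together with the rest of the network. A minimal sketch mirroring `model.py`; the vocabulary size and the random matrix are placeholders for the real word2vec weights:
```
# Minimal sketch of the CNN-non-static embedding setup:
# copy pre-trained vectors into a trainable nn.Embedding layer.
import numpy as np
import torch
import torch.nn as nn

vocab_size, embed_dim = 1000, 300  # placeholder sizes, not the real vocabulary
pretrained = np.random.uniform(-0.25, 0.25, (vocab_size, embed_dim))  # stand-in for word2vec weights

embedding = nn.Embedding(vocab_size, embed_dim)
embedding.weight.data.copy_(torch.from_numpy(pretrained))

# requires_grad is True by default, so the copied vectors are fine-tuned during training
print(embedding.weight.requires_grad)  # True
```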
## References
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py
dataset.py
@@ -0,0 +1,149 @@
import re
import sys
import itertools
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import os
import pickle
import codecs
from gensim import corpora
import gensim

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()

def pad_sentences(sentence, padding_word=" <PAD/>"):
    # pad every sentence to a fixed length of 64 tokens
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence

# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from the raw files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; clean_str(); pad to fixed length
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a plain dict such as {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1]
            ) for sent in self.MRDataset_frame]

    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # build the embedding matrix from the pre-trained Google News vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        # words not found in word2vec keep a random initialization in [-0.25, 0.25]
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word in word_dict:
            word_id = word_dict[word]
            if word in model.wv.vocab:  # for gensim < 4.0; newer gensim uses model.key_to_index
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = dict([val, key] for key, val in self.word2id_dict.items())
        return id2word_dict

class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
model.py
@@ -0,0 +1,43 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import dataset


class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300,
                 dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
        super(CNN_text, self).__init__()
        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        # the network structure
        # Conv2d: input (N, C, H, W), output (50, 100, 62, 1) for K=3
        self.conv1 = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        # one kernel_num-dim feature vector per kernel size -> 3 * 100 = 300 inputs
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, 2)

    def max_pooling(self, x, conv):
        # helper (unused by forward, which inlines the same ops):
        # convolution + ReLU + max-over-time pooling for one kernel size
        x = F.relu(conv(x)).squeeze(3)             # (N, C, L), e.g. (50, 100, 62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)  # (50, 100, 1) -> (50, 100)
        return x

    def forward(self, x):
        x = self.embedding(x)  # (N, H, W) = (50, 64, 300)
        x = x.unsqueeze(1)     # (N, C, H, W) = (50, 1, 64, 300)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1]  # [(50,100,62), (50,100,61), (50,100,60)]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(50,100), (50,100), (50,100)]
        x = torch.cat(x, 1)    # (50, 300)
        x = self.dropout(x)
        x = self.fc1(x)
        return x
train.py
@@ -0,0 +1,102 @@
import os
import torch
import torch.nn as nn
import dataset as dst
from model import CNN_text
from torch.autograd import Variable

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True

# split the dataset into 90% train / 10% test
dataset = dst.MRDataset()
length = len(dataset)
train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]
train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

# cnn
cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

# train and test
best_acc = None
for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))

    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % (acc))

    if best_acc is None or acc > best_acc:
        best_acc = acc
        if os.path.exists("models") is False:
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # decay the learning rate when the accuracy stops improving,
        # and push the new value into the optimizer's parameter groups
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")