
restructure files & add "modules" directory & add CRF.py

tags/v0.1.0
FengZiYjun 7 years ago
parent
commit
32652407df
59 changed files with 219 additions and 62459 deletions
  1. action/README.md (+0, -8)
  2. action/action.py (+0, -46)
  3. action/tester.py (+0, -87)
  4. action/trainer.py (+0, -93)
  5. fastNLP/__init__.py (+0, -0)
  6. fastNLP/modules/CRF.py (+174, -0)
  7. fastNLP/modules/__init__.py (+0, -0)
  8. fastNLP/modules/attention/__init__.py (+0, -0)
  9. fastNLP/modules/attention/attention.py (+19, -0)
  10. fastNLP/modules/attention/linear_attention.py (+9, -0)
  11. fastNLP/modules/convolution/__init__.py (+0, -0)
  12. fastNLP/modules/recurrent/__init__.py (+0, -0)
  13. fastNLP/modules/utils.py (+9, -0)
  14. fastNLP/reproduction/CNN-sentence_classification/__init__.py (+0, -0)
  15. fastNLP/reproduction/Char-aware_NLM/__init__.py (+0, -0)
  16. fastNLP/reproduction/HAN-document_classification/__init__.py (+0, -0)
  17. fastNLP/reproduction/__init__.py (+0, -0)
  18. fastNLP/saver/__init__.py (+0, -0)
  19. loader/base_loader.py (+0, -37)
  20. loader/config_loader.py (+0, -13)
  21. loader/dataset_loader.py (+0, -47)
  22. loader/embed_loader.py (+0, -8)
  23. model/base_model.py (+0, -158)
  24. model/char_language_model.py (+0, -356)
  25. model/word_seg_model.py (+0, -135)
  26. reproduction/CNN-sentence_classification/.gitignore (+0, -110)
  27. reproduction/CNN-sentence_classification/README.md (+0, -77)
  28. reproduction/CNN-sentence_classification/dataset.py (+0, -142)
  29. reproduction/CNN-sentence_classification/model.py (+0, -43)
  30. reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg (+0, -5331)
  31. reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos (+0, -5331)
  32. reproduction/CNN-sentence_classification/train.py (+0, -97)
  33. reproduction/Char-aware_NLM/LICENSE (+0, -21)
  34. reproduction/Char-aware_NLM/README.md (+0, -40)
  35. reproduction/Char-aware_NLM/model.py (+0, -148)
  36. reproduction/Char-aware_NLM/test.py (+0, -123)
  37. reproduction/Char-aware_NLM/test.txt (+0, -3761)
  38. reproduction/Char-aware_NLM/train.py (+0, -275)
  39. reproduction/Char-aware_NLM/train.txt (+0, -42068)
  40. reproduction/Char-aware_NLM/utilities.py (+0, -86)
  41. reproduction/Char-aware_NLM/valid.txt (+0, -3370)
  42. reproduction/HAN-document_classification/README.md (+0, -36)
  43. reproduction/HAN-document_classification/data/test_samples.pkl (BIN)
  44. reproduction/HAN-document_classification/data/train_samples.pkl (BIN)
  45. reproduction/HAN-document_classification/data/yelp.word2vec (BIN)
  46. reproduction/HAN-document_classification/evaluate.py (+0, -44)
  47. reproduction/HAN-document_classification/model.py (+0, -110)
  48. reproduction/HAN-document_classification/preprocess.py (+0, -51)
  49. reproduction/HAN-document_classification/train.py (+0, -167)
  50. saver/base_saver.py (+0, -14)
  51. saver/logger.py (+0, -12)
  52. saver/model_saver.py (+0, -8)
  53. test/data_for_tests/charlm.txt (+0, -0)
  54. test/data_for_tests/cws_test (+0, -0)
  55. test/data_for_tests/cws_train (+0, -0)
  56. test/test_charlm.py (+3, -2)
  57. test/test_loader.py (+0, -0)
  58. test/test_trainer.py (+2, -2)
  59. test/test_word_seg.py (+3, -2)

+ 0
- 8
action/README.md View File

@@ -1,8 +0,0 @@
SpaCy "Doc"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80

SpaCy "Vocab"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25

SpaCy "Token"
https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27

+ 0
- 46
action/action.py View File

@@ -1,46 +0,0 @@
from saver.logger import Logger


class Action(object):
"""
base class for Trainer and Tester
"""

def __init__(self):
super(Action, self).__init__()
self.logger = Logger("logger_output.txt")

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
raise NotImplementedError

def log(self, string):
self.logger.log(string)

def batchify(self, batch_size, X, Y=None):
"""
:param batch_size: int
:param X: feature matrix of size [n_sample, m_feature]
:param Y: label vector of size [n_sample, 1] (optional)
:return num_iter: int, the number of steps in each epoch
        generator: generator, yields batch inputs
"""
n_samples = X.shape[0]
num_iter = n_samples // batch_size
if Y is None:
generator = self._batch_generate(batch_size, num_iter, X)
else:
generator = self._batch_generate(batch_size, num_iter, X, Y)
return num_iter, generator

@staticmethod
def _batch_generate(batch_size, num_iter, *data):
for step in range(num_iter):
start = batch_size * step
end = batch_size * (step + 1)
yield tuple([x[start:end] for x in data])

def make_log(self, *args):
return "log"

+ 0
- 87
action/tester.py View File

@@ -1,87 +0,0 @@
from collections import namedtuple

import numpy as np

from action.action import Action


class Tester(Action):
"""docstring for Tester"""

TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output",
"save_loss", "batch_size"])

def __init__(self, test_args):
"""
:param test_args: named tuple
"""
super(Tester, self).__init__()
self.validate_in_training = test_args.validate_in_training
self.save_dev_input = test_args.save_dev_input
self.valid_x = None
self.valid_y = None
self.save_output = test_args.save_output
self.output = None
self.save_loss = test_args.save_loss
self.mean_loss = None
self.batch_size = test_args.batch_size

def test(self, network, data):
print("testing")
network.mode(test=True) # turn on the testing mode
if self.save_dev_input:
if self.valid_x is None:
valid_x, valid_y = network.prepare_input(data)
self.valid_x = valid_x
self.valid_y = valid_y
else:
valid_x = self.valid_x
valid_y = self.valid_y
else:
valid_x, valid_y = network.prepare_input(data)

# split into batches by self.batch_size
iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y)

batch_output = list()
loss_history = list()
# turn on the testing mode of the network
network.mode(test=True)

for step in range(iterations):
batch_x, batch_y = test_batch_generator.__next__()

# forward pass from tests input to predicted output
prediction = network.data_forward(batch_x)

loss = network.get_loss(prediction, batch_y)

if self.save_output:
batch_output.append(prediction.data)
if self.save_loss:
loss_history.append(loss)
self.log(self.make_log(step, loss))

if self.save_loss:
self.mean_loss = np.mean(np.array(loss_history))
if self.save_output:
self.output = self.make_output(batch_output)

@property
def loss(self):
return self.mean_loss

@property
def result(self):
return self.output

@staticmethod
def make_output(batch_outputs):
# construct full prediction with batch outputs
return np.concatenate(batch_outputs, axis=0)

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
raise NotImplementedError

+ 0
- 93
action/trainer.py View File

@@ -1,93 +0,0 @@
from collections import namedtuple

from .action import Action
from .tester import Tester


class Trainer(Action):
"""
Trainer is a common training pipeline shared among all models.
"""
TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
"log_per_step", "log_validation", "batch_size"])

def __init__(self, train_args):
"""
:param train_args: namedtuple
"""
super(Trainer, self).__init__()
self.n_epochs = train_args.epochs
self.validate = train_args.validate
self.save_when_better = train_args.save_when_better
self.log_per_step = train_args.log_per_step
self.log_validation = train_args.log_validation
self.batch_size = train_args.batch_size

def train(self, network, train_data, dev_data=None):
"""
:param network: the model controller
:param train_data: raw data for training
:param dev_data: raw data for validation
This method will call all the base methods of network (implemented in model.base_model).
"""
train_x, train_y = network.prepare_input(train_data)

iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y)

test_args = Tester.TestConfig(save_output=True, validate_in_training=True,
save_dev_input=True, save_loss=True, batch_size=self.batch_size)
evaluator = Tester(test_args)

best_loss = 1e10
loss_history = list()

for epoch in range(self.n_epochs):
network.mode(test=False) # turn on the train mode

network.define_optimizer()
for step in range(iterations):
batch_x, batch_y = train_batch_generator.__next__()

prediction = network.data_forward(batch_x)

loss = network.get_loss(prediction, batch_y)
network.grad_backward()

if step % self.log_per_step == 0:
print("step ", step)
loss_history.append(loss)
self.log(self.make_log(epoch, step, loss))

#################### evaluate over dev set ###################
if self.validate:
if dev_data is None:
raise RuntimeError("No validation data provided.")
# give all controls to tester
evaluator.test(network, dev_data)

if self.log_validation:
self.log(self.make_valid_log(epoch, evaluator.loss))
if evaluator.loss < best_loss:
best_loss = evaluator.loss
if self.save_when_better:
self.save_model(network)

# finish training

def make_log(self, *args):
return "make a log"

def make_valid_log(self, *args):
return "make a valid log"

def save_model(self, model):
model.save()

def load_data(self, data_name):
print("load data")

def load_config(self, args):
raise NotImplementedError

def load_dataset(self, args):
raise NotImplementedError

action/__init__.py → fastNLP/__init__.py View File


+ 174
- 0
fastNLP/modules/CRF.py View File

@@ -0,0 +1,174 @@
import torch
from torch import nn


def log_sum_exp(x, dim=-1):
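# numerically stable log(sum(exp(x))) along `dim`: subtract the max before exponentiating so exp cannot overflow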
max_value, _ = x.max(dim=dim, keepdim=True)
res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value
return res.squeeze(dim)


def seq_len_to_byte_mask(seq_lens):
# usually seq_lens: LongTensor, batch_size
# return value: ByteTensor, batch_size x max_len
batch_size = seq_lens.size(0)
max_len = seq_lens.max()
broadcast_arange = torch.arange(max_len).view(1, -1).repeat(batch_size, 1)
mask = broadcast_arange.lt(seq_lens.float().view(-1, 1))
return mask


class ContionalRandomField(nn.Module):
def __init__(self, tag_size, include_start_end_trans=True):
"""
:param tag_size: int, num of tags
:param include_start_end_trans: bool, whether to learn additional transition scores for the start and end of a sequence
"""
super(ContionalRandomField, self).__init__()

self.include_start_end_trans = include_start_end_trans
self.tag_size = tag_size

# the meaning of entry in this matrix is (from_tag_id, to_tag_id) score
self.transition_m = nn.Parameter(torch.randn(tag_size, tag_size))
if self.include_start_end_trans:
self.start_scores = nn.Parameter(torch.randn(tag_size))
self.end_scores = nn.Parameter(torch.randn(tag_size))

self.reset_parameter()

def reset_parameter(self):
nn.init.xavier_normal_(self.transition_m)
if self.include_start_end_trans:
nn.init.normal_(self.start_scores)
nn.init.normal_(self.end_scores)

def _normalizer_likelihood(self, feats, masks):
"""
Computes the (batch_size,) denominator term of the log-likelihood: the log-sum-exp
of the scores over all possible state sequences (the log partition function).

:param feats:FloatTensor, batch_size x max_len x tag_size
:param masks:ByteTensor, batch_size x max_len
:return:FloatTensor, batch_size
"""
batch_size, max_len, _ = feats.size()

# alpha, batch_size x tag_size
if self.include_start_end_trans:
alpha = self.start_scores.view(1, -1) + feats[:, 0]
else:
alpha = feats[:, 0]

# broadcast_trans_m, the meaning of entry in this matrix is [batch_idx, to_tag_id, from_tag_id]
broadcast_trans_m = self.transition_m.permute(
1, 0).unsqueeze(0).repeat(batch_size, 1, 1)
# loop
for i in range(1, max_len):
emit_score = feats[:, i].unsqueeze(2)
new_alpha = broadcast_trans_m + alpha.unsqueeze(1) + emit_score

new_alpha = log_sum_exp(new_alpha, dim=2)

alpha = new_alpha * \
masks[:, i:i + 1].float() + alpha * \
(1 - masks[:, i:i + 1].float())

if self.include_start_end_trans:
alpha = alpha + self.end_scores.view(1, -1)

return log_sum_exp(alpha)

def _glod_score(self, feats, tags, masks):
"""
Compute the score for the gold path.
:param feats: FloatTensor, batch_size x max_len x tag_size
:param tags: LongTensor, batch_size x max_len
:param masks: ByteTensor, batch_size x max_len
:return:FloatTensor, batch_size
"""
batch_size, max_len, _ = feats.size()

# alpha, B x 1
if self.include_start_end_trans:
alpha = self.start_scores.view(1, -1).repeat(batch_size, 1).gather(dim=1, index=tags[:, :1]) + \
feats[:, 0].gather(dim=1, index=tags[:, :1])
else:
alpha = feats[:, 0].gather(dim=1, index=tags[:, :1])

for i in range(1, max_len):
trans_score = self.transition_m[(
tags[:, i - 1], tags[:, i])].unsqueeze(1)
emit_score = feats[:, i].gather(dim=1, index=tags[:, i:i + 1])
new_alpha = alpha + trans_score + emit_score

alpha = new_alpha * \
masks[:, i:i + 1].float() + alpha * \
(1 - masks[:, i:i + 1].float())

if self.include_start_end_trans:
last_tag_index = masks.cumsum(dim=1, dtype=torch.long)[:, -1:] - 1
last_from_tag_id = tags.gather(dim=1, index=last_tag_index)
trans_score = self.end_scores.view(
1, -1).repeat(batch_size, 1).gather(dim=1, index=last_from_tag_id)
alpha = alpha + trans_score

return alpha.squeeze(1)

def forward(self, feats, tags, masks):
"""
Calculate the negative log likelihood.
:param feats:FloatTensor, batch_size x max_len x tag_size
:param tags:LongTensor, batch_size x max_len
:param masks:ByteTensor batch_size x max_len
:return:FloatTensor, batch_size
"""
all_path_score = self._normalizer_likelihood(feats, masks)
gold_path_score = self._glod_score(feats, tags, masks)

return all_path_score - gold_path_score

def viterbi_decode(self, feats, masks):
"""
Given a feats matrix, return best decode path and best score.
:param feats: FloatTensor, batch_size x max_len x tag_size
:param masks: ByteTensor, batch_size x max_len
:return: List[Tuple[List[int], float]], the best tag path and its score for each sample
"""
batch_size, max_len, tag_size = feats.size()

paths = torch.zeros(batch_size, max_len - 1, self.tag_size)
if self.include_start_end_trans:
alpha = self.start_scores.repeat(batch_size, 1) + feats[:, 0]
else:
alpha = feats[:, 0]
for i in range(1, max_len):
new_alpha = alpha.clone()
for t in range(self.tag_size):
pre_scores = self.transition_m[:, t].view(
1, self.tag_size) + alpha
max_scroe, indice = pre_scores.max(dim=1)
new_alpha[:, t] = max_scroe + feats[:, i, t]
paths[:, i - 1, t] = indice
alpha = new_alpha * \
masks[:, i:i + 1].float() + alpha * \
(1 - masks[:, i:i + 1].float())

if self.include_start_end_trans:
alpha += self.end_scores.view(1, -1)

max_scroes, indice = alpha.max(dim=1)
indice = indice.cpu().numpy()
final_paths = []
paths = paths.cpu().numpy().astype(int)

seq_lens = masks.cumsum(dim=1, dtype=torch.long)[:, -1]

for b in range(batch_size):
path = [indice[b]]
for i in range(seq_lens[b] - 2, -1, -1):
index = paths[b, i, path[-1]]
path.append(index)
final_paths.append(path[::-1])

return list(zip(final_paths, max_scroes.detach().cpu().numpy()))
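
A minimal usage sketch of the new module (illustrative only: the toy shapes, random inputs, and the assumption that `fastNLP` is on the import path are mine, and it has not been tested against the exact PyTorch version used here):

```
import torch
from fastNLP.modules.CRF import ContionalRandomField, seq_len_to_byte_mask

batch_size, max_len, tag_size = 2, 6, 5
crf = ContionalRandomField(tag_size)

feats = torch.randn(batch_size, max_len, tag_size)                # emission scores from some encoder
tags = torch.randint(0, tag_size, (batch_size, max_len)).long()   # gold tag ids
masks = seq_len_to_byte_mask(torch.LongTensor([6, 4]))            # batch_size x max_len padding mask

nll = crf(feats, tags, masks)        # per-sample negative log likelihood
nll.mean().backward()                # gradients flow into the transition parameters

decoded = crf.viterbi_decode(feats, masks)   # [(best_tag_path, score), ...] per sample
```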

loader/__init__.py → fastNLP/modules/__init__.py View File


model/__init__.py → fastNLP/modules/attention/__init__.py View File


+ 19
- 0
fastNLP/modules/attention/attention.py View File

@@ -0,0 +1,19 @@
import torch

from fastNLP.modules.utils import mask_softmax


class Attention(torch.nn.Module):

def __init__(self, normalize=False):
super(Attention, self).__init__()
self.normalize = normalize

def forward(self, query, memory, mask):
similarities = self._atten_forward(query, memory)
if self.normalize:
return mask_softmax(similarities, mask)
return similarities

def _atten_forward(self, query, memory):
raise NotImplementedError

+ 9
- 0
fastNLP/modules/attention/linear_attention.py View File

@@ -0,0 +1,9 @@
from fastNLP.modules.attention.attention import Attention


class LinearAttention(Attention):
def __init__(self, normalize=False):
super(LinearAttention, self).__init__(normalize)

def _atten_forward(self, query, memory):
raise NotImplementedError

+ 0
- 0
fastNLP/modules/convolution/__init__.py View File


+ 0
- 0
fastNLP/modules/recurrent/__init__.py View File


+ 9
- 0
fastNLP/modules/utils.py View File

@@ -0,0 +1,9 @@
import torch


def mask_softmax(matrix, mask):
if mask is None:
result = torch.nn.functional.softmax(matrix, dim=-1)
else:
raise NotImplementedError
return result

+ 0
- 0
fastNLP/reproduction/CNN-sentence_classification/__init__.py View File


+ 0
- 0
fastNLP/reproduction/Char-aware_NLM/__init__.py View File


+ 0
- 0
fastNLP/reproduction/HAN-document_classification/__init__.py View File


+ 0
- 0
fastNLP/reproduction/__init__.py View File


+ 0
- 0
fastNLP/saver/__init__.py View File


+ 0
- 37
loader/base_loader.py View File

@@ -1,37 +0,0 @@
class BaseLoader(object):
"""docstring for BaseLoader"""

def __init__(self, data_name, data_path):
super(BaseLoader, self).__init__()
self.data_name = data_name
self.data_path = data_path

def load(self):
"""
:return: string
"""
with open(self.data_path, "r", encoding="utf-8") as f:
text = f.read()
return text

def load_lines(self):
with open(self.data_path, "r", encoding="utf-8") as f:
text = f.readlines()
return text


class ToyLoader0(BaseLoader):
"""
For charLM
"""

def __init__(self, name, path):
super(ToyLoader0, self).__init__(name, path)

def load(self):
with open(self.data_path, 'r') as f:
corpus = f.read().lower()
import re
corpus = re.sub(r"<unk>", "unk", corpus)
return corpus.split()


+ 0
- 13
loader/config_loader.py View File

@@ -1,13 +0,0 @@
from loader.base_loader import BaseLoader


class ConfigLoader(BaseLoader):
"""loader for configuration files"""

def __int__(self, data_name, data_path):
super(ConfigLoader, self).__init__(data_name, data_path)
self.config = self.parse(super(ConfigLoader, self).load())

@staticmethod
def parse(string):
raise NotImplementedError

+ 0
- 47
loader/dataset_loader.py View File

@@ -1,47 +0,0 @@
from loader.base_loader import BaseLoader


class DatasetLoader(BaseLoader):
""""loader for data sets"""

def __init__(self, data_name, data_path):
super(DatasetLoader, self).__init__(data_name, data_path)


class ConllLoader(DatasetLoader):
"""loader for conll format files"""

def __int__(self, data_name, data_path):
"""
:param str data_name: the name of the conll data set
:param str data_path: the path to the conll data set
"""
super(ConllLoader, self).__init__(data_name, data_path)
self.data_set = self.parse(self.load())

def load(self):
"""
:return: list lines: all lines in a conll file
"""
with open(self.data_path, "r", encoding="utf-8") as f:
lines = f.readlines()
return lines

@staticmethod
def parse(lines):
"""
:param list lines:a list containing all lines in a conll file.
:return: a 3D list
"""
sentences = list()
tokens = list()
for line in lines:
if line[0] == "#":
# skip the comments
continue
if line == "\n":
sentences.append(tokens)
tokens = []
continue
tokens.append(line.split())
return sentences

+ 0
- 8
loader/embed_loader.py View File

@@ -1,8 +0,0 @@
from loader.base_loader import BaseLoader


class EmbedLoader(BaseLoader):
"""docstring for EmbedLoader"""

def __init__(self, data_name, data_path):
super(EmbedLoader, self).__init__(data_name, data_path)

+ 0
- 158
model/base_model.py View File

@@ -1,158 +0,0 @@
import numpy as np


class BaseModel(object):
"""The base class of all models.
This class and its subclasses are actually "wrappers" of the PyTorch models.
They act as an interface between Trainer and the deep learning networks.
This interface provides the following methods to be called by Trainer.
- prepare_input
- mode
- define_optimizer
- data_forward
- grad_backward
- get_loss
"""

def __init__(self):
pass

def prepare_input(self, data):
"""
Perform data transformation from raw input to vector/matrix inputs.
:param data: raw inputs
:return (X, Y): tuple, input features and labels
"""
raise NotImplementedError

def mode(self, test=False):
"""
Tell the network to be trained or not, required by PyTorch.
:param test: bool
"""
raise NotImplementedError

def define_optimizer(self):
"""
Define PyTorch optimizer specified by the model.
"""
raise NotImplementedError

def data_forward(self, *x):
"""
Forward pass of the data.
:param x: input feature matrix and label vector
:return: output by the model
"""
# required by PyTorch nn
raise NotImplementedError

def grad_backward(self):
"""
Perform gradient descent to update the model parameters.
"""
raise NotImplementedError

def get_loss(self, pred, truth):
"""
Compute loss given model prediction and ground truth. Loss function specified by the model.
:param pred: prediction label vector
:param truth: ground truth label vector
:return: a scalar
"""
raise NotImplementedError


class ToyModel(BaseModel):
"""This is for code testing."""

def __init__(self):
super(ToyModel, self).__init__()
self.test_mode = False
self.weight = np.random.rand(5, 1)
self.bias = np.random.rand()
self._loss = 0

def prepare_input(self, data):
return data[:, :-1], data[:, -1]

def mode(self, test=False):
self.test_mode = test

def data_forward(self, x):
return np.matmul(x, self.weight) + self.bias

def grad_backward(self):
print("loss gradient backward")

def get_loss(self, pred, truth):
self._loss = np.mean(np.square(pred - truth))
return self._loss

def define_optimizer(self):
pass


class Vocabulary(object):
"""A look-up table that allows you to access `Lexeme` objects. The `Vocab`
instance also provides access to the `StringStore`, and owns underlying
data that is shared between `Doc` objects.
"""

def __init__(self):
"""Create the vocabulary.
RETURNS (Vocab): The newly constructed object.
"""
self.data_frame = None


class Document(object):
"""A sequence of Token objects. Access sentences and named entities, export
annotations to numpy arrays, losslessly serialize to compressed binary
strings. The `Doc` object holds an array of `Token` objects. The
Python-level `Token` and `Span` objects are views of this array, i.e.
they don't own the data themselves. -- spacy
"""

def __init__(self, vocab, words=None, spaces=None):
"""Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (list or None): A list of unicode strings, to add to the document
as words. If `None`, defaults to empty list.
spaces (list or None): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
user_data (dict or None): Optional extra data to attach to the Doc.
RETURNS (Doc): The newly constructed object.
"""
self.vocab = vocab
self.spaces = spaces
self.words = words
if spaces is None:
self.spaces = [True] * len(self.words)
elif len(spaces) != len(self.words):
raise ValueError("dismatch spaces and words")

def get_chunker(self, vocab):
return None

def push_back(self, vocab):
pass


class Token(object):
"""An individual token – i.e. a word, punctuation symbol, whitespace,
etc.
"""

def __init__(self, vocab, doc, offset):
"""Construct a `Token` object.
vocab (Vocabulary): A storage container for lexical types.
doc (Document): The parent document.
offset (int): The index of the token within the document.
"""
self.vocab = vocab
self.doc = doc
self.token = doc[offset]
self.i = offset

+ 0
- 356
model/char_language_model.py View File

@@ -1,356 +0,0 @@
import os
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel

USE_GPU = True


class CharLM(BaseModel):

"""
Controller of the Character-level Neural Language Model
To do:
- where the data goes, call data savers.
"""
DataTuple = namedtuple("DataTuple", ["feature", "label"])

def __init__(self, lstm_batch_size, lstm_seq_len):
super(CharLM, self).__init__()
"""
Settings: should come from config loader or pre-processing
"""
self.word_embed_dim = 300
self.char_embedding_dim = 15
self.cnn_batch_size = lstm_batch_size * lstm_seq_len
self.lstm_seq_len = lstm_seq_len
self.lstm_batch_size = lstm_batch_size
self.num_epoch = 10
self.old_PPL = 100000
self.best_PPL = 100000

"""
These parameters are set by pre-processing.
"""
self.max_word_len = None
self.num_char = None
self.vocab_size = None
self.preprocess("./data_for_tests/charlm.txt")

self.data = None # named tuple to store all data set
self.data_ready = False
self.criterion = nn.CrossEntropyLoss()
self._loss = None
self.use_gpu = USE_GPU

# word_emb_dim == hidden_size / num of hidden units
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)),
to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)))

self.model = charLM(self.char_embedding_dim,
self.word_embed_dim,
self.vocab_size,
self.num_char,
use_gpu=self.use_gpu)
for param in self.model.parameters():
nn.init.uniform(param.data, -0.05, 0.05)

self.learning_rate = 0.1
self.optimizer = None

def prepare_input(self, raw_text):
"""
:param raw_text: raw input text consisting of words
:return: torch.Tensor, torch.Tensor
feature matrix, label vector
This function is called only once in Trainer.train, but may be called multiple times in Tester.test,
so Tester caches the test input for repeated calls.
"""
if os.path.exists("cache/prep.pt") is False:
self.preprocess("./data_for_tests/charlm.txt") # To do: This is not good. Need to fix..
objects = torch.load("cache/prep.pt")
word_dict = objects["word_dict"]
char_dict = objects["char_dict"]
max_word_len = self.max_word_len
print("word/char dictionary built. Start making inputs.")

words = raw_text
input_vec = np.array(text2vec(words, char_dict, max_word_len))
# Labels are next-word index in word_dict with the same length as inputs
input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]])
feature_input = torch.from_numpy(input_vec)
label_input = torch.from_numpy(input_label)
return feature_input, label_input

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, x):
"""
:param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2]
:return: Tensor of size [num_words, ?]
"""
# additional processing of inputs after batching
num_seq = x.size()[0] // self.lstm_seq_len
x = x[:num_seq * self.lstm_seq_len, :]
x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2)

# detach hidden state of LSTM from last batch
hidden = [state.detach() for state in self.hidden]
output, self.hidden = self.model(to_var(x), hidden)
return output

def grad_backward(self):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
self.optimizer.step()

def get_loss(self, predict, truth):
self._loss = self.criterion(predict, to_var(truth))
return self._loss.data # No pytorch data structure exposed outsides

def define_optimizer(self):
# redefine optimizer for every new epoch
self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85)

def save(self):
print("network saved")
# torch.save(self.model, "cache/model.pkl")

def preprocess(self, all_text_files):
word_dict, char_dict = create_word_char_dict(all_text_files)
num_char = len(char_dict)
self.vocab_size = len(word_dict)
char_dict["BOW"] = num_char + 1
char_dict["EOW"] = num_char + 2
char_dict["PAD"] = 0
self.num_char = num_char + 3
# char_dict is a dict of (int, string), int counting from 0 to 47
reverse_word_dict = {value: key for key, value in word_dict.items()}
self.max_word_len = max([len(word) for word in word_dict])
objects = {
"word_dict": word_dict,
"char_dict": char_dict,
"reverse_word_dict": reverse_word_dict,
}
torch.save(objects, "cache/prep.pt")
print("Preprocess done.")


"""
Global Functions
"""


def batch_generator(x, batch_size):
# x: [num_words, in_channel, height, width]
# partitions x into batches
num_step = x.size()[0] // batch_size
for t in range(num_step):
yield x[t * batch_size:(t + 1) * batch_size]


def text2vec(words, char_dict, max_word_len):
""" Return list of list of int """
word_vec = []
for word in words:
vec = [char_dict[ch] for ch in word]
if len(vec) < max_word_len:
vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
word_vec.append(vec)
return word_vec


def read_data(file_name):
with open(file_name, 'r') as f:
corpus = f.read().lower()
import re
corpus = re.sub(r"<unk>", "unk", corpus)
return corpus.split()


def get_char_dict(vocabulary):
char_dict = dict()
count = 1
for word in vocabulary:
for ch in word:
if ch not in char_dict:
char_dict[ch] = count
count += 1
return char_dict


def create_word_char_dict(*file_name):
text = []
for file in file_name:
text += read_data(file)
word_dict = {word: ix for ix, word in enumerate(set(text))}
char_dict = get_char_dict(word_dict)
return word_dict, char_dict


def to_var(x):
if torch.cuda.is_available() and USE_GPU:
x = x.cuda()
return Variable(x)


"""
Neural Network
"""


class Highway(nn.Module):
"""Highway network"""

def __init__(self, input_size):
super(Highway, self).__init__()
self.fc1 = nn.Linear(input_size, input_size, bias=True)
self.fc2 = nn.Linear(input_size, input_size, bias=True)

def forward(self, x):
t = F.sigmoid(self.fc1(x))
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
"""Character-level Neural Language Model
CNN + highway network + LSTM
# Input:
4D tensor with shape [batch_size, in_channel, height, width]
# Output:
2D Tensor with shape [batch_size, vocab_size]
# Arguments:
char_emb_dim: the size of each character's embedding
word_emb_dim: the size of each word's embedding
vocab_size: num of unique words
num_char: num of characters
use_gpu: True or False
"""

def __init__(self, char_emb_dim, word_emb_dim,
vocab_size, num_char, use_gpu):
super(charLM, self).__init__()
self.char_emb_dim = char_emb_dim
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size

# char embedding layer
self.char_embed = nn.Embedding(num_char, char_emb_dim)

# convolutions of filters with different sizes
self.convolutions = []

# list of tuples: (the number of filter, width)
# self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
self.filter_num_width = [(25, 1), (50, 2), (75, 3)]

for out_channel, filter_width in self.filter_num_width:
self.convolutions.append(
nn.Conv2d(
1, # in_channel
out_channel, # out_channel
kernel_size=(char_emb_dim, filter_width), # (height, width)
bias=True
)
)

self.highway_input_dim = sum([x for x, y in self.filter_num_width])

self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

# highway net
self.highway1 = Highway(self.highway_input_dim)
self.highway2 = Highway(self.highway_input_dim)

# LSTM
self.lstm_num_layers = 2

self.lstm = nn.LSTM(input_size=self.highway_input_dim,
hidden_size=self.word_emb_dim,
num_layers=self.lstm_num_layers,
bias=True,
dropout=0.5,
batch_first=True)

# output layer
self.dropout = nn.Dropout(p=0.5)
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

if use_gpu is True:
for x in range(len(self.convolutions)):
self.convolutions[x] = self.convolutions[x].cuda()
self.highway1 = self.highway1.cuda()
self.highway2 = self.highway2.cuda()
self.lstm = self.lstm.cuda()
self.dropout = self.dropout.cuda()
self.char_embed = self.char_embed.cuda()
self.linear = self.linear.cuda()
self.batch_norm = self.batch_norm.cuda()

def forward(self, x, hidden):
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
# Return: Variable of Tensor with shape [num_words, len(word_dict)]
lstm_batch_size = x.size()[0]
lstm_seq_len = x.size()[1]

x = x.contiguous().view(-1, x.size()[2])
# [num_seq*seq_len, max_word_len+2]

x = self.char_embed(x)
# [num_seq*seq_len, max_word_len+2, char_emb_dim]

x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
# [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

x = self.conv_layers(x)
# [num_seq*seq_len, total_num_filters]

x = self.batch_norm(x)
# [num_seq*seq_len, total_num_filters]

x = self.highway1(x)
x = self.highway2(x)
# [num_seq*seq_len, total_num_filters]

x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
# [num_seq, seq_len, total_num_filters]

x, hidden = self.lstm(x, hidden)
# [seq_len, num_seq, hidden_size]

x = self.dropout(x)
# [seq_len, num_seq, hidden_size]

x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1)
# [num_seq*seq_len, hidden_size]

x = self.linear(x)
# [num_seq*seq_len, vocab_size]
return x, hidden

def conv_layers(self, x):
chosen_list = list()
for conv in self.convolutions:
feature_map = F.tanh(conv(x))
# (batch_size, out_channel, 1, max_word_len-width+1)
chosen = torch.max(feature_map, 3)[0]
# (batch_size, out_channel, 1)
chosen = chosen.squeeze()
# (batch_size, out_channel)
chosen_list.append(chosen)

# (batch_size, total_num_filers)
return torch.cat(chosen_list, 1)

+ 0
- 135
model/word_seg_model.py View File

@@ -1,135 +0,0 @@
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable

from model.base_model import BaseModel

USE_GPU = True


def to_var(x):
if torch.cuda.is_available() and USE_GPU:
x = x.cuda()
return Variable(x)


class WordSegModel(BaseModel):
"""
Model controller for WordSeg
"""

def __init__(self):
super(WordSegModel, self).__init__()
self.id2word = None
self.word2id = None
self.id2tag = None
self.tag2id = None

self.lstm_batch_size = 8
self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len
self.hidden_dim = 100
self.lstm_num_layers = 2
self.vocab_size = 100
self.word_emb_dim = 100

self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))

self.optimizer = None
self._loss = None

def prepare_input(self, data):
"""
perform word indices lookup to convert strings into indices
:param data: list of string, each string contains word + space + [B, M, E, S]
:return
"""
word_list = []
tag_list = []
for line in data:
if len(line) > 2:
tokens = line.split("#")
word_list.append(tokens[0])
tag_list.append(tokens[2][0])
self.id2word = list(set(word_list))
self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
self.id2tag = list(set(tag_list))
self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
return words, tags

def mode(self, test=False):
if test:
self.model.eval()
else:
self.model.train()

def data_forward(self, x):
"""
:param x: sequence of length [batch_size], word indices
:return:
"""
x = x.reshape(self.lstm_batch_size, self.lstm_seq_len)
output, self.hidden = self.model(x, self.hidden)
return output

def define_optimizer(self):
self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)

def get_loss(self, pred, truth):

self._loss = nn.CrossEntropyLoss()(pred, truth)
return self._loss

def grad_backward(self):
self.model.zero_grad()
self._loss.backward()
torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
self.optimizer.step()


class WordSeg(nn.Module):
"""
PyTorch Network for word segmentation
"""

def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100):
super(WordSeg, self).__init__()

self.vocab_size = vocab_size
self.word_emb_dim = word_emb_dim
self.lstm_num_layers = lstm_num_layers
self.hidden_dim = hidden_dim

self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim)

self.lstm = nn.LSTM(input_size=self.word_emb_dim,
hidden_size=self.word_emb_dim,
num_layers=self.lstm_num_layers,
bias=True,
dropout=0.5,
batch_first=True)

self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

def forward(self, x, hidden):
"""
:param x: tensor of shape [batch_size, seq_len], vocabulary index
:param hidden:
:return x: probability of vocabulary entries
hidden: (memory cell, hidden state) from LSTM
"""
# [batch_size, seq_len]
x = self.word_emb(x)
# [batch_size, seq_len, word_emb_size]
x, hidden = self.lstm(x, hidden)
# [batch_size, seq_len, word_emb_size]
x = x.contiguous().view(x.shape[0] * x.shape[1], -1)
# [batch_size*seq_len, word_emb_size]
x = self.linear(x)
# [batch_size*seq_len, vocab_size]
return x, hidden

+ 0
- 110
reproduction/CNN-sentence_classification/.gitignore View File

@@ -1,110 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

#custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp

+ 0
- 77
reproduction/CNN-sentence_classification/README.md View File

@@ -1,77 +0,0 @@
## Introduction
This is the implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch.
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper
(by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!)

## Requirement
* python 3.6
* pytorch > 0.1
* numpy
* gensim

## Run
STEP 1
install the required packages, e.g. gensim (the other needed packages are installed the same way)
```
pip install gensim
```

STEP 2
download the MR dataset and word2vec resources
* MR dataset: you can download it from https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz
* word2vec: you can download the file from https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

Since this file is more than 1.5 GB, it is not included in these folders. If you download the file, remember to modify the path in the function `def word_embeddings(path = './GoogleNews-vectors-negative300.bin/')`.
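
For reference, a minimal sketch of loading the pretrained vectors with gensim (the path below is an assumption; point it at wherever you saved the file):

```
from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format(
    './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin', binary=True)
print(w2v['movie'].shape)  # (300,)
```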


STEP 3
train the model
```
python train.py
```
you will see information like this printed to the screen:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %

......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```

## Hyperparameters
According to the paper and experiment, I set:

|Epoch|Kernel Size|dropout|learning rate|batch size|
|---|---|---|---|---|
|20|\(h,300,100\)|0.5|0.0001|50|

h = [3,4,5]
If the accuracy does not improve, the learning rate is multiplied by 0.8 (see the sketch below).
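
A minimal sketch of that decay rule (the helper name and the sample accuracies are mine; `train.py` applies the same logic after each test pass):

```
def maybe_decay(learning_rate, acc, best_acc, factor=0.8):
    # keep the learning rate and update the best score when accuracy improves,
    # otherwise multiply the learning rate by `factor`
    if best_acc is None or acc > best_acc:
        return learning_rate, acc
    return learning_rate * factor, best_acc

lr, best = 0.0001, None
lr, best = maybe_decay(lr, acc=71.87, best_acc=best)  # improved: lr stays 0.0001
lr, best = maybe_decay(lr, acc=70.10, best_acc=best)  # no improvement: lr becomes 0.00008
```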

## Result
I only tried one dataset: MR. (The paper also evaluates on 6 other datasets, including SST-1, SST-2, TREC, CR and MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel.
I have tried CNN-non-static: a model with pre-trained vectors from word2vec.
All words, including the unknown ones that are randomly initialized, and the pretrained vectors are fine-tuned for each task
(this variant has almost the best performance and is the most difficult to implement among the four models).

|Dataset|Class Size|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617%(CNN-non-static)|81.5%(CNN-nonstatic)|



## Reference
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py


+ 0
- 142
reproduction/CNN-sentence_classification/dataset.py View File

@@ -1,142 +0,0 @@
import codecs
import random
import re

import gensim
import numpy as np
from gensim import corpora
from torch.utils.data import Dataset


def clean_str(string):
"""
Tokenization/string cleaning for all datasets except for SST.
Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
"""
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip()

def pad_sentences(sentence, padding_word=" <PAD/>"):
sequence_length = 64
sent = sentence.split()
padded_sentence = sentence + padding_word * (sequence_length - len(sent))
return padded_sentence


#data loader
class MRDataset(Dataset):
def __init__(self):

#load positive and negative sentenses from files
with codecs.open("./rt-polaritydata/rt-polarity.pos",encoding ='ISO-8859-1') as f:
positive_examples = list(f.readlines())
with codecs.open("./rt-polaritydata/rt-polarity.neg",encoding ='ISO-8859-1') as f:
negative_examples = list(f.readlines())
#s.strip: clear "\n"; clear_str; pad
positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
self.examples = positive_examples + negative_examples
self.sentences_texts = [sample.split() for sample in self.examples]

#word dictionary
dictionary = corpora.Dictionary(self.sentences_texts)
self.word2id_dict = dictionary.token2id # transform to dict, like {"human":0, "a":1,...}

#set lables: postive is 1; negative is 0
positive_labels = [1 for _ in positive_examples]
negative_labels = [0 for _ in negative_examples]
self.lables = positive_labels + negative_labels
examples_lables = list(zip(self.examples,self.lables))
random.shuffle(examples_lables)
self.MRDataset_frame = examples_lables

#transform word to id
self.MRDataset_wordid = \
[(
np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
sent[1]
) for sent in self.MRDataset_frame]

def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
# establish from google
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

print('Please wait ... (it could take a while to load the file : {})'.format(path))
word_dict = self.word2id_dict
embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))

for word in word_dict:
word_id = word_dict[word]
if word in model.wv.vocab:
embedding_weights[word_id, :] = model[word]
return embedding_weights

def __len__(self):
return len(self.MRDataset_frame)

def __getitem__(self,idx):

sample = self.MRDataset_wordid[idx]
return sample

def getsent(self, idx):

sample = self.MRDataset_wordid[idx][0]
return sample

def getlabel(self, idx):

label = self.MRDataset_wordid[idx][1]
return label


def word2id(self):
return self.word2id_dict

def id2word(self):

id2word_dict = dict([val,key] for key,val in self.word2id_dict.items())
return id2word_dict

class train_set(Dataset):

def __init__(self, samples):

self.train_frame = samples

def __len__(self):

return len(self.train_frame)

def __getitem__(self, idx):

return self.train_frame[idx]


class test_set(Dataset):

def __init__(self, samples):

self.test_frame = samples

def __len__(self):

return len(self.test_frame)

def __getitem__(self, idx):

return self.test_frame[idx]

+ 0
- 43
reproduction/CNN-sentence_classification/model.py View File

@@ -1,43 +0,0 @@
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

import dataset



class CNN_text(nn.Module):
def __init__(self, kernel_h=[3,4,5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
super(CNN_text, self).__init__()

self.embedding = nn.Embedding(embed_num,embed_dim)
self.dropout = nn.Dropout(dropout)
if pretrained_embeddings is not None:
self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

#the network structure
#Conv2d: input- N,C,H,W output- (50,100,62,1)
self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h])
self.fc1 = nn.Linear(300,2)

def max_pooling(self, x):
x = F.relu(conv(x)).squeeze(3) #N,C,L - (50,100,62)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
#x.size(2)=62 squeeze: (50,100,1) -> (50,100)
return x

def forward(self, x):
x = self.embedding(x) #output: (N,H,W) = (50,64,300)
x = x.unsqueeze(1) #(N,C,H,W)
x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1] #[N, C, H(50,100,62),(50,100,61),(50,100,60)]
x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[N,C(50,100),(50,100),(50,100)]
x = torch.cat(x,1)
x = self.dropout(x)
x = self.fc1(x)
return x

+ 0
- 5331
reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg
File diff suppressed because it is too large
View File


+ 0
- 5331
reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos
File diff suppressed because it is too large
View File


+ 0
- 97
reproduction/CNN-sentence_classification/train.py View File

@@ -1,97 +0,0 @@
import os

import torch
import torch.nn as nn

from . import dataset as dst
from .model import CNN_text
from torch.autograd import Variable

# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True


#split Dataset
dataset = dst.MRDataset()
length = len(dataset)

train_dataset = dataset[:int(0.9*length)]
test_dataset = dataset[int(0.9*length):]

train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)



# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=batch_size,
shuffle=True)

test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
batch_size=batch_size,
shuffle=False)
#cnn

cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
cnn.cuda()


# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

# train and tests
best_acc = None

for epoch in range(num_epochs):
# Train the Model
cnn.train()
for i, (sents,labels) in enumerate(train_loader):
sents = Variable(sents)
labels = Variable(labels)
if cuda:
sents = sents.cuda()
labels = labels.cuda()
optimizer.zero_grad()
outputs = cnn(sents)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
%(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0]))

# Test the Model
cnn.eval()
correct = 0
total = 0
for sents, labels in test_loader:
sents = Variable(sents)
if cuda:
sents = sents.cuda()
labels = labels.cuda()
outputs = cnn(sents)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum()
acc = 100. * correct / total
print('Test Accuracy: %f %%' % (acc))
if best_acc is None or acc > best_acc:
best_acc = acc
if os.path.exists("models") is False:
os.makedirs("models")
torch.save(cnn.state_dict(), 'models/cnn.pkl')
else:
learning_rate = learning_rate * 0.8

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")

+ 0
- 21
reproduction/Char-aware_NLM/LICENSE View File

@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2017

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

+ 0
- 40
reproduction/Char-aware_NLM/README.md View File

@@ -1,40 +0,0 @@

# PyTorch-Character-Aware-Neural-Language-Model

This is the PyTorch implementation of character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim.

## Requirements
The code was run and tested with **Python 3.5.2** and **PyTorch 0.3.1**.

## HyperParameters
| HyperParam | value |
| ------ | :-------|
| LSTM batch size | 20 |
| LSTM sequence length | 35 |
| LSTM hidden units | 300 |
| epochs | 35 |
| initial learning rate | 1.0 |
| character embedding dimension | 15 |
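
For orientation, a sketch of how these settings map onto the option bundle consumed by the training/testing code in this folder (a condensed version of the `Options` namedtuple used in `train.py`/`test.py`; `max_word_len` normally comes from the preprocessing cache, so the value here is a placeholder):

```
from collections import namedtuple

Options = namedtuple("Options", ["lstm_batch_size", "lstm_seq_len", "word_embed_dim",
                                 "max_word_len", "epochs", "init_lr"])
opt = Options(lstm_batch_size=20, lstm_seq_len=35, word_embed_dim=300,
              max_word_len=21,   # placeholder; read from cache/prep.pt in practice
              epochs=35, init_lr=1.0)
```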

## Demo
Train the model with split train/valid/test data.

`python train.py`

The trained model will be saved in `cache/net.pkl`.
Test the model:

`python test.py`

Best result on the test set:
PPL = 127.2163
cross-entropy loss = 4.8459
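(As a sanity check, the two numbers are consistent: perplexity is the exponential of the average cross-entropy, and exp(4.8459) ≈ 127.22.)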

## Acknowledgement
This implementation borrowed ideas from

https://github.com/jarfo/kchar

https://github.com/cronos123/Character-Aware-Neural-Language-Models



+ 0
- 148
reproduction/Char-aware_NLM/model.py View File

@@ -1,148 +0,0 @@

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
"""Highway network"""
def __init__(self, input_size):
super(Highway, self).__init__()
self.fc1 = nn.Linear(input_size, input_size, bias=True)
self.fc2 = nn.Linear(input_size, input_size, bias=True)

def forward(self, x):
t = F.sigmoid(self.fc1(x))
return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1-t, x)


class charLM(nn.Module):
"""CNN + highway network + LSTM
# Input:
4D tensor with shape [batch_size, in_channel, height, width]
# Output:
2D Tensor with shape [batch_size, vocab_size]
# Arguments:
char_emb_dim: the size of each character's embedding
word_emb_dim: the size of each word's embedding
vocab_size: num of unique words
num_char: num of characters
use_gpu: True or False
"""
def __init__(self, char_emb_dim, word_emb_dim,
vocab_size, num_char, use_gpu):
super(charLM, self).__init__()
self.char_emb_dim = char_emb_dim
self.word_emb_dim = word_emb_dim
self.vocab_size = vocab_size

# char embedding layer
self.char_embed = nn.Embedding(num_char, char_emb_dim)

# convolutions of filters with different sizes
self.convolutions = []

# list of tuples: (the number of filter, width)
self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]
for out_channel, filter_width in self.filter_num_width:
self.convolutions.append(
nn.Conv2d(
1, # in_channel
out_channel, # out_channel
kernel_size=(char_emb_dim, filter_width), # (height, width)
bias=True
)
)

self.highway_input_dim = sum([x for x, y in self.filter_num_width])

self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

# highway net
self.highway1 = Highway(self.highway_input_dim)
self.highway2 = Highway(self.highway_input_dim)

# LSTM
self.lstm_num_layers = 2

self.lstm = nn.LSTM(input_size=self.highway_input_dim,
hidden_size=self.word_emb_dim,
num_layers=self.lstm_num_layers,
bias=True,
dropout=0.5,
batch_first=True)

# output layer
self.dropout = nn.Dropout(p=0.5)
self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

if use_gpu is True:
for x in range(len(self.convolutions)):
self.convolutions[x] = self.convolutions[x].cuda()
self.highway1 = self.highway1.cuda()
self.highway2 = self.highway2.cuda()
self.lstm = self.lstm.cuda()
self.dropout = self.dropout.cuda()
self.char_embed = self.char_embed.cuda()
self.linear = self.linear.cuda()
self.batch_norm = self.batch_norm.cuda()


def forward(self, x, hidden):
# Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
# Return: Variable of Tensor with shape [num_words, len(word_dict)]
lstm_batch_size = x.size()[0]
lstm_seq_len = x.size()[1]

x = x.contiguous().view(-1, x.size()[2])
# [num_seq*seq_len, max_word_len+2]
x = self.char_embed(x)
# [num_seq*seq_len, max_word_len+2, char_emb_dim]
x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
# [num_seq*seq_len, 1, max_word_len+2, char_emb_dim]
x = self.conv_layers(x)
# [num_seq*seq_len, total_num_filters]

x = self.batch_norm(x)
# [num_seq*seq_len, total_num_filters]

x = self.highway1(x)
x = self.highway2(x)
# [num_seq*seq_len, total_num_filters]

x = x.contiguous().view(lstm_batch_size,lstm_seq_len, -1)
# [num_seq, seq_len, total_num_filters]
x, hidden = self.lstm(x, hidden)
# [seq_len, num_seq, hidden_size]
x = self.dropout(x)
# [seq_len, num_seq, hidden_size]
x = x.contiguous().view(lstm_batch_size*lstm_seq_len, -1)
# [num_seq*seq_len, hidden_size]

x = self.linear(x)
# [num_seq*seq_len, vocab_size]
return x, hidden


def conv_layers(self, x):
chosen_list = list()
for conv in self.convolutions:
feature_map = F.tanh(conv(x))
# (batch_size, out_channel, 1, max_word_len-width+1)
chosen = torch.max(feature_map, 3)[0]
# (batch_size, out_channel, 1)
chosen = chosen.squeeze()
# (batch_size, out_channel)
chosen_list.append(chosen)
# (batch_size, total_num_filers)
return torch.cat(chosen_list, 1)

+ 0
- 123
reproduction/Char-aware_NLM/test.py View File

@@ -1,123 +0,0 @@
import os
from collections import namedtuple

import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from utilities import *


def to_var(x):
if torch.cuda.is_available():
x = x.cuda()
return Variable(x)


def test(net, data, opt):
net.eval()
test_input = torch.from_numpy(data.test_input)
test_label = torch.from_numpy(data.test_label)

num_seq = test_input.size()[0] // opt.lstm_seq_len
test_input = test_input[:num_seq*opt.lstm_seq_len, :]
# [num_seq, seq_len, max_word_len+2]
test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

criterion = nn.CrossEntropyLoss()

loss_list = []
num_hits = 0
total = 0
iterations = test_input.size()[0] // opt.lstm_batch_size
test_generator = batch_generator(test_input, opt.lstm_batch_size)
label_generator = batch_generator(test_label, opt.lstm_batch_size*opt.lstm_seq_len)

hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))
add_loss = 0.0
for t in range(iterations):
batch_input = test_generator.__next__ ()
batch_label = label_generator.__next__()
net.zero_grad()
hidden = [state.detach() for state in hidden]
test_output, hidden = net(to_var(batch_input), hidden)
test_loss = criterion(test_output, to_var(batch_label)).data
loss_list.append(test_loss)
add_loss += test_loss

print("Test Loss={0:.4f}".format(float(add_loss) / iterations))
print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations))))


#############################################################

if __name__ == "__main__":

word_embed_dim = 300
char_embedding_dim = 15

if os.path.exists("cache/prep.pt") is False:
print("Cannot find prep.pt")

objetcs = torch.load("cache/prep.pt")

word_dict = objetcs["word_dict"]
char_dict = objetcs["char_dict"]
reverse_word_dict = objetcs["reverse_word_dict"]
max_word_len = objetcs["max_word_len"]
num_words = len(word_dict)

print("word/char dictionary built. Start making inputs.")


if os.path.exists("cache/data_sets.pt") is False:

test_text = read_data("./tests.txt")
test_set = np.array(text2vec(test_text, char_dict, max_word_len))

# Labels are next-word index in word_dict with the same length as inputs
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

category = {"tests": test_set, "tlabel": test_label}
torch.save(category, "cache/data_sets.pt")
else:
data_sets = torch.load("cache/data_sets.pt")
test_set = data_sets["tests"]
test_label = data_sets["tlabel"]
train_set = data_sets["tdata"]
train_label = data_sets["trlabel"]


DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label ")
data = DataTuple( test_input=test_set,
test_label=test_label, train_label=train_label, train_input=train_set)

print("Loaded data sets. Start building network.")



USE_GPU = True
cnn_batch_size = 700
lstm_seq_len = 35
lstm_batch_size = 20

net = torch.load("cache/net.pkl")
Options = namedtuple("Options", [ "cnn_batch_size", "lstm_seq_len",
"max_word_len", "lstm_batch_size", "word_embed_dim"])
opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
lstm_seq_len=lstm_seq_len,
max_word_len=max_word_len,
lstm_batch_size=lstm_batch_size,
word_embed_dim=word_embed_dim)


print("Network built. Start testing.")

test(net, data, opt)
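
The truncate-and-fold reshape above is easy to check in isolation; a small sketch with made-up sizes (values chosen only for illustration):

import torch

lstm_seq_len, max_word_len = 35, 19
flat = torch.zeros(1000, max_word_len + 2)              # one row of char ids per token
num_seq = flat.size()[0] // lstm_seq_len                # 28 complete sequences
folded = flat[:num_seq * lstm_seq_len, :].view(-1, lstm_seq_len, max_word_len + 2)
print(folded.size())                                    # torch.Size([28, 35, 21])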

+ 0
- 3761
reproduction/Char-aware_NLM/test.txt
File diff suppressed because it is too large
View File


+ 0
- 275
reproduction/Char-aware_NLM/train.py View File

@@ -1,275 +0,0 @@
import os
from collections import namedtuple

import numpy as np
import torch.optim as optim

from .model import charLM
from .test import test
from .utilities import *


def preprocess():
word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "tests.txt")
num_words = len(word_dict)
num_char = len(char_dict)
char_dict["BOW"] = num_char+1
char_dict["EOW"] = num_char+2
char_dict["PAD"] = 0
# dict of (int, string)
reverse_word_dict = {value:key for key, value in word_dict.items()}
max_word_len = max([len(word) for word in word_dict])

objects = {
"word_dict": word_dict,
"char_dict": char_dict,
"reverse_word_dict": reverse_word_dict,
"max_word_len": max_word_len
}
torch.save(objects, "cache/prep.pt")
print("Preprocess done.")


def to_var(x):
if torch.cuda.is_available():
x = x.cuda()
return Variable(x)


def train(net, data, opt):
"""
:param net: the pytorch model
:param data: numpy array
:param opt: named tuple
1. random seed
2. define local input
3. training settings: learning rate, loss, etc.
4. main loop epoch
5. batchify
6. validation
7. save model
"""
torch.manual_seed(1024)

train_input = torch.from_numpy(data.train_input)
train_label = torch.from_numpy(data.train_label)
valid_input = torch.from_numpy(data.valid_input)
valid_label = torch.from_numpy(data.valid_label)

# [num_seq, seq_len, max_word_len+2]
num_seq = train_input.size()[0] // opt.lstm_seq_len
train_input = train_input[:num_seq*opt.lstm_seq_len, :]
train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

num_seq = valid_input.size()[0] // opt.lstm_seq_len
valid_input = valid_input[:num_seq*opt.lstm_seq_len, :]
valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

num_epoch = opt.epochs
num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size
learning_rate = opt.init_lr
old_PPL = 100000
best_PPL = 100000

# Log-SoftMax
criterion = nn.CrossEntropyLoss()
# word_embed_dim == hidden_size (the number of LSTM hidden units)
hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))


for epoch in range(num_epoch):

################ Validation ####################
net.eval()
loss_batch = []
PPL_batch = []
iterations = valid_input.size()[0] // opt.lstm_batch_size
valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size*opt.lstm_seq_len)


for t in range(iterations):
batch_input = valid_generator.__next__()
batch_label = vlabel_generator.__next__()

hidden = [state.detach() for state in hidden]
valid_output, hidden = net(to_var(batch_input), hidden)

length = valid_output.size()[0]

# [num_sample-1, len(word_dict)] vs [num_sample-1]
valid_loss = criterion(valid_output, to_var(batch_label))

PPL = torch.exp(valid_loss.data)

loss_batch.append(float(valid_loss))
PPL_batch.append(float(PPL))

PPL = np.mean(PPL_batch)
print("[epoch {}] valid PPL={}".format(epoch, PPL))
print("valid loss={}".format(np.mean(loss_batch)))
print("PPL decrease={}".format(float(old_PPL - PPL)))

# Preserve the best model
if best_PPL > PPL:
best_PPL = PPL
torch.save(net.state_dict(), "cache/model.pt")
torch.save(net, "cache/net.pkl")

# Adjust the learning rate
if float(old_PPL - PPL) <= 1.0:
learning_rate /= 2
print("halved lr:{}".format(learning_rate))

old_PPL = PPL

##################################################
#################### Training ####################
net.train()
optimizer = optim.SGD(net.parameters(),
lr = learning_rate,
momentum=0.85)

# split the first dim
input_generator = batch_generator(train_input, opt.lstm_batch_size)
label_generator = batch_generator(train_label, opt.lstm_batch_size*opt.lstm_seq_len)

for t in range(num_iter_per_epoch):
batch_input = input_generator.__next__()
batch_label = label_generator.__next__()

# detach hidden state of LSTM from last batch
hidden = [state.detach() for state in hidden]

output, hidden = net(to_var(batch_input), hidden)
# [num_word, vocab_size]
loss = criterion(output, to_var(batch_label))

net.zero_grad()
loss.backward()
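# clip the global L2 norm of all gradients to 5 before the update to avoid exploding gradients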
torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2)
optimizer.step()
if (t+1) % 100 == 0:
print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch+1,
t+1, float(loss.data), float(np.exp(loss.data))))


torch.save(net.state_dict(), "cache/model.pt")
print("Training finished.")


################################################################

if __name__=="__main__":

word_embed_dim = 300
char_embedding_dim = 15

if os.path.exists("cache/prep.pt") is False:
preprocess()

objects = torch.load("cache/prep.pt")

word_dict = objects["word_dict"]
char_dict = objects["char_dict"]
reverse_word_dict = objects["reverse_word_dict"]
max_word_len = objects["max_word_len"]
num_words = len(word_dict)

print("word/char dictionary built. Start making inputs.")


if os.path.exists("cache/data_sets.pt") is False:
train_text = read_data("./train.txt")
valid_text = read_data("./charlm.txt")
test_text = read_data("./tests.txt")

train_set = np.array(text2vec(train_text, char_dict, max_word_len))
valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
test_set = np.array(text2vec(test_text, char_dict, max_word_len))

# Labels are next-word index in word_dict with the same length as inputs
train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

category = {"tdata": train_set, "vdata": valid_set, "tests": test_set,
"trlabel":train_label, "vlabel":valid_label, "tlabel":test_label}
torch.save(category, "cache/data_sets.pt")
else:
data_sets = torch.load("cache/data_sets.pt")
train_set = data_sets["tdata"]
valid_set = data_sets["vdata"]
test_set = data_sets["tests"]
train_label = data_sets["trlabel"]
valid_label = data_sets["vlabel"]
test_label = data_sets["tlabel"]


DataTuple = namedtuple("DataTuple",
"train_input train_label valid_input valid_label test_input test_label")
data = DataTuple(train_input=train_set,
train_label=train_label,
valid_input=valid_set,
valid_label=valid_label,
test_input=test_set,
test_label=test_label)

print("Loaded data sets. Start building network.")



USE_GPU = True
cnn_batch_size = 700
lstm_seq_len = 35
lstm_batch_size = 20
# cnn_batch_size == lstm_seq_len * lstm_batch_size

net = charLM(char_embedding_dim,
word_embed_dim,
num_words,
len(char_dict),
use_gpu=USE_GPU)

for param in net.parameters():
nn.init.uniform(param.data, -0.05, 0.05)


Options = namedtuple("Options", [
"cnn_batch_size", "init_lr", "lstm_seq_len",
"max_word_len", "lstm_batch_size", "epochs",
"word_embed_dim"])
opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
init_lr=1.0,
lstm_seq_len=lstm_seq_len,
max_word_len=max_word_len,
lstm_batch_size=lstm_batch_size,
epochs=35,
word_embed_dim=word_embed_dim)


print("Network built. Start training.")


# You can stop training at any time with Ctrl+C
try:
train(net, data, opt)
except KeyboardInterrupt:
print('-' * 89)
print('Exiting from training early')


torch.save(net, "cache/net.pkl")
print("save net")


test(net, data, opt)
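
Detaching the hidden state at every batch is what makes this truncated backpropagation through time. A self-contained sketch of the same pattern with a toy LSTM (toy sizes, not the charLM network):

import torch
import torch.nn as nn
from torch.autograd import Variable

lstm = nn.LSTM(input_size=8, hidden_size=8, num_layers=2)
hidden = (Variable(torch.zeros(2, 4, 8)), Variable(torch.zeros(2, 4, 8)))

for step in range(3):
    # cut the graph at the batch boundary so backward() never reaches into earlier batches
    hidden = tuple(state.detach() for state in hidden)
    x = Variable(torch.randn(5, 4, 8))    # (seq_len, batch, features)
    out, hidden = lstm(x, hidden)
    out.sum().backward()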

+ 0
- 42068
reproduction/Char-aware_NLM/train.txt
File diff suppressed because it is too large
View File


+ 0
- 86
reproduction/Char-aware_NLM/utilities.py View File

@@ -1,86 +0,0 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F



def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t*batch_size:(t+1)*batch_size]


def text2vec(words, char_dict, max_word_len):
    """ Return list of list of int """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """ convert the input strings into character embeddings """
    # input_words == list of string
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []
    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert string to word embedding
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns
        word_encoding = F.pad(word_encoding, (0, max_word_len-word_encoding.size()[1]+2)).data
        # create dimension
        word_encoding = word_encoding.unsqueeze(0)

        tensor_list.append(word_encoding)

    return torch.cat(tensor_list, 0)


def read_data(file_name):
    # Return: list of strings
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    import re
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    # vocabulary == dict of (word, int)
    # Return: dict of (char, int), starting from 1
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
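
Taken together, these helpers turn raw text into the character-id matrix the model consumes. A small usage sketch, assuming the functions above are in scope, with an in-memory toy vocabulary instead of the train/valid/test files (the special PAD/BOW/EOW ids are added the same way train.py does):

import numpy as np

words = ["the", "cat", "sat"]
word_dict = {w: i for i, w in enumerate(set(words))}
char_dict = get_char_dict(word_dict)                        # chars numbered from 1
n = len(char_dict)
char_dict.update({"PAD": 0, "BOW": n + 1, "EOW": n + 2})

max_word_len = max(len(w) for w in word_dict)
vec = np.array(text2vec(words, char_dict, max_word_len))
print(vec.shape)                                            # (3, max_word_len + 2)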


+ 0
- 3370
reproduction/Char-aware_NLM/valid.txt
File diff suppressed because it is too large
View File


+ 0
- 36
reproduction/HAN-document_classification/README.md View File

@@ -1,36 +0,0 @@
## Introduction
This is a PyTorch implementation of the [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf) paper.
* The dataset is 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews
* Documents and sentences are tokenized with [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/)
* Both CPU & GPU are supported
* The best accuracy is 71%, matching the performance reported in the paper

## Requirement
* python 3.6
* pytorch = 0.3.0
* numpy
* gensim
* nltk
* coreNLP

## Parameters
According to the paper and my experiments, I set the model parameters as follows (a construction sketch follows the tables):
|word embedding dimension|GRU hidden size|GRU layer|word/sentence context vector dimension|
|---|---|---|---|
|200|50|1|100|

And the training parameters:
|Epoch|learning rate|momentum|batch size|
|---|---|---|---|
|3|0.01|0.9|64|
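
These values map directly onto the constructor in model.py and the optimizer in train.py; a minimal sketch (illustrative only, not an extra script):
```
import torch
from model import HAN

net = HAN(input_size=200, output_size=5,
          word_hidden_size=50, word_num_layers=1, word_context_size=100,
          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
num_epoch, batch_size = 3, 64
```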

## Run
1. Prepare the dataset. Download the [data set](https://www.yelp.com/dataset) and unzip the customer reviews into a single file. Use preprocess.py to transform the file into the data set for model input.
2. Train the model. Word embeddings of the training data are stored in 'yelp.word2vec'. The model will be trained and automatically saved to 'model.dict'.
```
python train.py
```
3. Test the model.
```
python evaluate.py
```

BIN
reproduction/HAN-document_classification/data/test_samples.pkl View File


BIN
reproduction/HAN-document_classification/data/train_samples.pkl View File


BIN
reproduction/HAN-document_classification/data/yelp.word2vec View File


+ 0
- 44
reproduction/HAN-document_classification/evaluate.py View File

@@ -1,44 +0,0 @@
from model import *
from train import *

def evaluate(net, dataset, batch_size=64, use_cuda=False):
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate, num_workers=0)
    count = 0
    if use_cuda:
        net.cuda()
    for i, batch_samples in enumerate(dataloader):
        x, y = batch_samples
        doc_list = []
        for sample in x:
            doc = []
            for sent_vec in sample:
                if use_cuda:
                    sent_vec = sent_vec.cuda()
                doc.append(Variable(sent_vec, volatile=True))
            doc_list.append(pack_sequence(doc))
        if use_cuda:
            y = y.cuda()
        predicts = net(doc_list)
        p, idx = torch.max(predicts, dim=1)
        idx = idx.data
        count += torch.sum(torch.eq(idx, y))
    return count


if __name__ == '__main__':
    '''
    Evaluate the performance of model
    '''
    from gensim.models import Word2Vec
    import gensim
    from gensim import models

    embed_model = Word2Vec.load('yelp.word2vec')
    embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
    del embed_model

    net = HAN(input_size=200, output_size=5,
              word_hidden_size=50, word_num_layers=1, word_context_size=100,
              sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
    net.load_state_dict(torch.load('model.dict'))
    test_dataset = YelpDocSet('reviews', 199, 4, embedding)
    correct = evaluate(net, test_dataset, use_cuda=True)
    print('accuracy {}'.format(correct / len(test_dataset)))

+ 0
- 110
reproduction/HAN-document_classification/model.py View File

@@ -1,110 +0,0 @@
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

def pack_sequence(tensor_seq, padding_value=0.0):
if len(tensor_seq) <= 0:
return
length = [v.size(0) for v in tensor_seq]
max_len = max(length)
size = [len(tensor_seq), max_len]
size.extend(list(tensor_seq[0].size()[1:]))
ans = torch.Tensor(*size).fill_(padding_value)
if tensor_seq[0].data.is_cuda:
ans = ans.cuda()
ans = Variable(ans)
for i, v in enumerate(tensor_seq):
ans[i, :length[i], :] = v
return ans

class HAN(nn.Module):
def __init__(self, input_size, output_size,
word_hidden_size, word_num_layers, word_context_size,
sent_hidden_size, sent_num_layers, sent_context_size):
super(HAN, self).__init__()

self.word_layer = AttentionNet(input_size,
word_hidden_size,
word_num_layers,
word_context_size)
self.sent_layer = AttentionNet(2* word_hidden_size,
sent_hidden_size,
sent_num_layers,
sent_context_size)
self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=1)

def forward(self, batch_doc):
# input is a sequence of matrix
doc_vec_list = []
for doc in batch_doc:
sent_mat = self.word_layer(doc) # doc's dim (num_sent, seq_len, word_dim)
doc_vec_list.append(sent_mat) # sent_mat's dim (num_sent, vec_dim)
doc_vec = self.sent_layer(pack_sequence(doc_vec_list))
output = self.softmax(self.output_layer(doc_vec))
return output

class AttentionNet(nn.Module):
def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
super(AttentionNet, self).__init__()
self.input_size = input_size
self.gru_hidden_size = gru_hidden_size
self.gru_num_layers = gru_num_layers
self.context_vec_size = context_vec_size

# Encoder
self.gru = nn.GRU(input_size=input_size,
hidden_size=gru_hidden_size,
num_layers=gru_num_layers,
batch_first=True,
bidirectional=True)
# Attention
self.fc = nn.Linear(2* gru_hidden_size, context_vec_size)
self.tanh = nn.Tanh()
self.softmax = nn.Softmax(dim=1)
# context vector
self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1))
self.context_vec.data.uniform_(-0.1, 0.1)

def forward(self, inputs):
# GRU part
h_t, hidden = self.gru(inputs) # inputs's dim (batch_size, seq_len, word_dim)
u = self.tanh(self.fc(h_t))
# Attention part
alpha = self.softmax(torch.matmul(u, self.context_vec)) # u's dim (batch_size, seq_len, context_vec_size)
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha) # alpha's dim (batch_size, seq_len, 1)
return torch.squeeze(output, dim=2) # output's dim (batch_size, 2*hidden_size, 1)


if __name__ == '__main__':
'''
Test the model correctness
'''
import numpy as np
use_cuda = True
net = HAN(input_size=200, output_size=5,
word_hidden_size=50, word_num_layers=1, word_context_size=100,
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.NLLLoss()
test_time = 10
batch_size = 64
if use_cuda:
net.cuda()
print('test training')
for step in range(test_time):
x_data = [torch.randn(np.random.randint(1,10), 200, 200) for i in range(batch_size)]
y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)])
if use_cuda:
x_data = [x_i.cuda() for x_i in x_data]
y_data = y_data.cuda()
x = [Variable(x_i) for x_i in x_data]
y = Variable(y_data)
predict = net(x)
loss = criterion(predict, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(loss.data[0])
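
The attention in `AttentionNet.forward` is a learned weighted average over GRU states. A self-contained numeric sketch of those three lines with toy shapes (random tensors, no GRU):

import torch
import torch.nn as nn

batch_size, seq_len, gru_hidden_size, context_vec_size = 2, 4, 3, 5
h_t = torch.randn(batch_size, seq_len, 2 * gru_hidden_size)
fc = nn.Linear(2 * gru_hidden_size, context_vec_size)
context_vec = torch.randn(context_vec_size, 1)

u = torch.tanh(fc(h_t))                                       # (batch, seq_len, context_vec_size)
alpha = torch.softmax(torch.matmul(u, context_vec), dim=1)    # (batch, seq_len, 1) attention weights
output = torch.bmm(torch.transpose(h_t, 1, 2), alpha)         # (batch, 2*gru_hidden_size, 1)
print(output.squeeze(dim=2).size())                           # (batch, 2*gru_hidden_size)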

+ 0
- 51
reproduction/HAN-document_classification/preprocess.py View File

@@ -1,51 +0,0 @@
'''
Tokenize the Yelp dataset's documents using Stanford CoreNLP
'''

import pickle
import json
import nltk
from nltk.tokenize import stanford
import os

input_filename = 'review.json'

# config for stanford core nlp
os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
tokenizer = stanford.CoreNLPTokenizer()

in_dirname = 'review'
out_dirname = 'reviews'


f = open(input_filename, encoding='utf-8')
samples = []
j = 0
for i, line in enumerate(f.readlines()):
review = json.loads(line)
samples.append((review['stars'], review['text']))
if (i+1) % 5000 == 0:
print(i)
pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
j += 1
samples = []
pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb'))
# samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb'))
# print(samples[0])


for fn in os.listdir(in_dirname):
print(fn)
precessed = []
for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')):
tokens = []
sents = nltk.tokenize.sent_tokenize(text)
for s in sents:
tokens.append(tokenizer.tokenize(s))
precessed.append((stars, tokens))
# print(tokens)
if len(precessed) % 100 == 0:
print(len(precessed))
pickle.dump(precessed, open(os.path.join(out_dirname, fn), 'wb'))
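
Each output pickle holds up to 5000 `(stars, tokenized_sentences)` pairs. A quick way to sanity-check one processed file (the path is illustrative):

import pickle

with open('reviews/samples0.pkl', 'rb') as f:    # hypothetical output file
    processed = pickle.load(f)

stars, sentences = processed[0]
print(stars)              # star rating, 1-5
print(sentences[0][:10])  # first ten tokens of the first sentence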

+ 0
- 167
reproduction/HAN-document_classification/train.py View File

@@ -1,167 +0,0 @@
import os
import pickle

import nltk
import numpy as np
import torch

from model import *

class SentIter:
def __init__(self, dirname, count):
self.dirname = dirname
self.count = int(count)

def __iter__(self):
for f in os.listdir(self.dirname)[:self.count]:
with open(os.path.join(self.dirname, f), 'rb') as f:
for y, x in pickle.load(f):
for sent in x:
yield sent

def train_word_vec():
# load data
dirname = 'reviews'
sents = SentIter(dirname, 238)
# define model and train
model = models.Word2Vec(size=200, sg=0, workers=4, min_count=5)
model.build_vocab(sents)
model.train(sents, total_examples=model.corpus_count, epochs=10)
model.save('yelp.word2vec')
print(model.wv.similarity('woman', 'man'))
print(model.wv.similarity('nice', 'awful'))

class Embedding_layer:
def __init__(self, wv, vector_size):
self.wv = wv
self.vector_size = vector_size

def get_vec(self, w):
try:
v = self.wv[w]
except KeyError as e:
v = np.random.randn(self.vector_size)
return v


from torch.utils.data import DataLoader, Dataset
class YelpDocSet(Dataset):
def __init__(self, dirname, start_file, num_files, embedding):
self.dirname = dirname
self.num_files = num_files
self._files = os.listdir(dirname)[start_file:start_file + num_files]
self.embedding = embedding
self._cache = [(-1, None) for i in range(5)]

def get_doc(self, n):
file_id = n // 5000
idx = file_id % 5
if self._cache[idx][0] != file_id:
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
self._cache[idx] = (file_id, pickle.load(f))
y, x = self._cache[idx][1][n % 5000]
sents = []
for s_list in x:
sents.append(' '.join(s_list))
x = '\n'.join(sents)
return x, y-1

def __len__(self):
return len(self._files)*5000

def __getitem__(self, n):
file_id = n // 5000
idx = file_id % 5
if self._cache[idx][0] != file_id:
print('load {} to {}'.format(file_id, idx))
with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f:
self._cache[idx] = (file_id, pickle.load(f))
y, x = self._cache[idx][1][n % 5000]
doc = []
for sent in x:
if len(sent) == 0:
continue
sent_vec = []
for word in sent:
vec = self.embedding.get_vec(word)
sent_vec.append(vec.tolist())
sent_vec = torch.Tensor(sent_vec)
doc.append(sent_vec)
if len(doc) == 0:
doc = [torch.zeros(1,200)]
return doc, y-1

def collate(iterable):
y_list = []
x_list = []
for x, y in iterable:
y_list.append(y)
x_list.append(x)
return x_list, torch.LongTensor(y_list)

def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False):
optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
criterion = nn.NLLLoss()

dataloader = DataLoader(dataset,
batch_size=batch_size,
collate_fn=collate,
num_workers=0)
running_loss = 0.0

if use_cuda:
net.cuda()
print('start training')
for epoch in range(num_epoch):
for i, batch_samples in enumerate(dataloader):
x, y = batch_samples
doc_list = []
for sample in x:
doc = []
for sent_vec in sample:
if use_cuda:
sent_vec = sent_vec.cuda()
doc.append(Variable(sent_vec))
doc_list.append(pack_sequence(doc))
if use_cuda:
y = y.cuda()
y = Variable(y)
predict = net(doc_list)
loss = criterion(predict, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()
running_loss += loss.data[0]
if i % print_size == print_size-1:
print('{}, {}'.format(i+1, running_loss/print_size))
running_loss = 0.0
torch.save(net.state_dict(), 'model.dict')
torch.save(net.state_dict(), 'model.dict')

if __name__ == '__main__':
'''
Train process
'''
from gensim.models import Word2Vec
import gensim
from gensim import models

train_word_vec()

embed_model = Word2Vec.load('yelp.word2vec')
embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size)
del embed_model
start_file = 0
dataset = YelpDocSet('reviews', start_file, 120-start_file, embedding)
print('training data size {}'.format(len(dataset)))
net = HAN(input_size=200, output_size=5,
word_hidden_size=50, word_num_layers=1, word_context_size=100,
sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
try:
net.load_state_dict(torch.load('model.dict'))
print("last time trained model has loaded")
except Exception:
print("cannot load model, train the inital model")
train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True)
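
`YelpDocSet` maps a global review index to a pickle file and an offset: each file holds 5000 reviews and five files stay cached. A tiny arithmetic sketch of that mapping (plain Python, no data needed):

reviews_per_file, cache_slots = 5000, 5

for n in (0, 4999, 5000, 26042):
    file_id = n // reviews_per_file   # which pickle the review lives in
    offset = n % reviews_per_file     # its position inside that pickle
    slot = file_id % cache_slots      # which of the 5 cache entries it occupies
    print(n, file_id, offset, slot)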

+ 0
- 14
saver/base_saver.py View File

@@ -1,14 +0,0 @@
class BaseSaver(object):
    """base class for all savers"""

    def __init__(self, save_path):
        self.save_path = save_path

    def save_bytes(self):
        raise NotImplementedError

    def save_str(self):
        raise NotImplementedError

    def compress(self):
        raise NotImplementedError

+ 0
- 12
saver/logger.py View File

@@ -1,12 +0,0 @@
from saver.base_saver import BaseSaver


class Logger(BaseSaver):
    """Logging"""

    def __init__(self, save_path):
        super(Logger, self).__init__(save_path)

    def log(self, string):
        with open(self.save_path, "a") as f:
            f.write(string)
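
A minimal usage example (the log file path is illustrative):

from saver.logger import Logger

logger = Logger("train.log")
logger.log("[epoch 1] valid PPL=123.4\n")   # appends the line to train.log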

+ 0
- 8
saver/model_saver.py View File

@@ -1,8 +0,0 @@
from saver.base_saver import BaseSaver


class ModelSaver(BaseSaver):
    """Save a model"""

    def __init__(self, save_path):
        super(ModelSaver, self).__init__(save_path)

tests/data_for_tests/charlm.txt → test/data_for_tests/charlm.txt View File


tests/data_for_tests/cws_test → test/data_for_tests/cws_test View File


tests/data_for_tests/cws_train → test/data_for_tests/cws_train View File


tests/test_charlm.py → test/test_charlm.py View File

@@ -1,8 +1,9 @@
-from action.tester import Tester
-from action.trainer import Trainer
 from loader.base_loader import ToyLoader0
 from model.char_language_model import CharLM

+from fastNLP.action import Tester
+from fastNLP.action.trainer import Trainer


 def test_charlm():
     train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True,

tests/test_loader.py → test/test_loader.py View File


tests/test_trainer.py → test/test_trainer.py View File

@@ -1,10 +1,10 @@
 from collections import namedtuple

 import numpy as np

-from action.trainer import Trainer
 from model.base_model import ToyModel

+from fastNLP.action.trainer import Trainer


 def test_trainer():
     Config = namedtuple("config", ["epochs", "validate", "save_when_better"])

tests/test_word_seg.py → test/test_word_seg.py View File

@@ -1,8 +1,9 @@
-from action.tester import Tester
-from action.trainer import Trainer
 from loader.base_loader import BaseLoader
 from model.word_seg_model import WordSegModel

+from fastNLP.action import Tester
+from fastNLP.action.trainer import Trainer


 def test_charlm():
     train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
