| @@ -1,8 +0,0 @@ | |||||
| SpaCy "Doc" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/doc.pyx#L80 | |||||
| SpaCy "Vocab" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/vocab.pyx#L25 | |||||
| SpaCy "Token" | |||||
| https://github.com/explosion/spaCy/blob/75d2a05c2938f412f0fae44748374e4de19cc2be/spacy/tokens/token.pyx#L27 | |||||
| @@ -1,46 +0,0 @@ | |||||
| from saver.logger import Logger | |||||
| class Action(object): | |||||
| """ | |||||
| base class for Trainer and Tester | |||||
| """ | |||||
| def __init__(self): | |||||
| super(Action, self).__init__() | |||||
| self.logger = Logger("logger_output.txt") | |||||
| def load_config(self, args): | |||||
| raise NotImplementedError | |||||
| def load_dataset(self, args): | |||||
| raise NotImplementedError | |||||
| def log(self, string): | |||||
| self.logger.log(string) | |||||
| def batchify(self, batch_size, X, Y=None): | |||||
| """ | |||||
| :param batch_size: int | |||||
| :param X: feature matrix of size [n_sample, m_feature] | |||||
| :param Y: label vector of size [n_sample, 1] (optional) | |||||
| :return: (num_iter, generator): num_iter is the number of steps per epoch, | |||||
| and generator yields the batch inputs | |||||
| """ | |||||
| n_samples = X.shape[0] | |||||
| num_iter = n_samples // batch_size | |||||
| if Y is None: | |||||
| generator = self._batch_generate(batch_size, num_iter, X) | |||||
| else: | |||||
| generator = self._batch_generate(batch_size, num_iter, X, Y) | |||||
| return num_iter, generator | |||||
| @staticmethod | |||||
| def _batch_generate(batch_size, num_iter, *data): | |||||
| for step in range(num_iter): | |||||
| start = batch_size * step | |||||
| end = batch_size * (step + 1) | |||||
| yield tuple([x[start:end] for x in data]) | |||||
| def make_log(self, *args): | |||||
| return "log" | |||||
| @@ -1,87 +0,0 @@ | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| from action.action import Action | |||||
| class Tester(Action): | |||||
| """docstring for Tester""" | |||||
| TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output", | |||||
| "save_loss", "batch_size"]) | |||||
| def __init__(self, test_args): | |||||
| """ | |||||
| :param test_args: named tuple | |||||
| """ | |||||
| super(Tester, self).__init__() | |||||
| self.validate_in_training = test_args.validate_in_training | |||||
| self.save_dev_input = test_args.save_dev_input | |||||
| self.valid_x = None | |||||
| self.valid_y = None | |||||
| self.save_output = test_args.save_output | |||||
| self.output = None | |||||
| self.save_loss = test_args.save_loss | |||||
| self.mean_loss = None | |||||
| self.batch_size = test_args.batch_size | |||||
| def test(self, network, data): | |||||
| print("testing") | |||||
| network.mode(test=True) # turn on the testing mode | |||||
| if self.save_dev_input: | |||||
| if self.valid_x is None: | |||||
| valid_x, valid_y = network.prepare_input(data) | |||||
| self.valid_x = valid_x | |||||
| self.valid_y = valid_y | |||||
| else: | |||||
| valid_x = self.valid_x | |||||
| valid_y = self.valid_y | |||||
| else: | |||||
| valid_x, valid_y = network.prepare_input(data) | |||||
| # split into batches by self.batch_size | |||||
| iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y) | |||||
| batch_output = list() | |||||
| loss_history = list() | |||||
| # turn on the testing mode of the network | |||||
| network.mode(test=True) | |||||
| for step in range(iterations): | |||||
| batch_x, batch_y = next(test_batch_generator) | |||||
| # forward pass from test input to predicted output | |||||
| prediction = network.data_forward(batch_x) | |||||
| loss = network.get_loss(prediction, batch_y) | |||||
| if self.save_output: | |||||
| batch_output.append(prediction.data) | |||||
| if self.save_loss: | |||||
| loss_history.append(loss) | |||||
| self.log(self.make_log(step, loss)) | |||||
| if self.save_loss: | |||||
| self.mean_loss = np.mean(np.array(loss_history)) | |||||
| if self.save_output: | |||||
| self.output = self.make_output(batch_output) | |||||
| @property | |||||
| def loss(self): | |||||
| return self.mean_loss | |||||
| @property | |||||
| def result(self): | |||||
| return self.output | |||||
| @staticmethod | |||||
| def make_output(batch_outputs): | |||||
| # construct full prediction with batch outputs | |||||
| return np.concatenate(batch_outputs, axis=0) | |||||
| def load_config(self, args): | |||||
| raise NotImplementedError | |||||
| def load_dataset(self, args): | |||||
| raise NotImplementedError | |||||
| @@ -1,93 +0,0 @@ | |||||
| from collections import namedtuple | |||||
| from .action import Action | |||||
| from .tester import Tester | |||||
| class Trainer(Action): | |||||
| """ | |||||
| Trainer is a common training pipeline shared among all models. | |||||
| """ | |||||
| TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better", | |||||
| "log_per_step", "log_validation", "batch_size"]) | |||||
| def __init__(self, train_args): | |||||
| """ | |||||
| :param train_args: namedtuple | |||||
| """ | |||||
| super(Trainer, self).__init__() | |||||
| self.n_epochs = train_args.epochs | |||||
| self.validate = train_args.validate | |||||
| self.save_when_better = train_args.save_when_better | |||||
| self.log_per_step = train_args.log_per_step | |||||
| self.log_validation = train_args.log_validation | |||||
| self.batch_size = train_args.batch_size | |||||
| def train(self, network, train_data, dev_data=None): | |||||
| """ | |||||
| :param network: the model controller | |||||
| :param train_data: raw data for training | |||||
| :param dev_data: raw data for validation | |||||
| This method will call all the base methods of network (implemented in model.base_model). | |||||
| """ | |||||
| train_x, train_y = network.prepare_input(train_data) | |||||
| iterations, train_batch_generator = self.batchify(self.batch_size, train_x, train_y) | |||||
| test_args = Tester.TestConfig(save_output=True, validate_in_training=True, | |||||
| save_dev_input=True, save_loss=True, batch_size=self.batch_size) | |||||
| evaluator = Tester(test_args) | |||||
| best_loss = 1e10 | |||||
| loss_history = list() | |||||
| for epoch in range(self.n_epochs): | |||||
| network.mode(test=False) # turn on the train mode | |||||
| network.define_optimizer() | |||||
| for step in range(iterations): | |||||
| batch_x, batch_y = next(train_batch_generator) | |||||
| prediction = network.data_forward(batch_x) | |||||
| loss = network.get_loss(prediction, batch_y) | |||||
| network.grad_backward() | |||||
| if step % self.log_per_step == 0: | |||||
| print("step ", step) | |||||
| loss_history.append(loss) | |||||
| self.log(self.make_log(epoch, step, loss)) | |||||
| #################### evaluate over dev set ################### | |||||
| if self.validate: | |||||
| if dev_data is None: | |||||
| raise RuntimeError("No validation data provided.") | |||||
| # give all controls to tester | |||||
| evaluator.test(network, dev_data) | |||||
| if self.log_validation: | |||||
| self.log(self.make_valid_log(epoch, evaluator.loss)) | |||||
| if evaluator.loss < best_loss: | |||||
| best_loss = evaluator.loss | |||||
| if self.save_when_better: | |||||
| self.save_model(network) | |||||
| # finish training | |||||
| def make_log(self, *args): | |||||
| return "make a log" | |||||
| def make_valid_log(self, *args): | |||||
| return "make a valid log" | |||||
| def save_model(self, model): | |||||
| model.save() | |||||
| def load_data(self, data_name): | |||||
| print("load data") | |||||
| def load_config(self, args): | |||||
| raise NotImplementedError | |||||
| def load_dataset(self, args): | |||||
| raise NotImplementedError | |||||
| @@ -0,0 +1,174 @@ | |||||
| import torch | |||||
| from torch import nn | |||||
| def log_sum_exp(x, dim=-1): | |||||
| max_value, _ = x.max(dim=dim, keepdim=True) | |||||
| res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value | |||||
| return res.squeeze(dim) | |||||
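| # Worked example (sketch): the subtract-the-max trick above keeps log(sum(exp(.))) finite | |||||
| # for large inputs, since log_sum_exp(x) = max(x) + log(sum(exp(x - max(x)))). For instance: | |||||
| #   x = torch.tensor([[1000.0, 1000.0]]) | |||||
| #   torch.log(torch.exp(x).sum(dim=-1))   # inf (overflow) | |||||
| #   log_sum_exp(x, dim=-1)                # tensor([1000.6931]) = 1000 + log(2) | |||||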
| def seq_len_to_byte_mask(seq_lens): | |||||
| # usually seq_lens: LongTensor, batch_size | |||||
| # return value: ByteTensor, batch_size x max_len | |||||
| batch_size = seq_lens.size(0) | |||||
| max_len = seq_lens.max() | |||||
| broadcast_arange = torch.arange(max_len).view(1, -1).repeat(batch_size, 1) | |||||
| mask = broadcast_arange.lt(seq_lens.float().view(-1, 1)) | |||||
| return mask | |||||
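| # Worked example (sketch), assuming seq_lens is a LongTensor on CPU: | |||||
| #   seq_len_to_byte_mask(torch.LongTensor([3, 1])) | |||||
| #   -> [[1, 1, 1], | |||||
| #       [1, 0, 0]]   # positions before each sequence length are 1 (dtype uint8 or bool by PyTorch version) | |||||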
| class ConditionalRandomField(nn.Module): | |||||
| def __init__(self, tag_size, include_start_end_trans=True): | |||||
| """ | |||||
| :param tag_size: int, num of tags | |||||
| :param include_start_end_trans: bool, whether to include start/end tag | |||||
| """ | |||||
| super(ConditionalRandomField, self).__init__() | |||||
| self.include_start_end_trans = include_start_end_trans | |||||
| self.tag_size = tag_size | |||||
| # entry [from_tag_id, to_tag_id] of this matrix is the transition score from from_tag_id to to_tag_id | |||||
| self.transition_m = nn.Parameter(torch.randn(tag_size, tag_size)) | |||||
| if self.include_start_end_trans: | |||||
| self.start_scores = nn.Parameter(torch.randn(tag_size)) | |||||
| self.end_scores = nn.Parameter(torch.randn(tag_size)) | |||||
| self.reset_parameter() | |||||
| def reset_parameter(self): | |||||
| nn.init.xavier_normal_(self.transition_m) | |||||
| if self.include_start_end_trans: | |||||
| nn.init.normal_(self.start_scores) | |||||
| nn.init.normal_(self.end_scores) | |||||
| def _normalizer_likelihood(self, feats, masks): | |||||
| """ | |||||
| Computes the (batch_size,) denominator term for the log-likelihood, which is the | |||||
| sum of the likelihoods across all possible state sequences. | |||||
| :param feats:FloatTensor, batch_size x max_len x tag_size | |||||
| :param masks:ByteTensor, batch_size x max_len | |||||
| :return:FloatTensor, batch_size | |||||
| """ | |||||
| batch_size, max_len, _ = feats.size() | |||||
| # alpha, batch_size x tag_size | |||||
| if self.include_start_end_trans: | |||||
| alpha = self.start_scores.view(1, -1) + feats[:, 0] | |||||
| else: | |||||
| alpha = feats[:, 0] | |||||
| # broadcast_trans_m: entry [batch_idx, to_tag_id, from_tag_id] is the transition score into to_tag_id from from_tag_id | |||||
| broadcast_trans_m = self.transition_m.permute( | |||||
| 1, 0).unsqueeze(0).repeat(batch_size, 1, 1) | |||||
| # loop | |||||
| for i in range(1, max_len): | |||||
| emit_score = feats[:, i].unsqueeze(2) | |||||
| new_alpha = broadcast_trans_m + alpha.unsqueeze(1) + emit_score | |||||
| new_alpha = log_sum_exp(new_alpha, dim=2) | |||||
| alpha = new_alpha * \ | |||||
| masks[:, i:i + 1].float() + alpha * \ | |||||
| (1 - masks[:, i:i + 1].float()) | |||||
| if self.include_start_end_trans: | |||||
| alpha = alpha + self.end_scores.view(1, -1) | |||||
| return log_sum_exp(alpha) | |||||
| def _gold_score(self, feats, tags, masks): | |||||
| """ | |||||
| Compute the score for the gold path. | |||||
| :param feats: FloatTensor, batch_size x max_len x tag_size | |||||
| :param tags: LongTensor, batch_size x max_len | |||||
| :param masks: ByteTensor, batch_size x max_len | |||||
| :return:FloatTensor, batch_size | |||||
| """ | |||||
| batch_size, max_len, _ = feats.size() | |||||
| # alpha, B x 1 | |||||
| if self.include_start_end_trans: | |||||
| alpha = self.start_scores.view(1, -1).repeat(batch_size, 1).gather(dim=1, index=tags[:, :1]) + \ | |||||
| feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||||
| else: | |||||
| alpha = feats[:, 0].gather(dim=1, index=tags[:, :1]) | |||||
| for i in range(1, max_len): | |||||
| trans_score = self.transition_m[( | |||||
| tags[:, i - 1], tags[:, i])].unsqueeze(1) | |||||
| emit_score = feats[:, i].gather(dim=1, index=tags[:, i:i + 1]) | |||||
| new_alpha = alpha + trans_score + emit_score | |||||
| alpha = new_alpha * \ | |||||
| masks[:, i:i + 1].float() + alpha * \ | |||||
| (1 - masks[:, i:i + 1].float()) | |||||
| if self.include_start_end_trans: | |||||
| last_tag_index = masks.cumsum(dim=1, dtype=torch.long)[:, -1:] - 1 | |||||
| last_from_tag_id = tags.gather(dim=1, index=last_tag_index) | |||||
| trans_score = self.end_scores.view( | |||||
| 1, -1).repeat(batch_size, 1).gather(dim=1, index=last_from_tag_id) | |||||
| alpha = alpha + trans_score | |||||
| return alpha.squeeze(1) | |||||
| def forward(self, feats, tags, masks): | |||||
| """ | |||||
| Calculate the neg log likelihood | |||||
| :param feats: FloatTensor, batch_size x max_len x tag_size | |||||
| :param tags:LongTensor, batch_size x max_len | |||||
| :param masks:ByteTensor batch_size x max_len | |||||
| :return:FloatTensor, batch_size | |||||
| """ | |||||
| all_path_score = self._normalizer_likelihood(feats, masks) | |||||
| gold_path_score = self._gold_score(feats, tags, masks) | |||||
| return all_path_score - gold_path_score | |||||
| def viterbi_decode(self, feats, masks): | |||||
| """ | |||||
| Given a feats matrix, return the best decoding path and its score for each sequence. | |||||
| :param feats: FloatTensor, batch_size x max_len x tag_size | |||||
| :param masks: ByteTensor, batch_size x max_len | |||||
| :return: List[Tuple[List[int], float]], one (path, score) pair per sequence | |||||
| """ | |||||
| batch_size, max_len, tag_size = feats.size() | |||||
| paths = torch.zeros(batch_size, max_len - 1, self.tag_size) | |||||
| if self.include_start_end_trans: | |||||
| alpha = self.start_scores.repeat(batch_size, 1) + feats[:, 0] | |||||
| else: | |||||
| alpha = feats[:, 0] | |||||
| for i in range(1, max_len): | |||||
| new_alpha = alpha.clone() | |||||
| for t in range(self.tag_size): | |||||
| pre_scores = self.transition_m[:, t].view( | |||||
| 1, self.tag_size) + alpha | |||||
| max_score, indices = pre_scores.max(dim=1) | |||||
| new_alpha[:, t] = max_score + feats[:, i, t] | |||||
| paths[:, i - 1, t] = indices | |||||
| alpha = new_alpha * \ | |||||
| masks[:, i:i + 1].float() + alpha * \ | |||||
| (1 - masks[:, i:i + 1].float()) | |||||
| if self.include_start_end_trans: | |||||
| alpha += self.end_scores.view(1, -1) | |||||
| max_scores, indices = alpha.max(dim=1) | |||||
| indices = indices.cpu().numpy() | |||||
| final_paths = [] | |||||
| paths = paths.cpu().numpy().astype(int) | |||||
| seq_lens = masks.cumsum(dim=1, dtype=torch.long)[:, -1] | |||||
| for b in range(batch_size): | |||||
| path = [indices[b]] | |||||
| for i in range(seq_lens[b] - 2, -1, -1): | |||||
| index = paths[b, i, path[-1]] | |||||
| path.append(index) | |||||
| final_paths.append(path[::-1]) | |||||
| return list(zip(final_paths, max_scores.detach().cpu().numpy())) | |||||
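| # A minimal smoke-test sketch of how the pieces above fit together (illustrative values only; | |||||
| # it uses the ConditionalRandomField and seq_len_to_byte_mask defined above): | |||||
| if __name__ == "__main__": | |||||
|     batch_size, max_len, tag_size = 2, 5, 4 | |||||
|     crf = ConditionalRandomField(tag_size) | |||||
|     feats = torch.randn(batch_size, max_len, tag_size)               # emission scores from an encoder | |||||
|     tags = torch.randint(0, tag_size, (batch_size, max_len)).long()  # gold tag sequences | |||||
|     masks = seq_len_to_byte_mask(torch.LongTensor([5, 3]))           # batch_size x max_len | |||||
|     nll = crf(feats, tags, masks)               # per-sequence negative log-likelihood | |||||
|     nll.mean().backward()                       # gradients flow into the transition parameters | |||||
|     decoded = crf.viterbi_decode(feats, masks)  # [(best_path, best_score), ...] per sequence | |||||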
| @@ -0,0 +1,19 @@ | |||||
| import torch | |||||
| from fastNLP.modules.utils import mask_softmax | |||||
| class Attention(torch.nn.Module): | |||||
| def __init__(self, normalize=False): | |||||
| super(Attention, self).__init__() | |||||
| self.normalize = normalize | |||||
| def forward(self, query, memory, mask): | |||||
| similarities = self._atten_forward(query, memory) | |||||
| if self.normalize: | |||||
| return mask_softmax(similarities, mask) | |||||
| return similarities | |||||
| def _atten_forward(self, query, memory): | |||||
| raise NotImplementedError | |||||
| @@ -0,0 +1,9 @@ | |||||
| from fastNLP.modules.attention.attention import Attention | |||||
| class LinearAttention(Attention): | |||||
| def __init__(self, normalize=False): | |||||
| super(LinearAttention, self).__init__(normalize) | |||||
| def _atten_forward(self, query, memory): | |||||
| raise NotImplementedError | |||||
| @@ -0,0 +1,9 @@ | |||||
| import torch | |||||
| def mask_softmax(matrix, mask): | |||||
| if mask is None: | |||||
| result = torch.nn.functional.softmax(matrix, dim=-1) | |||||
| else: | |||||
| raise NotImplementedError | |||||
| return result | |||||
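| # A sketch of one common way the masked branch could be implemented (not the project's | |||||
| # chosen implementation): give masked-out positions a very low score before the softmax, e.g. | |||||
| #   masked_matrix = matrix.masked_fill(mask == 0, -1e9) | |||||
| #   result = torch.nn.functional.softmax(masked_matrix, dim=-1) | |||||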
| @@ -1,37 +0,0 @@ | |||||
| class BaseLoader(object): | |||||
| """docstring for BaseLoader""" | |||||
| def __init__(self, data_name, data_path): | |||||
| super(BaseLoader, self).__init__() | |||||
| self.data_name = data_name | |||||
| self.data_path = data_path | |||||
| def load(self): | |||||
| """ | |||||
| :return: string | |||||
| """ | |||||
| with open(self.data_path, "r", encoding="utf-8") as f: | |||||
| text = f.read() | |||||
| return text | |||||
| def load_lines(self): | |||||
| with open(self.data_path, "r", encoding="utf=8") as f: | |||||
| text = f.readlines() | |||||
| return text | |||||
| class ToyLoader0(BaseLoader): | |||||
| """ | |||||
| For charLM | |||||
| """ | |||||
| def __init__(self, name, path): | |||||
| super(ToyLoader0, self).__init__(name, path) | |||||
| def load(self): | |||||
| with open(self.data_path, 'r') as f: | |||||
| corpus = f.read().lower() | |||||
| import re | |||||
| corpus = re.sub(r"<unk>", "unk", corpus) | |||||
| return corpus.split() | |||||
| @@ -1,13 +0,0 @@ | |||||
| from loader.base_loader import BaseLoader | |||||
| class ConfigLoader(BaseLoader): | |||||
| """loader for configuration files""" | |||||
| def __init__(self, data_name, data_path): | |||||
| super(ConfigLoader, self).__init__(data_name, data_path) | |||||
| self.config = self.parse(super(ConfigLoader, self).load()) | |||||
| @staticmethod | |||||
| def parse(string): | |||||
| raise NotImplementedError | |||||
| @@ -1,47 +0,0 @@ | |||||
| from loader.base_loader import BaseLoader | |||||
| class DatasetLoader(BaseLoader): | |||||
| """"loader for data sets""" | |||||
| def __init__(self, data_name, data_path): | |||||
| super(DatasetLoader, self).__init__(data_name, data_path) | |||||
| class ConllLoader(DatasetLoader): | |||||
| """loader for conll format files""" | |||||
| def __init__(self, data_name, data_path): | |||||
| """ | |||||
| :param str data_name: the name of the conll data set | |||||
| :param str data_path: the path to the conll data set | |||||
| """ | |||||
| super(ConllLoader, self).__init__(data_name, data_path) | |||||
| self.data_set = self.parse(self.load()) | |||||
| def load(self): | |||||
| """ | |||||
| :return: list of str, all lines in the CoNLL file | |||||
| """ | |||||
| with open(self.data_path, "r", encoding="utf-8") as f: | |||||
| lines = f.readlines() | |||||
| return lines | |||||
| @staticmethod | |||||
| def parse(lines): | |||||
| """ | |||||
| :param list lines: a list containing all lines in a CoNLL file | |||||
| :return: a 3D list: sentences, each a list of tokens, each a list of column fields | |||||
| """ | |||||
| sentences = list() | |||||
| tokens = list() | |||||
| for line in lines: | |||||
| if line[0] == "#": | |||||
| # skip the comments | |||||
| continue | |||||
| if line == "\n": | |||||
| sentences.append(tokens) | |||||
| tokens = [] | |||||
| continue | |||||
| tokens.append(line.split()) | |||||
| return sentences | |||||
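| # Worked example (sketch) of the structure parse returns for a small CoNLL-style input: | |||||
| #   lines = ["# a comment\n", "1 Hello NOUN\n", "2 world NOUN\n", "\n"] | |||||
| #   ConllLoader.parse(lines) | |||||
| #   -> [[["1", "Hello", "NOUN"], ["2", "world", "NOUN"]]] | |||||
| # i.e. a list of sentences, each sentence a list of tokens, each token a list of column fields. | |||||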
| @@ -1,8 +0,0 @@ | |||||
| from loader.base_loader import BaseLoader | |||||
| class EmbedLoader(BaseLoader): | |||||
| """docstring for EmbedLoader""" | |||||
| def __init__(self, data_name, data_path): | |||||
| super(EmbedLoader, self).__init__(data_name, data_path) | |||||
| @@ -1,158 +0,0 @@ | |||||
| import numpy as np | |||||
| class BaseModel(object): | |||||
| """The base class of all models. | |||||
| This class and its subclasses are actually "wrappers" of the PyTorch models. | |||||
| They act as an interface between Trainer and the deep learning networks. | |||||
| This interface provides the following methods to be called by Trainer. | |||||
| - prepare_input | |||||
| - mode | |||||
| - define_optimizer | |||||
| - data_forward | |||||
| - grad_backward | |||||
| - get_loss | |||||
| """ | |||||
| def __init__(self): | |||||
| pass | |||||
| def prepare_input(self, data): | |||||
| """ | |||||
| Perform data transformation from raw input to vector/matrix inputs. | |||||
| :param data: raw inputs | |||||
| :return (X, Y): tuple, input features and labels | |||||
| """ | |||||
| raise NotImplementedError | |||||
| def mode(self, test=False): | |||||
| """ | |||||
| Tell the network to be trained or not, required by PyTorch. | |||||
| :param test: bool | |||||
| """ | |||||
| raise NotImplementedError | |||||
| def define_optimizer(self): | |||||
| """ | |||||
| Define PyTorch optimizer specified by the model. | |||||
| """ | |||||
| raise NotImplementedError | |||||
| def data_forward(self, *x): | |||||
| """ | |||||
| Forward pass of the data. | |||||
| :param x: input feature matrix and label vector | |||||
| :return: output by the model | |||||
| """ | |||||
| # required by PyTorch nn | |||||
| raise NotImplementedError | |||||
| def grad_backward(self): | |||||
| """ | |||||
| Perform gradient descent to update the model parameters. | |||||
| """ | |||||
| raise NotImplementedError | |||||
| def get_loss(self, pred, truth): | |||||
| """ | |||||
| Compute loss given model prediction and ground truth. Loss function specified by the model. | |||||
| :param pred: prediction label vector | |||||
| :param truth: ground truth label vector | |||||
| :return: a scalar | |||||
| """ | |||||
| raise NotImplementedError | |||||
| class ToyModel(BaseModel): | |||||
| """This is for code testing.""" | |||||
| def __init__(self): | |||||
| super(ToyModel, self).__init__() | |||||
| self.test_mode = False | |||||
| self.weight = np.random.rand(5, 1) | |||||
| self.bias = np.random.rand() | |||||
| self._loss = 0 | |||||
| def prepare_input(self, data): | |||||
| return data[:, :-1], data[:, -1] | |||||
| def mode(self, test=False): | |||||
| self.test_mode = test | |||||
| def data_forward(self, x): | |||||
| return np.matmul(x, self.weight) + self.bias | |||||
| def grad_backward(self): | |||||
| print("loss gradient backward") | |||||
| def get_loss(self, pred, truth): | |||||
| self._loss = np.mean(np.square(pred - truth)) | |||||
| return self._loss | |||||
| def define_optimizer(self): | |||||
| pass | |||||
| class Vocabulary(object): | |||||
| """A look-up table that allows you to access `Lexeme` objects. The `Vocab` | |||||
| instance also provides access to the `StringStore`, and owns underlying | |||||
| data that is shared between `Doc` objects. | |||||
| """ | |||||
| def __init__(self): | |||||
| """Create the vocabulary. | |||||
| RETURNS (Vocab): The newly constructed object. | |||||
| """ | |||||
| self.data_frame = None | |||||
| class Document(object): | |||||
| """A sequence of Token objects. Access sentences and named entities, export | |||||
| annotations to numpy arrays, losslessly serialize to compressed binary | |||||
| strings. The `Doc` object holds an array of `Token` objects. The | |||||
| Python-level `Token` and `Span` objects are views of this array, i.e. | |||||
| they don't own the data themselves. -- spacy | |||||
| """ | |||||
| def __init__(self, vocab, words=None, spaces=None): | |||||
| """Create a Doc object. | |||||
| vocab (Vocab): A vocabulary object, which must match any models you | |||||
| want to use (e.g. tokenizer, parser, entity recognizer). | |||||
| words (list or None): A list of unicode strings, to add to the document | |||||
| as words. If `None`, defaults to empty list. | |||||
| spaces (list or None): A list of boolean values, of the same length as | |||||
| words. True means that the word is followed by a space, False means | |||||
| it is not. If `None`, defaults to `[True]*len(words)` | |||||
| RETURNS (Doc): The newly constructed object. | |||||
| """ | |||||
| self.vocab = vocab | |||||
| self.spaces = spaces | |||||
| self.words = words | |||||
| if spaces is None: | |||||
| self.spaces = [True] * len(self.words) | |||||
| elif len(spaces) != len(self.words): | |||||
| raise ValueError("dismatch spaces and words") | |||||
| def get_chunker(self, vocab): | |||||
| return None | |||||
| def push_back(self, vocab): | |||||
| pass | |||||
| class Token(object): | |||||
| """An individual token – i.e. a word, punctuation symbol, whitespace, | |||||
| etc. | |||||
| """ | |||||
| def __init__(self, vocab, doc, offset): | |||||
| """Construct a `Token` object. | |||||
| vocab (Vocabulary): A storage container for lexical types. | |||||
| doc (Document): The parent document. | |||||
| offset (int): The index of the token within the document. | |||||
| """ | |||||
| self.vocab = vocab | |||||
| self.doc = doc | |||||
| self.token = doc.words[offset]  # a Token is a view into the Document's word list | |||||
| self.i = offset | |||||
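| # A minimal usage sketch of the toy Vocabulary/Document/Token classes above | |||||
| # (illustrative only; it assumes Token reads the word through doc.words): | |||||
| if __name__ == "__main__": | |||||
|     vocab = Vocabulary() | |||||
|     doc = Document(vocab, words=["Hello", "world"]) | |||||
|     token = Token(vocab, doc, 0) | |||||
|     print(token.token, doc.spaces)  # Hello [True, True] | |||||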
| @@ -1,356 +0,0 @@ | |||||
| import os | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| import torch.optim as optim | |||||
| from torch.autograd import Variable | |||||
| from model.base_model import BaseModel | |||||
| USE_GPU = True | |||||
| class CharLM(BaseModel): | |||||
| """ | |||||
| Controller of the Character-level Neural Language Model | |||||
| To do: | |||||
| - where the data goes, call data savers. | |||||
| """ | |||||
| DataTuple = namedtuple("DataTuple", ["feature", "label"]) | |||||
| def __init__(self, lstm_batch_size, lstm_seq_len): | |||||
| super(CharLM, self).__init__() | |||||
| """ | |||||
| Settings: should come from config loader or pre-processing | |||||
| """ | |||||
| self.word_embed_dim = 300 | |||||
| self.char_embedding_dim = 15 | |||||
| self.cnn_batch_size = lstm_batch_size * lstm_seq_len | |||||
| self.lstm_seq_len = lstm_seq_len | |||||
| self.lstm_batch_size = lstm_batch_size | |||||
| self.num_epoch = 10 | |||||
| self.old_PPL = 100000 | |||||
| self.best_PPL = 100000 | |||||
| """ | |||||
| These parameters are set by pre-processing. | |||||
| """ | |||||
| self.max_word_len = None | |||||
| self.num_char = None | |||||
| self.vocab_size = None | |||||
| self.preprocess("./data_for_tests/charlm.txt") | |||||
| self.data = None # named tuple to store all data set | |||||
| self.data_ready = False | |||||
| self.criterion = nn.CrossEntropyLoss() | |||||
| self._loss = None | |||||
| self.use_gpu = USE_GPU | |||||
| # word_emb_dim == hidden_size / num of hidden units | |||||
| self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim)), | |||||
| to_var(torch.zeros(2, self.lstm_batch_size, self.word_embed_dim))) | |||||
| self.model = charLM(self.char_embedding_dim, | |||||
| self.word_embed_dim, | |||||
| self.vocab_size, | |||||
| self.num_char, | |||||
| use_gpu=self.use_gpu) | |||||
| for param in self.model.parameters(): | |||||
| nn.init.uniform(param.data, -0.05, 0.05) | |||||
| self.learning_rate = 0.1 | |||||
| self.optimizer = None | |||||
| def prepare_input(self, raw_text): | |||||
| """ | |||||
| :param raw_text: raw input text consisting of words | |||||
| :return: torch.Tensor, torch.Tensor | |||||
| feature matrix, label vector | |||||
| This function is only called once in Trainer.train, but may be called multiple times in Tester.test, | |||||
| so Tester caches the prepared test input for repeated calls. | |||||
| """ | |||||
| if os.path.exists("cache/prep.pt") is False: | |||||
| self.preprocess("./data_for_tests/charlm.txt") # To do: This is not good. Need to fix.. | |||||
| objects = torch.load("cache/prep.pt") | |||||
| word_dict = objects["word_dict"] | |||||
| char_dict = objects["char_dict"] | |||||
| max_word_len = self.max_word_len | |||||
| print("word/char dictionary built. Start making inputs.") | |||||
| words = raw_text | |||||
| input_vec = np.array(text2vec(words, char_dict, max_word_len)) | |||||
| # Labels are the next-word indices in word_dict, with the same length as the inputs | |||||
| input_label = np.array([word_dict[w] for w in words[1:]] + [word_dict[words[-1]]]) | |||||
| feature_input = torch.from_numpy(input_vec) | |||||
| label_input = torch.from_numpy(input_label) | |||||
| return feature_input, label_input | |||||
| def mode(self, test=False): | |||||
| if test: | |||||
| self.model.eval() | |||||
| else: | |||||
| self.model.train() | |||||
| def data_forward(self, x): | |||||
| """ | |||||
| :param x: Tensor of size [lstm_batch_size, lstm_seq_len, max_word_len+2] | |||||
| :return: Tensor of size [num_words, vocab_size] | |||||
| """ | |||||
| # additional processing of inputs after batching | |||||
| num_seq = x.size()[0] // self.lstm_seq_len | |||||
| x = x[:num_seq * self.lstm_seq_len, :] | |||||
| x = x.view(-1, self.lstm_seq_len, self.max_word_len + 2) | |||||
| # detach hidden state of LSTM from last batch | |||||
| hidden = [state.detach() for state in self.hidden] | |||||
| output, self.hidden = self.model(to_var(x), hidden) | |||||
| return output | |||||
| def grad_backward(self): | |||||
| self.model.zero_grad() | |||||
| self._loss.backward() | |||||
| torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||||
| self.optimizer.step() | |||||
| def get_loss(self, predict, truth): | |||||
| self._loss = self.criterion(predict, to_var(truth)) | |||||
| return self._loss.data # no PyTorch data structure exposed outside | |||||
| def define_optimizer(self): | |||||
| # redefine optimizer for every new epoch | |||||
| self.optimizer = optim.SGD(self.model.parameters(), lr=self.learning_rate, momentum=0.85) | |||||
| def save(self): | |||||
| print("network saved") | |||||
| # torch.save(self.model, "cache/model.pkl") | |||||
| def preprocess(self, all_text_files): | |||||
| word_dict, char_dict = create_word_char_dict(all_text_files) | |||||
| num_char = len(char_dict) | |||||
| self.vocab_size = len(word_dict) | |||||
| char_dict["BOW"] = num_char + 1 | |||||
| char_dict["EOW"] = num_char + 2 | |||||
| char_dict["PAD"] = 0 | |||||
| self.num_char = num_char + 3 | |||||
| # char_dict maps each character (string) to an integer id | |||||
| reverse_word_dict = {value: key for key, value in word_dict.items()} | |||||
| self.max_word_len = max([len(word) for word in word_dict]) | |||||
| objects = { | |||||
| "word_dict": word_dict, | |||||
| "char_dict": char_dict, | |||||
| "reverse_word_dict": reverse_word_dict, | |||||
| } | |||||
| torch.save(objects, "cache/prep.pt") | |||||
| print("Preprocess done.") | |||||
| """ | |||||
| Global Functions | |||||
| """ | |||||
| def batch_generator(x, batch_size): | |||||
| # x: [num_words, in_channel, height, width] | |||||
| # partitions x into batches | |||||
| num_step = x.size()[0] // batch_size | |||||
| for t in range(num_step): | |||||
| yield x[t * batch_size:(t + 1) * batch_size] | |||||
| def text2vec(words, char_dict, max_word_len): | |||||
| """ Return list of list of int """ | |||||
| word_vec = [] | |||||
| for word in words: | |||||
| vec = [char_dict[ch] for ch in word] | |||||
| if len(vec) < max_word_len: | |||||
| vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||||
| vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||||
| word_vec.append(vec) | |||||
| return word_vec | |||||
| def read_data(file_name): | |||||
| with open(file_name, 'r') as f: | |||||
| corpus = f.read().lower() | |||||
| import re | |||||
| corpus = re.sub(r"<unk>", "unk", corpus) | |||||
| return corpus.split() | |||||
| def get_char_dict(vocabulary): | |||||
| char_dict = dict() | |||||
| count = 1 | |||||
| for word in vocabulary: | |||||
| for ch in word: | |||||
| if ch not in char_dict: | |||||
| char_dict[ch] = count | |||||
| count += 1 | |||||
| return char_dict | |||||
| def create_word_char_dict(*file_name): | |||||
| text = [] | |||||
| for file in file_name: | |||||
| text += read_data(file) | |||||
| word_dict = {word: ix for ix, word in enumerate(set(text))} | |||||
| char_dict = get_char_dict(word_dict) | |||||
| return word_dict, char_dict | |||||
| def to_var(x): | |||||
| if torch.cuda.is_available() and USE_GPU: | |||||
| x = x.cuda() | |||||
| return Variable(x) | |||||
| """ | |||||
| Neural Network | |||||
| """ | |||||
| class Highway(nn.Module): | |||||
| """Highway network""" | |||||
| def __init__(self, input_size): | |||||
| super(Highway, self).__init__() | |||||
| self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||||
| self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||||
| def forward(self, x): | |||||
| t = F.sigmoid(self.fc1(x)) | |||||
| return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||||
| class charLM(nn.Module): | |||||
| """Character-level Neural Language Model | |||||
| CNN + highway network + LSTM | |||||
| # Input: | |||||
| 4D tensor with shape [batch_size, in_channel, height, width] | |||||
| # Output: | |||||
| 2D Tensor with shape [batch_size, vocab_size] | |||||
| # Arguments: | |||||
| char_emb_dim: the size of each character's embedding | |||||
| word_emb_dim: the size of each word's embedding | |||||
| vocab_size: num of unique words | |||||
| num_char: num of characters | |||||
| use_gpu: True or False | |||||
| """ | |||||
| def __init__(self, char_emb_dim, word_emb_dim, | |||||
| vocab_size, num_char, use_gpu): | |||||
| super(charLM, self).__init__() | |||||
| self.char_emb_dim = char_emb_dim | |||||
| self.word_emb_dim = word_emb_dim | |||||
| self.vocab_size = vocab_size | |||||
| # char embedding layer | |||||
| self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||||
| # convolutions of filters with different sizes | |||||
| self.convolutions = [] | |||||
| # list of tuples: (the number of filter, width) | |||||
| # self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||||
| self.filter_num_width = [(25, 1), (50, 2), (75, 3)] | |||||
| for out_channel, filter_width in self.filter_num_width: | |||||
| self.convolutions.append( | |||||
| nn.Conv2d( | |||||
| 1, # in_channel | |||||
| out_channel, # out_channel | |||||
| kernel_size=(char_emb_dim, filter_width), # (height, width) | |||||
| bias=True | |||||
| ) | |||||
| ) | |||||
| self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||||
| self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||||
| # highway net | |||||
| self.highway1 = Highway(self.highway_input_dim) | |||||
| self.highway2 = Highway(self.highway_input_dim) | |||||
| # LSTM | |||||
| self.lstm_num_layers = 2 | |||||
| self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||||
| hidden_size=self.word_emb_dim, | |||||
| num_layers=self.lstm_num_layers, | |||||
| bias=True, | |||||
| dropout=0.5, | |||||
| batch_first=True) | |||||
| # output layer | |||||
| self.dropout = nn.Dropout(p=0.5) | |||||
| self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||||
| if use_gpu is True: | |||||
| for x in range(len(self.convolutions)): | |||||
| self.convolutions[x] = self.convolutions[x].cuda() | |||||
| self.highway1 = self.highway1.cuda() | |||||
| self.highway2 = self.highway2.cuda() | |||||
| self.lstm = self.lstm.cuda() | |||||
| self.dropout = self.dropout.cuda() | |||||
| self.char_embed = self.char_embed.cuda() | |||||
| self.linear = self.linear.cuda() | |||||
| self.batch_norm = self.batch_norm.cuda() | |||||
| def forward(self, x, hidden): | |||||
| # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||||
| # Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||||
| lstm_batch_size = x.size()[0] | |||||
| lstm_seq_len = x.size()[1] | |||||
| x = x.contiguous().view(-1, x.size()[2]) | |||||
| # [num_seq*seq_len, max_word_len+2] | |||||
| x = self.char_embed(x) | |||||
| # [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||||
| x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||||
| # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2] | |||||
| x = self.conv_layers(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = self.batch_norm(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = self.highway1(x) | |||||
| x = self.highway2(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1) | |||||
| # [num_seq, seq_len, total_num_filters] | |||||
| x, hidden = self.lstm(x, hidden) | |||||
| # [num_seq, seq_len, hidden_size] (batch_first=True) | |||||
| x = self.dropout(x) | |||||
| # [num_seq, seq_len, hidden_size] | |||||
| x = x.contiguous().view(lstm_batch_size * lstm_seq_len, -1) | |||||
| # [num_seq*seq_len, hidden_size] | |||||
| x = self.linear(x) | |||||
| # [num_seq*seq_len, vocab_size] | |||||
| return x, hidden | |||||
| def conv_layers(self, x): | |||||
| chosen_list = list() | |||||
| for conv in self.convolutions: | |||||
| feature_map = F.tanh(conv(x)) | |||||
| # (batch_size, out_channel, 1, max_word_len-width+1) | |||||
| chosen = torch.max(feature_map, 3)[0] | |||||
| # (batch_size, out_channel, 1) | |||||
| chosen = chosen.squeeze() | |||||
| # (batch_size, out_channel) | |||||
| chosen_list.append(chosen) | |||||
| # (batch_size, total_num_filters) | |||||
| return torch.cat(chosen_list, 1) | |||||
| @@ -1,135 +0,0 @@ | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.optim as optim | |||||
| from torch.autograd import Variable | |||||
| from model.base_model import BaseModel | |||||
| USE_GPU = True | |||||
| def to_var(x): | |||||
| if torch.cuda.is_available() and USE_GPU: | |||||
| x = x.cuda() | |||||
| return Variable(x) | |||||
| class WordSegModel(BaseModel): | |||||
| """ | |||||
| Model controller for WordSeg | |||||
| """ | |||||
| def __init__(self): | |||||
| super(WordSegModel, self).__init__() | |||||
| self.id2word = None | |||||
| self.word2id = None | |||||
| self.id2tag = None | |||||
| self.tag2id = None | |||||
| self.lstm_batch_size = 8 | |||||
| self.lstm_seq_len = 32 # Trainer batch_size == lstm_batch_size * lstm_seq_len | |||||
| self.hidden_dim = 100 | |||||
| self.lstm_num_layers = 2 | |||||
| self.vocab_size = 100 | |||||
| self.word_emb_dim = 100 | |||||
| self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim) | |||||
| self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)), | |||||
| to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim))) | |||||
| self.optimizer = None | |||||
| self._loss = None | |||||
| def prepare_input(self, data): | |||||
| """ | |||||
| perform word indices lookup to convert strings into indices | |||||
| :param data: list of string, each string contains word + space + [B, M, E, S] | |||||
| :return: (words, tags), numpy arrays of word indices and tag indices | |||||
| """ | |||||
| word_list = [] | |||||
| tag_list = [] | |||||
| for line in data: | |||||
| if len(line) > 2: | |||||
| tokens = line.split("#") | |||||
| word_list.append(tokens[0]) | |||||
| tag_list.append(tokens[2][0]) | |||||
| self.id2word = list(set(word_list)) | |||||
| self.word2id = {word: idx for idx, word in enumerate(self.id2word)} | |||||
| self.id2tag = list(set(tag_list)) | |||||
| self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)} | |||||
| words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1) | |||||
| tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1) | |||||
| return words, tags | |||||
| def mode(self, test=False): | |||||
| if test: | |||||
| self.model.eval() | |||||
| else: | |||||
| self.model.train() | |||||
| def data_forward(self, x): | |||||
| """ | |||||
| :param x: sequence of length [batch_size], word indices | |||||
| :return: | |||||
| """ | |||||
| x = x.reshape(self.lstm_batch_size, self.lstm_seq_len) | |||||
| output, self.hidden = self.model(x, self.hidden) | |||||
| return output | |||||
| def define_optimizer(self): | |||||
| self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85) | |||||
| def get_loss(self, pred, truth): | |||||
| self._loss = nn.CrossEntropyLoss()(pred, truth) | |||||
| return self._loss | |||||
| def grad_backward(self): | |||||
| self.model.zero_grad() | |||||
| self._loss.backward() | |||||
| torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2) | |||||
| self.optimizer.step() | |||||
| class WordSeg(nn.Module): | |||||
| """ | |||||
| PyTorch Network for word segmentation | |||||
| """ | |||||
| def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100): | |||||
| super(WordSeg, self).__init__() | |||||
| self.vocab_size = vocab_size | |||||
| self.word_emb_dim = word_emb_dim | |||||
| self.lstm_num_layers = lstm_num_layers | |||||
| self.hidden_dim = hidden_dim | |||||
| self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim) | |||||
| self.lstm = nn.LSTM(input_size=self.word_emb_dim, | |||||
| hidden_size=self.word_emb_dim, | |||||
| num_layers=self.lstm_num_layers, | |||||
| bias=True, | |||||
| dropout=0.5, | |||||
| batch_first=True) | |||||
| self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||||
| def forward(self, x, hidden): | |||||
| """ | |||||
| :param x: tensor of shape [batch_size, seq_len], vocabulary index | |||||
| :param hidden: | |||||
| :return x: unnormalized scores over the vocabulary | |||||
| hidden: (hidden state, memory cell) from the LSTM | |||||
| """ | |||||
| # [batch_size, seq_len] | |||||
| x = self.word_emb(x) | |||||
| # [batch_size, seq_len, word_emb_size] | |||||
| x, hidden = self.lstm(x, hidden) | |||||
| # [batch_size, seq_len, word_emb_size] | |||||
| x = x.contiguous().view(x.shape[0] * x.shape[1], -1) | |||||
| # [batch_size*seq_len, word_emb_size] | |||||
| x = self.linear(x) | |||||
| # [batch_size*seq_len, vocab_size] | |||||
| return x, hidden | |||||
| @@ -1,110 +0,0 @@ | |||||
| # Byte-compiled / optimized / DLL files | |||||
| __pycache__/ | |||||
| *.py[cod] | |||||
| *$py.class | |||||
| # C extensions | |||||
| *.so | |||||
| # Distribution / packaging | |||||
| .Python | |||||
| build/ | |||||
| develop-eggs/ | |||||
| dist/ | |||||
| downloads/ | |||||
| eggs/ | |||||
| .eggs/ | |||||
| lib/ | |||||
| lib64/ | |||||
| parts/ | |||||
| sdist/ | |||||
| var/ | |||||
| wheels/ | |||||
| *.egg-info/ | |||||
| .installed.cfg | |||||
| *.egg | |||||
| MANIFEST | |||||
| # PyInstaller | |||||
| # Usually these files are written by a python script from a template | |||||
| # before PyInstaller builds the exe, so as to inject date/other infos into it. | |||||
| *.manifest | |||||
| *.spec | |||||
| # Installer logs | |||||
| pip-log.txt | |||||
| pip-delete-this-directory.txt | |||||
| # Unit test / coverage reports | |||||
| htmlcov/ | |||||
| .tox/ | |||||
| .coverage | |||||
| .coverage.* | |||||
| .cache | |||||
| nosetests.xml | |||||
| coverage.xml | |||||
| *.cover | |||||
| .hypothesis/ | |||||
| .pytest_cache/ | |||||
| # Translations | |||||
| *.mo | |||||
| *.pot | |||||
| # Django stuff: | |||||
| *.log | |||||
| local_settings.py | |||||
| db.sqlite3 | |||||
| # Flask stuff: | |||||
| instance/ | |||||
| .webassets-cache | |||||
| # Scrapy stuff: | |||||
| .scrapy | |||||
| # Sphinx documentation | |||||
| docs/_build/ | |||||
| # PyBuilder | |||||
| target/ | |||||
| # Jupyter Notebook | |||||
| .ipynb_checkpoints | |||||
| # pyenv | |||||
| .python-version | |||||
| # celery beat schedule file | |||||
| celerybeat-schedule | |||||
| # SageMath parsed files | |||||
| *.sage.py | |||||
| # Environments | |||||
| .env | |||||
| .venv | |||||
| env/ | |||||
| venv/ | |||||
| ENV/ | |||||
| env.bak/ | |||||
| venv.bak/ | |||||
| # Spyder project settings | |||||
| .spyderproject | |||||
| .spyproject | |||||
| # Rope project settings | |||||
| .ropeproject | |||||
| # mkdocs documentation | |||||
| /site | |||||
| # mypy | |||||
| .mypy_cache | |||||
| #custom | |||||
| GoogleNews-vectors-negative300.bin/ | |||||
| GoogleNews-vectors-negative300.bin.gz | |||||
| models/ | |||||
| *.swp | |||||
| @@ -1,77 +0,0 @@ | |||||
| ## Introduction | |||||
| This is the implementation of [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) paper in PyTorch. | |||||
| * MRDataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News) | |||||
| * It can be run on both CPU and GPU | |||||
| * The best accuracy is 82.61%, which is better than the 81.5% reported in the paper | |||||
| (by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!) | |||||
| ## Requirement | |||||
| * python 3.6 | |||||
| * pytorch > 0.1 | |||||
| * numpy | |||||
| * gensim | |||||
| ## Run | |||||
| STEP 1 | |||||
| install required packages such as gensim (the other required packages are installed the same way) | |||||
| ``` | |||||
| pip install gensim | |||||
| ``` | |||||
| STEP 2 | |||||
| download the MR dataset and the word2vec resources | |||||
| * MR dataset: you can download it at (https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz) | |||||
| * word2vec: you can download the file at (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit) | |||||
| Since this file is more than 1.5 GB, it is not included in the repository. After downloading it, remember to modify the path in the function def word_embeddings(path = './GoogleNews-vectors-negative300.bin/'): | |||||
| STEP 3 | |||||
| train the model | |||||
| ``` | |||||
| python train.py | |||||
| ``` | |||||
| you will see output printed to the screen, like | |||||
| ``` | |||||
| Epoch [1/20], Iter [100/192] Loss: 0.7008 | |||||
| Test Accuracy: 71.869159 % | |||||
| Epoch [2/20], Iter [100/192] Loss: 0.5957 | |||||
| Test Accuracy: 75.700935 % | |||||
| Epoch [3/20], Iter [100/192] Loss: 0.4934 | |||||
| Test Accuracy: 78.130841 % | |||||
| ...... | |||||
| Epoch [20/20], Iter [100/192] Loss: 0.0364 | |||||
| Test Accuracy: 81.495327 % | |||||
| Best Accuracy: 82.616822 % | |||||
| Best Model: models/cnn.pkl | |||||
| ``` | |||||
| ## Hyperparameters | |||||
| According to the paper and experiment, I set: | |||||
| |Epoch|Kernel Size|dropout|learning rate|batch size| | |||||
| |---|---|---|---|---| | |||||
| |20|\(h,300,100\)|0.5|0.0001|50| | |||||
| h = [3,4,5] | |||||
| If the test accuracy does not improve, the learning rate is multiplied by 0.8, as in the sketch below. | |||||
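| The decay rule corresponds to a snippet like the following sketch (variable names follow `train.py`; this is an illustration rather than a verbatim excerpt): | |||||
| ``` | |||||
| if best_acc is None or acc > best_acc: | |||||
|     best_acc = acc | |||||
|     torch.save(cnn.state_dict(), 'models/cnn.pkl') | |||||
| else: | |||||
|     learning_rate = learning_rate * 0.8  # decay when test accuracy does not improve | |||||
|     for param_group in optimizer.param_groups: | |||||
|         param_group['lr'] = learning_rate | |||||
| ``` | |||||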
| ## Result | |||||
| I only tried one dataset: MR. (The other 6 datasets in the paper are SST-1, SST-2, Subj, TREC, CR, MPQA.) | |||||
| There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, CNN-multichannel. | |||||
| I have tried CNN-non-static: a model with pre-trained vectors from word2vec. | |||||
| All words, including the unknown ones that are randomly initialized, and the pretrained vectors are fine-tuned for each task | |||||
| (it has almost the best performance and is the most difficult to implement among the four models). | |||||
| |Dataset|Class Size|Best Result|Kim's Paper Result| | |||||
| |---|---|---|---| | |||||
| |MR|2|82.617%(CNN-non-static)|81.5%(CNN-nonstatic)| | |||||
| ## Reference | |||||
| * [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882) | |||||
| * https://github.com/Shawn1993/cnn-text-classification-pytorch | |||||
| * https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py | |||||
| @@ -1,142 +0,0 @@ | |||||
| import codecs | |||||
| import random | |||||
| import re | |||||
| import gensim | |||||
| import numpy as np | |||||
| from gensim import corpora | |||||
| from torch.utils.data import Dataset | |||||
| def clean_str(string): | |||||
| """ | |||||
| Tokenization/string cleaning for all datasets except for SST. | |||||
| Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py | |||||
| """ | |||||
| string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string) | |||||
| string = re.sub(r"\'s", " \'s", string) | |||||
| string = re.sub(r"\'ve", " \'ve", string) | |||||
| string = re.sub(r"n\'t", " n\'t", string) | |||||
| string = re.sub(r"\'re", " \'re", string) | |||||
| string = re.sub(r"\'d", " \'d", string) | |||||
| string = re.sub(r"\'ll", " \'ll", string) | |||||
| string = re.sub(r",", " , ", string) | |||||
| string = re.sub(r"!", " ! ", string) | |||||
| string = re.sub(r"\(", " \( ", string) | |||||
| string = re.sub(r"\)", " \) ", string) | |||||
| string = re.sub(r"\?", " \? ", string) | |||||
| string = re.sub(r"\s{2,}", " ", string) | |||||
| return string.strip() | |||||
| def pad_sentences(sentence, padding_word=" <PAD/>"): | |||||
| sequence_length = 64 | |||||
| sent = sentence.split() | |||||
| padded_sentence = sentence + padding_word * (sequence_length - len(sent)) | |||||
| return padded_sentence | |||||
| #data loader | |||||
| class MRDataset(Dataset): | |||||
| def __init__(self): | |||||
| # load positive and negative sentences from files | |||||
| with codecs.open("./rt-polaritydata/rt-polarity.pos",encoding ='ISO-8859-1') as f: | |||||
| positive_examples = list(f.readlines()) | |||||
| with codecs.open("./rt-polaritydata/rt-polarity.neg",encoding ='ISO-8859-1') as f: | |||||
| negative_examples = list(f.readlines()) | |||||
| # s.strip(): remove "\n"; then clean_str(); then pad_sentences() | |||||
| positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples] | |||||
| negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples] | |||||
| self.examples = positive_examples + negative_examples | |||||
| self.sentences_texts = [sample.split() for sample in self.examples] | |||||
| #word dictionary | |||||
| dictionary = corpora.Dictionary(self.sentences_texts) | |||||
| self.word2id_dict = dictionary.token2id # transform to dict, like {"human":0, "a":1,...} | |||||
| # set labels: positive is 1; negative is 0 | |||||
| positive_labels = [1 for _ in positive_examples] | |||||
| negative_labels = [0 for _ in negative_examples] | |||||
| self.labels = positive_labels + negative_labels | |||||
| examples_labels = list(zip(self.examples, self.labels)) | |||||
| random.shuffle(examples_labels) | |||||
| self.MRDataset_frame = examples_labels | |||||
| #transform word to id | |||||
| self.MRDataset_wordid = \ | |||||
| [( | |||||
| np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64), | |||||
| sent[1] | |||||
| ) for sent in self.MRDataset_frame] | |||||
| def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"): | |||||
| # load the pre-trained Google News word2vec vectors | |||||
| print('Please wait ... (it could take a while to load the file : {})'.format(path)) | |||||
| model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True) | |||||
| word_dict = self.word2id_dict | |||||
| embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300)) | |||||
| for word in word_dict: | |||||
| word_id = word_dict[word] | |||||
| if word in model.wv.vocab: | |||||
| embedding_weights[word_id, :] = model[word] | |||||
| return embedding_weights | |||||
| def __len__(self): | |||||
| return len(self.MRDataset_frame) | |||||
| def __getitem__(self,idx): | |||||
| sample = self.MRDataset_wordid[idx] | |||||
| return sample | |||||
| def getsent(self, idx): | |||||
| sample = self.MRDataset_wordid[idx][0] | |||||
| return sample | |||||
| def getlabel(self, idx): | |||||
| label = self.MRDataset_wordid[idx][1] | |||||
| return label | |||||
| def word2id(self): | |||||
| return self.word2id_dict | |||||
| def id2word(self): | |||||
| id2word_dict = dict([val,key] for key,val in self.word2id_dict.items()) | |||||
| return id2word_dict | |||||
| class train_set(Dataset): | |||||
| def __init__(self, samples): | |||||
| self.train_frame = samples | |||||
| def __len__(self): | |||||
| return len(self.train_frame) | |||||
| def __getitem__(self, idx): | |||||
| return self.train_frame[idx] | |||||
| class test_set(Dataset): | |||||
| def __init__(self, samples): | |||||
| self.test_frame = samples | |||||
| def __len__(self): | |||||
| return len(self.test_frame) | |||||
| def __getitem__(self, idx): | |||||
| return self.test_frame[idx] | |||||
| @@ -1,43 +0,0 @@ | |||||
| import os | |||||
| import sys | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| from torch.autograd import Variable | |||||
| import dataset | |||||
| class CNN_text(nn.Module): | |||||
| def __init__(self, kernel_h=[3,4,5], kernel_num=100, embed_num=1000, embed_dim=300, dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None): | |||||
| super(CNN_text, self).__init__() | |||||
| self.embedding = nn.Embedding(embed_num,embed_dim) | |||||
| self.dropout = nn.Dropout(dropout) | |||||
| if pretrained_embeddings is not None: | |||||
| self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings)) | |||||
| #the network structure | |||||
| #Conv2d: input- N,C,H,W output- (50,100,62,1) | |||||
| self.conv1 = nn.ModuleList([nn.Conv2d(1, 100, (K, 300)) for K in kernel_h]) | |||||
| self.fc1 = nn.Linear(300,2) | |||||
| def max_pooling(self, x, conv): | |||||
| # apply one convolution followed by max-over-time pooling | |||||
| x = F.relu(conv(x)).squeeze(3) #N,C,L - (50,100,62) | |||||
| x = F.max_pool1d(x, x.size(2)).squeeze(2) | |||||
| #x.size(2)=62 squeeze: (50,100,1) -> (50,100) | |||||
| return x | |||||
| def forward(self, x): | |||||
| x = self.embedding(x) #output: (N,H,W) = (50,64,300) | |||||
| x = x.unsqueeze(1) #(N,C,H,W) | |||||
| x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1] #[N, C, H(50,100,62),(50,100,61),(50,100,60)] | |||||
| x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[N,C(50,100),(50,100),(50,100)] | |||||
| x = torch.cat(x,1) | |||||
| x = self.dropout(x) | |||||
| x = self.fc1(x) | |||||
| return x | |||||
| @@ -1,97 +0,0 @@ | |||||
| import os | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from . import dataset as dst | |||||
| from .model import CNN_text | |||||
| from torch.autograd import Variable | |||||
| # Hyper Parameters | |||||
| batch_size = 50 | |||||
| learning_rate = 0.0001 | |||||
| num_epochs = 20 | |||||
| cuda = True | |||||
| #split Dataset | |||||
| dataset = dst.MRDataset() | |||||
| length = len(dataset) | |||||
| train_dataset = dataset[:int(0.9*length)] | |||||
| test_dataset = dataset[int(0.9*length):] | |||||
| train_dataset = dst.train_set(train_dataset) | |||||
| test_dataset = dst.test_set(test_dataset) | |||||
| # Data Loader | |||||
| train_loader = torch.utils.data.DataLoader(dataset=train_dataset, | |||||
| batch_size=batch_size, | |||||
| shuffle=True) | |||||
| test_loader = torch.utils.data.DataLoader(dataset=test_dataset, | |||||
| batch_size=batch_size, | |||||
| shuffle=False) | |||||
| #cnn | |||||
| cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings()) | |||||
| if cuda: | |||||
| cnn.cuda() | |||||
| # Loss and Optimizer | |||||
| criterion = nn.CrossEntropyLoss() | |||||
| optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate) | |||||
| # train and test | |||||
| best_acc = None | |||||
| for epoch in range(num_epochs): | |||||
| # Train the Model | |||||
| cnn.train() | |||||
| for i, (sents,labels) in enumerate(train_loader): | |||||
| sents = Variable(sents) | |||||
| labels = Variable(labels) | |||||
| if cuda: | |||||
| sents = sents.cuda() | |||||
| labels = labels.cuda() | |||||
| optimizer.zero_grad() | |||||
| outputs = cnn(sents) | |||||
| loss = criterion(outputs, labels) | |||||
| loss.backward() | |||||
| optimizer.step() | |||||
| if (i+1) % 100 == 0: | |||||
| print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' | |||||
| %(epoch+1, num_epochs, i+1, len(train_dataset)//batch_size, loss.data[0])) | |||||
| # Test the Model | |||||
| cnn.eval() | |||||
| correct = 0 | |||||
| total = 0 | |||||
| for sents, labels in test_loader: | |||||
| sents = Variable(sents) | |||||
| if cuda: | |||||
| sents = sents.cuda() | |||||
| labels = labels.cuda() | |||||
| outputs = cnn(sents) | |||||
| _, predicted = torch.max(outputs.data, 1) | |||||
| total += labels.size(0) | |||||
| correct += (predicted == labels).sum() | |||||
| acc = 100. * correct / total | |||||
| print('Test Accuracy: %f %%' % (acc)) | |||||
| if best_acc is None or acc > best_acc: | |||||
| best_acc = acc | |||||
| if os.path.exists("models") is False: | |||||
| os.makedirs("models") | |||||
| torch.save(cnn.state_dict(), 'models/cnn.pkl') | |||||
| else: | |||||
| learning_rate = learning_rate * 0.8 | |||||
| # push the decayed learning rate into the optimizer, otherwise the change has no effect | |||||
| for param_group in optimizer.param_groups: | |||||
| param_group['lr'] = learning_rate | |||||
| print("Best Accuracy: %f %%" % best_acc) | |||||
| print("Best Model: models/cnn.pkl") | |||||
| @@ -1,21 +0,0 @@ | |||||
| MIT License | |||||
| Copyright (c) 2017 | |||||
| Permission is hereby granted, free of charge, to any person obtaining a copy | |||||
| of this software and associated documentation files (the "Software"), to deal | |||||
| in the Software without restriction, including without limitation the rights | |||||
| to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |||||
| copies of the Software, and to permit persons to whom the Software is | |||||
| furnished to do so, subject to the following conditions: | |||||
| The above copyright notice and this permission notice shall be included in all | |||||
| copies or substantial portions of the Software. | |||||
| THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |||||
| IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |||||
| FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |||||
| AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |||||
| LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |||||
| OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |||||
| SOFTWARE. | |||||
| @@ -1,40 +0,0 @@ | |||||
| # PyTorch-Character-Aware-Neural-Language-Model | |||||
| This is the PyTorch implementation of character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim. | |||||
| ## Requirements | |||||
| The code was run and tested with **Python 3.5.2** and **PyTorch 0.3.1**. | |||||
| ## HyperParameters | |||||
| | HyperParam | value | | |||||
| | ------ | :-------| | |||||
| | LSTM batch size | 20 | | |||||
| | LSTM sequence length | 35 | | |||||
| | LSTM hidden units | 300 | | |||||
| | epochs | 35 | | |||||
| | initial learning rate | 1.0 | | |||||
| | character embedding dimension | 15 | | |||||
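| For reference, these settings map one-to-one onto the `Options` namedtuple built in `train.py` later in this repository; a minimal sketch is below (the `max_word_len` value is a placeholder, since the real one is computed from the word dictionary, and the character embedding dimension of 15 is passed to the model constructor separately): | |||||
| ``` | |||||
| from collections import namedtuple | |||||
| Options = namedtuple("Options", ["cnn_batch_size", "init_lr", "lstm_seq_len", | |||||
|                                  "max_word_len", "lstm_batch_size", "epochs", | |||||
|                                  "word_embed_dim"]) | |||||
| opt = Options(cnn_batch_size=20 * 35,   # lstm_batch_size * lstm_seq_len | |||||
|               init_lr=1.0,              # initial learning rate, halved when valid PPL stalls | |||||
|               lstm_seq_len=35, | |||||
|               max_word_len=21,          # placeholder: derived from the corpus at preprocessing time | |||||
|               lstm_batch_size=20, | |||||
|               epochs=35, | |||||
|               word_embed_dim=300)       # 300 LSTM hidden units | |||||
| ``` | |||||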
| ## Demo | |||||
| Train the model with split train/valid/test data. | |||||
| `python train.py` | |||||
| The trained model will be saved in `cache/net.pkl`. | |||||
| Test the model. | |||||
| `python test.py` | |||||
| Best result on the test set: | |||||
| PPL = 127.2163 | |||||
| cross entropy loss = 4.8459 | |||||
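| These two figures are consistent with each other, since perplexity is the exponential of the cross-entropy loss: exp(4.8459) ≈ 127.2. | |||||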
| ## Acknowledgement | |||||
| This implementation borrowed ideas from | |||||
| https://github.com/jarfo/kchar | |||||
| https://github.com/cronos123/Character-Aware-Neural-Language-Models | |||||
| @@ -1,148 +0,0 @@ | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| class Highway(nn.Module): | |||||
| """Highway network""" | |||||
| def __init__(self, input_size): | |||||
| super(Highway, self).__init__() | |||||
| self.fc1 = nn.Linear(input_size, input_size, bias=True) | |||||
| self.fc2 = nn.Linear(input_size, input_size, bias=True) | |||||
| def forward(self, x): | |||||
| # transform gate t in (0, 1) controls how much of the transformed input is used | |||||
| t = F.sigmoid(self.fc1(x)) | |||||
| # highway output: t * relu(W2 x) + (1 - t) * x | |||||
| return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x) | |||||
| class charLM(nn.Module): | |||||
| """CNN + highway network + LSTM | |||||
| # Input: | |||||
| 4D tensor with shape [batch_size, in_channel, height, width] | |||||
| # Output: | |||||
| 2D Tensor with shape [batch_size, vocab_size] | |||||
| # Arguments: | |||||
| char_emb_dim: the size of each character's embedding | |||||
| word_emb_dim: the size of each word's embedding | |||||
| vocab_size: num of unique words | |||||
| num_char: num of characters | |||||
| use_gpu: True or False | |||||
| """ | |||||
| def __init__(self, char_emb_dim, word_emb_dim, | |||||
| vocab_size, num_char, use_gpu): | |||||
| super(charLM, self).__init__() | |||||
| self.char_emb_dim = char_emb_dim | |||||
| self.word_emb_dim = word_emb_dim | |||||
| self.vocab_size = vocab_size | |||||
| # char embedding layer | |||||
| self.char_embed = nn.Embedding(num_char, char_emb_dim) | |||||
| # convolutions of filters with different sizes | |||||
| # nn.ModuleList registers the conv layers as submodules; a plain Python list | |||||
| # would hide them from .parameters(), .state_dict() and weight initialization | |||||
| self.convolutions = nn.ModuleList() | |||||
| # list of tuples: (number of filters, filter width) | |||||
| self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)] | |||||
| for out_channel, filter_width in self.filter_num_width: | |||||
| self.convolutions.append( | |||||
| nn.Conv2d( | |||||
| 1, # in_channel | |||||
| out_channel, # out_channel | |||||
| kernel_size=(char_emb_dim, filter_width), # (height, width) | |||||
| bias=True | |||||
| ) | |||||
| ) | |||||
| self.highway_input_dim = sum([x for x, y in self.filter_num_width]) | |||||
| self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False) | |||||
| # highway net | |||||
| self.highway1 = Highway(self.highway_input_dim) | |||||
| self.highway2 = Highway(self.highway_input_dim) | |||||
| # LSTM | |||||
| self.lstm_num_layers = 2 | |||||
| self.lstm = nn.LSTM(input_size=self.highway_input_dim, | |||||
| hidden_size=self.word_emb_dim, | |||||
| num_layers=self.lstm_num_layers, | |||||
| bias=True, | |||||
| dropout=0.5, | |||||
| batch_first=True) | |||||
| # output layer | |||||
| self.dropout = nn.Dropout(p=0.5) | |||||
| self.linear = nn.Linear(self.word_emb_dim, self.vocab_size) | |||||
| if use_gpu: | |||||
| for x in range(len(self.convolutions)): | |||||
| self.convolutions[x] = self.convolutions[x].cuda() | |||||
| self.highway1 = self.highway1.cuda() | |||||
| self.highway2 = self.highway2.cuda() | |||||
| self.lstm = self.lstm.cuda() | |||||
| self.dropout = self.dropout.cuda() | |||||
| self.char_embed = self.char_embed.cuda() | |||||
| self.linear = self.linear.cuda() | |||||
| self.batch_norm = self.batch_norm.cuda() | |||||
| def forward(self, x, hidden): | |||||
| # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2] | |||||
| # Return: Variable of Tensor with shape [num_words, len(word_dict)] | |||||
| lstm_batch_size = x.size()[0] | |||||
| lstm_seq_len = x.size()[1] | |||||
| x = x.contiguous().view(-1, x.size()[2]) | |||||
| # [num_seq*seq_len, max_word_len+2] | |||||
| x = self.char_embed(x) | |||||
| # [num_seq*seq_len, max_word_len+2, char_emb_dim] | |||||
| x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3) | |||||
| # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2] | |||||
| x = self.conv_layers(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = self.batch_norm(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = self.highway1(x) | |||||
| x = self.highway2(x) | |||||
| # [num_seq*seq_len, total_num_filters] | |||||
| x = x.contiguous().view(lstm_batch_size,lstm_seq_len, -1) | |||||
| # [num_seq, seq_len, total_num_filters] | |||||
| x, hidden = self.lstm(x, hidden) | |||||
| # [num_seq, seq_len, hidden_size] (batch_first=True) | |||||
| x = self.dropout(x) | |||||
| # [num_seq, seq_len, hidden_size] | |||||
| x = x.contiguous().view(lstm_batch_size*lstm_seq_len, -1) | |||||
| # [num_seq*seq_len, hidden_size] | |||||
| x = self.linear(x) | |||||
| # [num_seq*seq_len, vocab_size] | |||||
| return x, hidden | |||||
| def conv_layers(self, x): | |||||
| chosen_list = list() | |||||
| for conv in self.convolutions: | |||||
| feature_map = F.tanh(conv(x)) | |||||
| # (batch_size, out_channel, 1, max_word_len-width+1) | |||||
| chosen = torch.max(feature_map, 3)[0] | |||||
| # (batch_size, out_channel, 1) | |||||
| chosen = chosen.squeeze(2)  # squeeze only the singleton dim so batch_size == 1 still works | |||||
| # (batch_size, out_channel) | |||||
| chosen_list.append(chosen) | |||||
| # (batch_size, total_num_filters) | |||||
| return torch.cat(chosen_list, 1) | |||||
| @@ -1,123 +0,0 @@ | |||||
| import os | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from torch.autograd import Variable | |||||
| from utilities import * | |||||
| def to_var(x): | |||||
| if torch.cuda.is_available(): | |||||
| x = x.cuda() | |||||
| return Variable(x) | |||||
| def test(net, data, opt): | |||||
| net.eval() | |||||
| test_input = torch.from_numpy(data.test_input) | |||||
| test_label = torch.from_numpy(data.test_label) | |||||
| num_seq = test_input.size()[0] // opt.lstm_seq_len | |||||
| test_input = test_input[:num_seq*opt.lstm_seq_len, :] | |||||
| # [num_seq, seq_len, max_word_len+2] | |||||
| test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2) | |||||
| criterion = nn.CrossEntropyLoss() | |||||
| loss_list = [] | |||||
| num_hits = 0 | |||||
| total = 0 | |||||
| iterations = test_input.size()[0] // opt.lstm_batch_size | |||||
| test_generator = batch_generator(test_input, opt.lstm_batch_size) | |||||
| label_generator = batch_generator(test_label, opt.lstm_batch_size*opt.lstm_seq_len) | |||||
| hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||||
| to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||||
| add_loss = 0.0 | |||||
| for t in range(iterations): | |||||
| batch_input = next(test_generator) | |||||
| batch_label = next(label_generator) | |||||
| net.zero_grad() | |||||
| hidden = [state.detach() for state in hidden] | |||||
| test_output, hidden = net(to_var(batch_input), hidden) | |||||
| test_loss = criterion(test_output, to_var(batch_label)).data | |||||
| loss_list.append(test_loss) | |||||
| add_loss += test_loss | |||||
| print("Test Loss={0:.4f}".format(float(add_loss) / iterations)) | |||||
| print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations)))) | |||||
| ############################################################# | |||||
| if __name__ == "__main__": | |||||
| word_embed_dim = 300 | |||||
| char_embedding_dim = 15 | |||||
| if os.path.exists("cache/prep.pt") is False: | |||||
| print("Cannot find prep.pt") | |||||
| objects = torch.load("cache/prep.pt") | |||||
| word_dict = objects["word_dict"] | |||||
| char_dict = objects["char_dict"] | |||||
| reverse_word_dict = objects["reverse_word_dict"] | |||||
| max_word_len = objects["max_word_len"] | |||||
| num_words = len(word_dict) | |||||
| print("word/char dictionary built. Start making inputs.") | |||||
| if os.path.exists("cache/data_sets.pt") is False: | |||||
| test_text = read_data("./tests.txt") | |||||
| test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||||
| # Labels are next-word index in word_dict with the same length as inputs | |||||
| test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | |||||
| category = {"tests": test_set, "tlabel": test_label} | |||||
| torch.save(category, "cache/data_sets.pt") | |||||
| else: | |||||
| data_sets = torch.load("cache/data_sets.pt") | |||||
| test_set = data_sets["tests"] | |||||
| test_label = data_sets["tlabel"] | |||||
| train_set = data_sets["tdata"] | |||||
| train_label = data_sets["trlabel"] | |||||
| DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label ") | |||||
| data = DataTuple( test_input=test_set, | |||||
| test_label=test_label, train_label=train_label, train_input=train_set) | |||||
| print("Loaded data sets. Start building network.") | |||||
| USE_GPU = True | |||||
| cnn_batch_size = 700 | |||||
| lstm_seq_len = 35 | |||||
| lstm_batch_size = 20 | |||||
| net = torch.load("cache/net.pkl") | |||||
| Options = namedtuple("Options", [ "cnn_batch_size", "lstm_seq_len", | |||||
| "max_word_len", "lstm_batch_size", "word_embed_dim"]) | |||||
| opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size, | |||||
| lstm_seq_len=lstm_seq_len, | |||||
| max_word_len=max_word_len, | |||||
| lstm_batch_size=lstm_batch_size, | |||||
| word_embed_dim=word_embed_dim) | |||||
| print("Network built. Start testing.") | |||||
| test(net, data, opt) | |||||
| @@ -1,275 +0,0 @@ | |||||
| import os | |||||
| from collections import namedtuple | |||||
| import numpy as np | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| import torch.optim as optim | |||||
| from torch.autograd import Variable | |||||
| # absolute imports so the script can be run directly as "python train.py" | |||||
| from model import charLM | |||||
| from test import test | |||||
| from utilities import * | |||||
| def preprocess(): | |||||
| word_dict, char_dict = create_word_char_dict("charlm.txt", "train.txt", "tests.txt") | |||||
| num_words = len(word_dict) | |||||
| num_char = len(char_dict) | |||||
| char_dict["BOW"] = num_char+1 | |||||
| char_dict["EOW"] = num_char+2 | |||||
| char_dict["PAD"] = 0 | |||||
| # dict of (int, string) | |||||
| reverse_word_dict = {value:key for key, value in word_dict.items()} | |||||
| max_word_len = max([len(word) for word in word_dict]) | |||||
| objects = { | |||||
| "word_dict": word_dict, | |||||
| "char_dict": char_dict, | |||||
| "reverse_word_dict": reverse_word_dict, | |||||
| "max_word_len": max_word_len | |||||
| } | |||||
| torch.save(objects, "cache/prep.pt") | |||||
| print("Preprocess done.") | |||||
| def to_var(x): | |||||
| if torch.cuda.is_available(): | |||||
| x = x.cuda() | |||||
| return Variable(x) | |||||
| def train(net, data, opt): | |||||
| """ | |||||
| :param net: the pytorch model | |||||
| :param data: numpy array | |||||
| :param opt: named tuple | |||||
| 1. random seed | |||||
| 2. define local input | |||||
| 3. training settting: learning rate, loss, etc | |||||
| 4. main loop epoch | |||||
| 5. batchify | |||||
| 6. validation | |||||
| 7. save model | |||||
| """ | |||||
| torch.manual_seed(1024) | |||||
| train_input = torch.from_numpy(data.train_input) | |||||
| train_label = torch.from_numpy(data.train_label) | |||||
| valid_input = torch.from_numpy(data.valid_input) | |||||
| valid_label = torch.from_numpy(data.valid_label) | |||||
| # [num_seq, seq_len, max_word_len+2] | |||||
| num_seq = train_input.size()[0] // opt.lstm_seq_len | |||||
| train_input = train_input[:num_seq*opt.lstm_seq_len, :] | |||||
| train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2) | |||||
| num_seq = valid_input.size()[0] // opt.lstm_seq_len | |||||
| valid_input = valid_input[:num_seq*opt.lstm_seq_len, :] | |||||
| valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2) | |||||
| num_epoch = opt.epochs | |||||
| num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size | |||||
| learning_rate = opt.init_lr | |||||
| old_PPL = 100000 | |||||
| best_PPL = 100000 | |||||
| # CrossEntropyLoss combines LogSoftmax and NLLLoss | |||||
| criterion = nn.CrossEntropyLoss() | |||||
| # word_emb_dim == hidden_size / num of hidden units | |||||
| hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)), | |||||
| to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim))) | |||||
| for epoch in range(num_epoch): | |||||
| ################ Validation #################### | |||||
| net.eval() | |||||
| loss_batch = [] | |||||
| PPL_batch = [] | |||||
| iterations = valid_input.size()[0] // opt.lstm_batch_size | |||||
| valid_generator = batch_generator(valid_input, opt.lstm_batch_size) | |||||
| vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size*opt.lstm_seq_len) | |||||
| for t in range(iterations): | |||||
| batch_input = next(valid_generator) | |||||
| batch_label = next(vlabel_generator) | |||||
| hidden = [state.detach() for state in hidden] | |||||
| valid_output, hidden = net(to_var(batch_input), hidden) | |||||
| length = valid_output.size()[0] | |||||
| # [num_sample-1, len(word_dict)] vs [num_sample-1] | |||||
| valid_loss = criterion(valid_output, to_var(batch_label)) | |||||
| PPL = torch.exp(valid_loss.data) | |||||
| loss_batch.append(float(valid_loss)) | |||||
| PPL_batch.append(float(PPL)) | |||||
| PPL = np.mean(PPL_batch) | |||||
| print("[epoch {}] valid PPL={}".format(epoch, PPL)) | |||||
| print("valid loss={}".format(np.mean(loss_batch))) | |||||
| print("PPL decrease={}".format(float(old_PPL - PPL))) | |||||
| # Preserve the best model | |||||
| if best_PPL > PPL: | |||||
| best_PPL = PPL | |||||
| torch.save(net.state_dict(), "cache/model.pt") | |||||
| torch.save(net, "cache/net.pkl") | |||||
| # Adjust the learning rate | |||||
| if float(old_PPL - PPL) <= 1.0: | |||||
| learning_rate /= 2 | |||||
| print("halved lr:{}".format(learning_rate)) | |||||
| old_PPL = PPL | |||||
| ################################################## | |||||
| #################### Training #################### | |||||
| net.train() | |||||
| optimizer = optim.SGD(net.parameters(), | |||||
| lr = learning_rate, | |||||
| momentum=0.85) | |||||
| # split the first dim | |||||
| input_generator = batch_generator(train_input, opt.lstm_batch_size) | |||||
| label_generator = batch_generator(train_label, opt.lstm_batch_size*opt.lstm_seq_len) | |||||
| for t in range(num_iter_per_epoch): | |||||
| batch_input = next(input_generator) | |||||
| batch_label = next(label_generator) | |||||
| # detach hidden state of LSTM from last batch | |||||
| hidden = [state.detach() for state in hidden] | |||||
| output, hidden = net(to_var(batch_input), hidden) | |||||
| # [num_word, vocab_size] | |||||
| loss = criterion(output, to_var(batch_label)) | |||||
| net.zero_grad() | |||||
| loss.backward() | |||||
| torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2) | |||||
| optimizer.step() | |||||
| if (t+1) % 100 == 0: | |||||
| print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch+1, | |||||
| t+1, float(loss.data), float(np.exp(loss.data)))) | |||||
| torch.save(net.state_dict(), "cache/model.pt") | |||||
| print("Training finished.") | |||||
| ################################################################ | |||||
| if __name__=="__main__": | |||||
| word_embed_dim = 300 | |||||
| char_embedding_dim = 15 | |||||
| if os.path.exists("cache/prep.pt") is False: | |||||
| preprocess() | |||||
| objects = torch.load("cache/prep.pt") | |||||
| word_dict = objects["word_dict"] | |||||
| char_dict = objects["char_dict"] | |||||
| reverse_word_dict = objects["reverse_word_dict"] | |||||
| max_word_len = objects["max_word_len"] | |||||
| num_words = len(word_dict) | |||||
| print("word/char dictionary built. Start making inputs.") | |||||
| if os.path.exists("cache/data_sets.pt") is False: | |||||
| train_text = read_data("./train.txt") | |||||
| valid_text = read_data("./charlm.txt") | |||||
| test_text = read_data("./tests.txt") | |||||
| train_set = np.array(text2vec(train_text, char_dict, max_word_len)) | |||||
| valid_set = np.array(text2vec(valid_text, char_dict, max_word_len)) | |||||
| test_set = np.array(text2vec(test_text, char_dict, max_word_len)) | |||||
| # Labels are next-word index in word_dict with the same length as inputs | |||||
| train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]]) | |||||
| valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]]) | |||||
| test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]]) | |||||
| category = {"tdata": train_set, "vdata": valid_set, "tests": test_set, | |||||
| "trlabel":train_label, "vlabel":valid_label, "tlabel":test_label} | |||||
| torch.save(category, "cache/data_sets.pt") | |||||
| else: | |||||
| data_sets = torch.load("cache/data_sets.pt") | |||||
| train_set = data_sets["tdata"] | |||||
| valid_set = data_sets["vdata"] | |||||
| test_set = data_sets["tests"] | |||||
| train_label = data_sets["trlabel"] | |||||
| valid_label = data_sets["vlabel"] | |||||
| test_label = data_sets["tlabel"] | |||||
| DataTuple = namedtuple("DataTuple", | |||||
| "train_input train_label valid_input valid_label test_input test_label") | |||||
| data = DataTuple(train_input=train_set, | |||||
| train_label=train_label, | |||||
| valid_input=valid_set, | |||||
| valid_label=valid_label, | |||||
| test_input=test_set, | |||||
| test_label=test_label) | |||||
| print("Loaded data sets. Start building network.") | |||||
| USE_GPU = True | |||||
| cnn_batch_size = 700 | |||||
| lstm_seq_len = 35 | |||||
| lstm_batch_size = 20 | |||||
| # cnn_batch_size == lstm_seq_len * lstm_batch_size | |||||
| net = charLM(char_embedding_dim, | |||||
| word_embed_dim, | |||||
| num_words, | |||||
| len(char_dict), | |||||
| use_gpu=USE_GPU) | |||||
| for param in net.parameters(): | |||||
| nn.init.uniform(param.data, -0.05, 0.05) | |||||
| Options = namedtuple("Options", [ | |||||
| "cnn_batch_size", "init_lr", "lstm_seq_len", | |||||
| "max_word_len", "lstm_batch_size", "epochs", | |||||
| "word_embed_dim"]) | |||||
| opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size, | |||||
| init_lr=1.0, | |||||
| lstm_seq_len=lstm_seq_len, | |||||
| max_word_len=max_word_len, | |||||
| lstm_batch_size=lstm_batch_size, | |||||
| epochs=35, | |||||
| word_embed_dim=word_embed_dim) | |||||
| print("Network built. Start training.") | |||||
| # Training can be stopped at any time with Ctrl+C; the model is still saved below. | |||||
| try: | |||||
| train(net, data, opt) | |||||
| except KeyboardInterrupt: | |||||
| print('-' * 89) | |||||
| print('Exiting from training early') | |||||
| torch.save(net, "cache/net.pkl") | |||||
| print("save net") | |||||
| test(net, data, opt) | |||||
| @@ -1,86 +0,0 @@ | |||||
| import torch | |||||
| from torch.autograd import Variable | |||||
| import torch.nn as nn | |||||
| import torch.nn.functional as F | |||||
| def batch_generator(x, batch_size): | |||||
| # x: [num_words, in_channel, height, width] | |||||
| # partitions x into batches | |||||
| num_step = x.size()[0] // batch_size | |||||
| for t in range(num_step): | |||||
| yield x[t*batch_size:(t+1)*batch_size] | |||||
| def text2vec(words, char_dict, max_word_len): | |||||
| """ Return list of list of int """ | |||||
| word_vec = [] | |||||
| for word in words: | |||||
| vec = [char_dict[ch] for ch in word] | |||||
| if len(vec) < max_word_len: | |||||
| vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))] | |||||
| vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]] | |||||
| word_vec.append(vec) | |||||
| return word_vec | |||||
| def seq2vec(input_words, char_embedding, char_embedding_dim, char_table): | |||||
| """ convert the input strings into character embeddings """ | |||||
| # input_words == list of string | |||||
| # char_embedding == torch.nn.Embedding | |||||
| # char_embedding_dim == int | |||||
| # char_table == list of unique chars | |||||
| # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2] | |||||
| max_word_len = max([len(word) for word in input_words]) | |||||
| print("max_word_len={}".format(max_word_len)) | |||||
| tensor_list = [] | |||||
| start_column = torch.ones(char_embedding_dim, 1) | |||||
| end_column = torch.ones(char_embedding_dim, 1) | |||||
| for word in input_words: | |||||
| # convert string to word embedding | |||||
| word_encoding = char_embedding_lookup(word, char_embedding, char_table) | |||||
| # add start and end columns | |||||
| word_encoding = torch.cat([start_column, word_encoding, end_column], 1) | |||||
| # zero-pad right columns | |||||
| word_encoding = F.pad(word_encoding, (0, max_word_len-word_encoding.size()[1]+2)).data | |||||
| # create dimension | |||||
| word_encoding = word_encoding.unsqueeze(0) | |||||
| tensor_list.append(word_encoding) | |||||
| return torch.cat(tensor_list, 0) | |||||
| def read_data(file_name): | |||||
| # Return: list of strings | |||||
| with open(file_name, 'r') as f: | |||||
| corpus = f.read().lower() | |||||
| import re | |||||
| corpus = re.sub(r"<unk>", "unk", corpus) | |||||
| return corpus.split() | |||||
| def get_char_dict(vocabulary): | |||||
| # vocabulary == dict of (word, int) | |||||
| # Return: dict of (char, int), starting from 1 | |||||
| char_dict = dict() | |||||
| count = 1 | |||||
| for word in vocabulary: | |||||
| for ch in word: | |||||
| if ch not in char_dict: | |||||
| char_dict[ch] = count | |||||
| count += 1 | |||||
| return char_dict | |||||
| def create_word_char_dict(*file_name): | |||||
| text = [] | |||||
| for file in file_name: | |||||
| text += read_data(file) | |||||
| word_dict = {word:ix for ix, word in enumerate(set(text))} | |||||
| char_dict = get_char_dict(word_dict) | |||||
| return word_dict, char_dict | |||||
| @@ -1,36 +0,0 @@ | |||||
| ## Introduction | |||||
| This is the implementation of [Hierarchical Attention Networks for Document Classification](https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf) paper in PyTorch. | |||||
| * The dataset consists of 600k documents extracted from [Yelp 2018](https://www.yelp.com/dataset) customer reviews | |||||
| * Uses [NLTK](http://www.nltk.org/) and [Stanford CoreNLP](https://stanfordnlp.github.io/CoreNLP/) to split documents into sentences and tokenize them | |||||
| * Both CPU & GPU are supported | |||||
| * The best accuracy is 71%, matching the performance reported in the paper | |||||
| ## Requirement | |||||
| * python 3.6 | |||||
| * pytorch = 0.3.0 | |||||
| * numpy | |||||
| * gensim | |||||
| * nltk | |||||
| * coreNLP | |||||
| ## Parameters | |||||
| Following the paper and my experiments, the model parameters are set as follows: | |||||
| |word embedding dimension|GRU hidden size|GRU layer|word/sentence context vector dimension| | |||||
| |---|---|---|---| | |||||
| |200|50|1|100| | |||||
| And the training parameters: | |||||
| |Epoch|learning rate|momentum|batch size| | |||||
| |---|---|---|---| | |||||
| |3|0.01|0.9|64| | |||||
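| A minimal sketch of how these settings map onto the `HAN` model and SGD optimizer defined later in this repository (class and argument names follow `model.py` and `train.py`; training feeds batches of 64 documents through a `DataLoader`): | |||||
| ``` | |||||
| import torch | |||||
| from model import HAN  # HAN class from model.py in this repo | |||||
| # 200-dim word embeddings, 5 star classes, 50 GRU hidden units and | |||||
| # 100-dim context vectors at both the word and sentence level | |||||
| net = HAN(input_size=200, output_size=5, | |||||
|           word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||||
|           sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||||
| ``` | |||||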
| ## Run | |||||
| 1. Prepare the dataset. Download the [data set](https://www.yelp.com/dataset), unzip it, and extract the customer reviews into a single file. Use preprocess.py to transform that file into a data set for model input. | |||||
| 2. Train the model. Word embeddings of the training data are stored in 'yelp.word2vec'. The model will be trained and auto-saved to 'model.dict'. | |||||
| ``` | |||||
| python train.py | |||||
| ``` | |||||
| 3. Test the model. | |||||
| ``` | |||||
| python evaluate.py | |||||
| ``` | |||||
| @@ -1,44 +0,0 @@ | |||||
| from model import * | |||||
| from train import * | |||||
| def evaluate(net, dataset, batch_size=64, use_cuda=False): | |||||
| dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate, num_workers=0) | |||||
| count = 0 | |||||
| if use_cuda: | |||||
| net.cuda() | |||||
| for i, batch_samples in enumerate(dataloader): | |||||
| x, y = batch_samples | |||||
| doc_list = [] | |||||
| for sample in x: | |||||
| doc = [] | |||||
| for sent_vec in sample: | |||||
| if use_cuda: | |||||
| sent_vec = sent_vec.cuda() | |||||
| doc.append(Variable(sent_vec, volatile=True)) | |||||
| doc_list.append(pack_sequence(doc)) | |||||
| if use_cuda: | |||||
| y = y.cuda() | |||||
| predicts = net(doc_list) | |||||
| p, idx = torch.max(predicts, dim=1) | |||||
| idx = idx.data | |||||
| count += torch.sum(torch.eq(idx, y)) | |||||
| return count | |||||
| if __name__ == '__main__': | |||||
| ''' | |||||
| Evaluate the performance of model | |||||
| ''' | |||||
| from gensim.models import Word2Vec | |||||
| embed_model = Word2Vec.load('yelp.word2vec') | |||||
| embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||||
| del embed_model | |||||
| net = HAN(input_size=200, output_size=5, | |||||
| word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||||
| sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||||
| net.load_state_dict(torch.load('model.dict')) | |||||
| test_dataset = YelpDocSet('reviews', 199, 4, embedding) | |||||
| correct = evaluate(net, test_dataset, use_cuda=True)  # pass by keyword; the third positional argument is batch_size | |||||
| print('accuracy {}'.format(correct/len(test_dataset))) | |||||
| @@ -1,110 +0,0 @@ | |||||
| import torch | |||||
| import torch.nn as nn | |||||
| from torch.autograd import Variable | |||||
| import torch.nn.functional as F | |||||
| def pack_sequence(tensor_seq, padding_value=0.0): | |||||
| if len(tensor_seq) <= 0: | |||||
| return | |||||
| length = [v.size(0) for v in tensor_seq] | |||||
| max_len = max(length) | |||||
| size = [len(tensor_seq), max_len] | |||||
| size.extend(list(tensor_seq[0].size()[1:])) | |||||
| ans = torch.Tensor(*size).fill_(padding_value) | |||||
| if tensor_seq[0].data.is_cuda: | |||||
| ans = ans.cuda() | |||||
| ans = Variable(ans) | |||||
| for i, v in enumerate(tensor_seq): | |||||
| ans[i, :length[i], :] = v | |||||
| return ans | |||||
| class HAN(nn.Module): | |||||
| def __init__(self, input_size, output_size, | |||||
| word_hidden_size, word_num_layers, word_context_size, | |||||
| sent_hidden_size, sent_num_layers, sent_context_size): | |||||
| super(HAN, self).__init__() | |||||
| self.word_layer = AttentionNet(input_size, | |||||
| word_hidden_size, | |||||
| word_num_layers, | |||||
| word_context_size) | |||||
| self.sent_layer = AttentionNet(2* word_hidden_size, | |||||
| sent_hidden_size, | |||||
| sent_num_layers, | |||||
| sent_context_size) | |||||
| self.output_layer = nn.Linear(2* sent_hidden_size, output_size) | |||||
| self.softmax = nn.LogSoftmax(dim=1) | |||||
| def forward(self, batch_doc): | |||||
| # input is a sequence of matrix | |||||
| doc_vec_list = [] | |||||
| for doc in batch_doc: | |||||
| sent_mat = self.word_layer(doc) # doc's dim (num_sent, seq_len, word_dim) | |||||
| doc_vec_list.append(sent_mat) # sent_mat's dim (num_sent, vec_dim) | |||||
| doc_vec = self.sent_layer(pack_sequence(doc_vec_list)) | |||||
| output = self.softmax(self.output_layer(doc_vec)) | |||||
| return output | |||||
| class AttentionNet(nn.Module): | |||||
| def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size): | |||||
| super(AttentionNet, self).__init__() | |||||
| self.input_size = input_size | |||||
| self.gru_hidden_size = gru_hidden_size | |||||
| self.gru_num_layers = gru_num_layers | |||||
| self.context_vec_size = context_vec_size | |||||
| # Encoder | |||||
| self.gru = nn.GRU(input_size=input_size, | |||||
| hidden_size=gru_hidden_size, | |||||
| num_layers=gru_num_layers, | |||||
| batch_first=True, | |||||
| bidirectional=True) | |||||
| # Attention | |||||
| self.fc = nn.Linear(2* gru_hidden_size, context_vec_size) | |||||
| self.tanh = nn.Tanh() | |||||
| self.softmax = nn.Softmax(dim=1) | |||||
| # context vector | |||||
| self.context_vec = nn.Parameter(torch.Tensor(context_vec_size, 1)) | |||||
| self.context_vec.data.uniform_(-0.1, 0.1) | |||||
| def forward(self, inputs): | |||||
| # GRU part | |||||
| h_t, hidden = self.gru(inputs) # inputs's dim (batch_size, seq_len, word_dim) | |||||
| u = self.tanh(self.fc(h_t)) | |||||
| # Attention part | |||||
| alpha = self.softmax(torch.matmul(u, self.context_vec)) # u's dim (batch_size, seq_len, context_vec_size) | |||||
| output = torch.bmm(torch.transpose(h_t, 1, 2), alpha) # alpha's dim (batch_size, seq_len, 1) | |||||
| return torch.squeeze(output, dim=2) # output's dim (batch_size, 2*hidden_size, 1) | |||||
| if __name__ == '__main__': | |||||
| ''' | |||||
| Test the model correctness | |||||
| ''' | |||||
| import numpy as np | |||||
| use_cuda = True | |||||
| net = HAN(input_size=200, output_size=5, | |||||
| word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||||
| sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||||
| criterion = nn.NLLLoss() | |||||
| test_time = 10 | |||||
| batch_size = 64 | |||||
| if use_cuda: | |||||
| net.cuda() | |||||
| print('test training') | |||||
| for step in range(test_time): | |||||
| x_data = [torch.randn(np.random.randint(1,10), 200, 200) for i in range(batch_size)] | |||||
| y_data = torch.LongTensor([np.random.randint(0, 5) for i in range(batch_size)]) | |||||
| if use_cuda: | |||||
| x_data = [x_i.cuda() for x_i in x_data] | |||||
| y_data = y_data.cuda() | |||||
| x = [Variable(x_i) for x_i in x_data] | |||||
| y = Variable(y_data) | |||||
| predict = net(x) | |||||
| loss = criterion(predict, y) | |||||
| optimizer.zero_grad() | |||||
| loss.backward() | |||||
| optimizer.step() | |||||
| print(loss.data[0]) | |||||
| @@ -1,51 +0,0 @@ | |||||
| ''' | |||||
| Tokenize the Yelp dataset's documents using Stanford CoreNLP | |||||
| ''' | |||||
| import pickle | |||||
| import json | |||||
| import nltk | |||||
| from nltk.tokenize import stanford | |||||
| import os | |||||
| input_filename = 'review.json' | |||||
| # config for stanford core nlp | |||||
| os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe' | |||||
| path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar' | |||||
| tokenizer = stanford.CoreNLPTokenizer() | |||||
| in_dirname = 'review' | |||||
| out_dirname = 'reviews' | |||||
| f = open(input_filename, encoding='utf-8') | |||||
| samples = [] | |||||
| j = 0 | |||||
| for i, line in enumerate(f.readlines()): | |||||
| review = json.loads(line) | |||||
| samples.append((review['stars'], review['text'])) | |||||
| if (i+1) % 5000 == 0: | |||||
| print(i) | |||||
| pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb')) | |||||
| j += 1 | |||||
| samples = [] | |||||
| pickle.dump(samples, open(in_dirname + '/samples%d.pkl'%j, 'wb')) | |||||
| # samples = pickle.load(open(out_dirname + '/samples0.pkl', 'rb')) | |||||
| # print(samples[0]) | |||||
| for fn in os.listdir(in_dirname): | |||||
| print(fn) | |||||
| processed = [] | |||||
| for stars, text in pickle.load(open(os.path.join(in_dirname, fn), 'rb')): | |||||
| tokens = [] | |||||
| sents = nltk.tokenize.sent_tokenize(text) | |||||
| for s in sents: | |||||
| tokens.append(tokenizer.tokenize(s)) | |||||
| processed.append((stars, tokens)) | |||||
| # print(tokens) | |||||
| if len(processed) % 100 == 0: | |||||
| print(len(processed)) | |||||
| pickle.dump(processed, open(os.path.join(out_dirname, fn), 'wb')) | |||||
| @@ -1,167 +0,0 @@ | |||||
| import os | |||||
| import pickle | |||||
| import nltk | |||||
| import numpy as np | |||||
| import torch | |||||
| from model import * | |||||
| class SentIter: | |||||
| def __init__(self, dirname, count): | |||||
| self.dirname = dirname | |||||
| self.count = int(count) | |||||
| def __iter__(self): | |||||
| for fname in os.listdir(self.dirname)[:self.count]: | |||||
| with open(os.path.join(self.dirname, fname), 'rb') as f: | |||||
| for y, x in pickle.load(f): | |||||
| for sent in x: | |||||
| yield sent | |||||
| def train_word_vec(): | |||||
| # load data | |||||
| dirname = 'reviews' | |||||
| sents = SentIter(dirname, 238) | |||||
| # define model and train | |||||
| model = models.Word2Vec(size=200, sg=0, workers=4, min_count=5) | |||||
| model.build_vocab(sents) | |||||
| model.train(sents, total_examples=model.corpus_count, epochs=10) | |||||
| model.save('yelp.word2vec') | |||||
| print(model.wv.similarity('woman', 'man')) | |||||
| print(model.wv.similarity('nice', 'awful')) | |||||
| class Embedding_layer: | |||||
| def __init__(self, wv, vector_size): | |||||
| self.wv = wv | |||||
| self.vector_size = vector_size | |||||
| def get_vec(self, w): | |||||
| try: | |||||
| v = self.wv[w] | |||||
| except KeyError: | |||||
| # out-of-vocabulary words fall back to a random vector | |||||
| v = np.random.randn(self.vector_size) | |||||
| return v | |||||
| from torch.utils.data import DataLoader, Dataset | |||||
| class YelpDocSet(Dataset): | |||||
| def __init__(self, dirname, start_file, num_files, embedding): | |||||
| self.dirname = dirname | |||||
| self.num_files = num_files | |||||
| self._files = os.listdir(dirname)[start_file:start_file + num_files] | |||||
| self.embedding = embedding | |||||
| self._cache = [(-1, None) for i in range(5)] | |||||
| def get_doc(self, n): | |||||
| file_id = n // 5000 | |||||
| idx = file_id % 5 | |||||
| if self._cache[idx][0] != file_id: | |||||
| with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||||
| self._cache[idx] = (file_id, pickle.load(f)) | |||||
| y, x = self._cache[idx][1][n % 5000] | |||||
| sents = [] | |||||
| for s_list in x: | |||||
| sents.append(' '.join(s_list)) | |||||
| x = '\n'.join(sents) | |||||
| return x, y-1 | |||||
| def __len__(self): | |||||
| return len(self._files)*5000 | |||||
| def __getitem__(self, n): | |||||
| file_id = n // 5000 | |||||
| idx = file_id % 5 | |||||
| if self._cache[idx][0] != file_id: | |||||
| print('load {} to {}'.format(file_id, idx)) | |||||
| with open(os.path.join(self.dirname, self._files[file_id]), 'rb') as f: | |||||
| self._cache[idx] = (file_id, pickle.load(f)) | |||||
| y, x = self._cache[idx][1][n % 5000] | |||||
| doc = [] | |||||
| for sent in x: | |||||
| if len(sent) == 0: | |||||
| continue | |||||
| sent_vec = [] | |||||
| for word in sent: | |||||
| vec = self.embedding.get_vec(word) | |||||
| sent_vec.append(vec.tolist()) | |||||
| sent_vec = torch.Tensor(sent_vec) | |||||
| doc.append(sent_vec) | |||||
| if len(doc) == 0: | |||||
| doc = [torch.zeros(1,200)] | |||||
| return doc, y-1 | |||||
| def collate(iterable): | |||||
| y_list = [] | |||||
| x_list = [] | |||||
| for x, y in iterable: | |||||
| y_list.append(y) | |||||
| x_list.append(x) | |||||
| return x_list, torch.LongTensor(y_list) | |||||
| def train(net, dataset, num_epoch, batch_size, print_size=10, use_cuda=False): | |||||
| optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) | |||||
| criterion = nn.NLLLoss() | |||||
| dataloader = DataLoader(dataset, | |||||
| batch_size=batch_size, | |||||
| collate_fn=collate, | |||||
| num_workers=0) | |||||
| running_loss = 0.0 | |||||
| if use_cuda: | |||||
| net.cuda() | |||||
| print('start training') | |||||
| for epoch in range(num_epoch): | |||||
| for i, batch_samples in enumerate(dataloader): | |||||
| x, y = batch_samples | |||||
| doc_list = [] | |||||
| for sample in x: | |||||
| doc = [] | |||||
| for sent_vec in sample: | |||||
| if use_cuda: | |||||
| sent_vec = sent_vec.cuda() | |||||
| doc.append(Variable(sent_vec)) | |||||
| doc_list.append(pack_sequence(doc)) | |||||
| if use_cuda: | |||||
| y = y.cuda() | |||||
| y = Variable(y) | |||||
| predict = net(doc_list) | |||||
| loss = criterion(predict, y) | |||||
| optimizer.zero_grad() | |||||
| loss.backward() | |||||
| optimizer.step() | |||||
| running_loss += loss.data[0] | |||||
| if i % print_size == print_size-1: | |||||
| print('{}, {}'.format(i+1, running_loss/print_size)) | |||||
| running_loss = 0.0 | |||||
| torch.save(net.state_dict(), 'model.dict') | |||||
| torch.save(net.state_dict(), 'model.dict') | |||||
| if __name__ == '__main__': | |||||
| ''' | |||||
| Train process | |||||
| ''' | |||||
| from gensim.models import Word2Vec | |||||
| import gensim | |||||
| from gensim import models | |||||
| train_word_vec() | |||||
| embed_model = Word2Vec.load('yelp.word2vec') | |||||
| embedding = Embedding_layer(embed_model.wv, embed_model.wv.vector_size) | |||||
| del embed_model | |||||
| start_file = 0 | |||||
| dataset = YelpDocSet('reviews', start_file, 120-start_file, embedding) | |||||
| print('training data size {}'.format(len(dataset))) | |||||
| net = HAN(input_size=200, output_size=5, | |||||
| word_hidden_size=50, word_num_layers=1, word_context_size=100, | |||||
| sent_hidden_size=50, sent_num_layers=1, sent_context_size=100) | |||||
| try: | |||||
| net.load_state_dict(torch.load('model.dict')) | |||||
| print("last time trained model has loaded") | |||||
| except Exception: | |||||
| print("cannot load model, train the inital model") | |||||
| train(net, dataset, num_epoch=5, batch_size=64, use_cuda=True) | |||||
| @@ -1,14 +0,0 @@ | |||||
| class BaseSaver(object): | |||||
| """base class for all savers""" | |||||
| def __init__(self, save_path): | |||||
| self.save_path = save_path | |||||
| def save_bytes(self): | |||||
| raise NotImplementedError | |||||
| def save_str(self): | |||||
| raise NotImplementedError | |||||
| def compress(self): | |||||
| raise NotImplementedError | |||||
| @@ -1,12 +0,0 @@ | |||||
| from saver.base_saver import BaseSaver | |||||
| class Logger(BaseSaver): | |||||
| """Logging""" | |||||
| def __init__(self, save_path): | |||||
| super(Logger, self).__init__(save_path) | |||||
| def log(self, string): | |||||
| with open(self.save_path, "a") as f: | |||||
| f.write(string) | |||||
| @@ -1,8 +0,0 @@ | |||||
| from saver.base_saver import BaseSaver | |||||
| class ModelSaver(BaseSaver): | |||||
| """Save a model""" | |||||
| def __init__(self, save_path): | |||||
| super(ModelSaver, self).__init__(save_path) | |||||
| @@ -1,8 +1,9 @@ | |||||
| from action.tester import Tester | |||||
| from action.trainer import Trainer | |||||
| from loader.base_loader import ToyLoader0 | from loader.base_loader import ToyLoader0 | ||||
| from model.char_language_model import CharLM | from model.char_language_model import CharLM | ||||
| from fastNLP.action import Tester | |||||
| from fastNLP.action.trainer import Trainer | |||||
| def test_charlm(): | def test_charlm(): | ||||
| train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True, | train_config = Trainer.TrainConfig(epochs=1, validate=True, save_when_better=True, | ||||
| @@ -1,10 +1,10 @@ | |||||
| from collections import namedtuple | from collections import namedtuple | ||||
| import numpy as np | import numpy as np | ||||
| from action.trainer import Trainer | |||||
| from model.base_model import ToyModel | from model.base_model import ToyModel | ||||
| from fastNLP.action.trainer import Trainer | |||||
| def test_trainer(): | def test_trainer(): | ||||
| Config = namedtuple("config", ["epochs", "validate", "save_when_better"]) | Config = namedtuple("config", ["epochs", "validate", "save_when_better"]) | ||||
| @@ -1,8 +1,9 @@ | |||||
| from action.tester import Tester | |||||
| from action.trainer import Trainer | |||||
| from loader.base_loader import BaseLoader | from loader.base_loader import BaseLoader | ||||
| from model.word_seg_model import WordSegModel | from model.word_seg_model import WordSegModel | ||||
| from fastNLP.action import Tester | |||||
| from fastNLP.action.trainer import Trainer | |||||
| def test_charlm(): | def test_charlm(): | ||||
| train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False, | train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False, | ||||