@@ -0,0 +1,16 @@
class Action(object):
    """
    Base class for Trainer and Tester.
    """

    def __init__(self):
        super(Action, self).__init__()

    def load_config(self, args):
        pass

    def load_dataset(self, args):
        pass

    def log(self, args):
        pass
@@ -0,0 +1,9 @@
from action.action import Action


class Tester(Action):
    """Tester for common testing logic of all models."""

    def __init__(self, arg):
        super(Tester, self).__init__()
        self.arg = arg
@@ -0,0 +1,14 @@
from action.action import Action


class Trainer(Action):
    """
    Trainer for common training logic of all models.
    """

    def __init__(self, arg):
        super(Trainer, self).__init__()
        self.arg = arg

    def train(self, args):
        pass
@@ -0,0 +1,15 @@
class BaseLoader(object):
    """Base class for all data loaders."""

    def __init__(self, data_name, data_path):
        super(BaseLoader, self).__init__()
        self.data_name = data_name
        self.data_path = data_path

    def load(self):
        """
        :return: str text: the raw text read from data_path
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            text = f.read()
        return text
@@ -0,0 +1,14 @@
from loader.base_loader import BaseLoader


class ConfigLoader(BaseLoader):
    """Loader for configuration files."""

    def __init__(self, data_name, data_path):
        super(ConfigLoader, self).__init__(data_name, data_path)
        self.config = self.parse(super(ConfigLoader, self).load())

    @staticmethod
    def parse(string):
        # TODO: parse the raw configuration string
        return string
@@ -0,0 +1,47 @@
from loader.base_loader import BaseLoader


class DatasetLoader(BaseLoader):
    """Loader for data sets."""

    def __init__(self, data_name, data_path):
        super(DatasetLoader, self).__init__(data_name, data_path)


class ConllLoader(DatasetLoader):
    """Loader for CoNLL-format files."""

    def __init__(self, data_name, data_path):
        """
        :param str data_name: the name of the CoNLL data set
        :param str data_path: the path to the CoNLL data set
        """
        super(ConllLoader, self).__init__(data_name, data_path)
        self.data_set = self.parse(self.load())

    def load(self):
        """
        :return: list lines: all lines in a CoNLL file
        """
        with open(self.data_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        return lines

    @staticmethod
    def parse(lines):
        """
        :param list lines: a list containing all lines in a CoNLL file.
        :return: a 3D list: sentences -> tokens -> token fields
        """
        sentences = list()
        tokens = list()
        for line in lines:
            if line[0] == "#":
                # skip comment lines
                continue
            if line == "\n":
                # a blank line ends the current sentence
                sentences.append(tokens)
                tokens = []
                continue
            tokens.append(line.split())
        return sentences
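

# Illustrative usage (not part of the original file): a minimal sketch of what
# ConllLoader.parse returns for a tiny CoNLL-style input, assuming tab-separated
# token fields, "#" comment lines, and a blank line terminating each sentence.
if __name__ == "__main__":
    sample_lines = [
        "# sent_id = 1\n",
        "1\tHello\tINTJ\n",
        "2\tworld\tNOUN\n",
        "\n",
    ]
    # -> [[['1', 'Hello', 'INTJ'], ['2', 'world', 'NOUN']]]
    print(ConllLoader.parse(sample_lines))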
@@ -0,0 +1,8 @@
from loader.base_loader import BaseLoader


class EmbedLoader(BaseLoader):
    """Loader for embeddings."""

    def __init__(self, data_name, data_path):
        super(EmbedLoader, self).__init__(data_name, data_path)
@@ -1,21 +1,21 @@
MIT License

Copyright (c) 2017

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,40 +1,40 @@
# PyTorch-Character-Aware-Neural-Language-Model

This is the PyTorch implementation of the character-aware neural language model proposed in this [paper](https://arxiv.org/abs/1508.06615) by Yoon Kim.

## Requirements

The code is run and tested with **Python 3.5.2** and **PyTorch 0.3.1**.

## HyperParameters

| HyperParam | value |
| ------ | :-------|
| LSTM batch size | 20 |
| LSTM sequence length | 35 |
| LSTM hidden units | 300 |
| epochs | 35 |
| initial learning rate | 1.0 |
| character embedding dimension | 15 |

## Demo

Train the model with the split train/valid/test data:

`python train.py`

The trained model will be saved in `cache/net.pkl`.

Test the model:

`python test.py`

Best result on the test set:

PPL = 127.2163

cross entropy loss = 4.8459
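
Note that perplexity is simply the exponential of the cross entropy loss: exp(4.8459) ≈ 127.2.

To evaluate the trained model from your own script, the saved network can be reloaded directly. A minimal sketch of what `test.py` does (it assumes the caches written by `train.py`, and that the `cache/` directory was created before the first run, e.g. with `mkdir cache`):

```python
import torch

net = torch.load("cache/net.pkl")  # the full model object saved by train.py
net.eval()                         # disable dropout for evaluation
```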

## Acknowledgement

This implementation borrowed ideas from:

https://github.com/jarfo/kchar

https://github.com/cronos123/Character-Aware-Neural-Language-Models
@@ -1,148 +1,148 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


class Highway(nn.Module):
    """Highway network: y = t * relu(W2 x) + (1 - t) * x, with gate t = sigmoid(W1 x)"""

    def __init__(self, input_size):
        super(Highway, self).__init__()
        self.fc1 = nn.Linear(input_size, input_size, bias=True)
        self.fc2 = nn.Linear(input_size, input_size, bias=True)

    def forward(self, x):
        t = F.sigmoid(self.fc1(x))
        return torch.mul(t, F.relu(self.fc2(x))) + torch.mul(1 - t, x)


class charLM(nn.Module):
    """CNN + highway network + LSTM

    # Input:
        4D tensor with shape [batch_size, in_channel, height, width]
    # Output:
        2D tensor with shape [batch_size, vocab_size]
    # Arguments:
        char_emb_dim: the size of each character's embedding
        word_emb_dim: the size of each word's embedding
        vocab_size: num of unique words
        num_char: num of characters
        use_gpu: True or False
    """

    def __init__(self, char_emb_dim, word_emb_dim,
                 vocab_size, num_char, use_gpu):
        super(charLM, self).__init__()
        self.char_emb_dim = char_emb_dim
        self.word_emb_dim = word_emb_dim
        self.vocab_size = vocab_size

        # char embedding layer
        self.char_embed = nn.Embedding(num_char, char_emb_dim)

        # convolutions of filters with different sizes
        # (an nn.ModuleList so the filters are registered as model parameters)
        self.convolutions = nn.ModuleList()

        # list of tuples: (the number of filters, width)
        self.filter_num_width = [(25, 1), (50, 2), (75, 3), (100, 4), (125, 5), (150, 6)]

        for out_channel, filter_width in self.filter_num_width:
            self.convolutions.append(
                nn.Conv2d(
                    1,            # in_channel
                    out_channel,  # out_channel
                    kernel_size=(char_emb_dim, filter_width),  # (height, width)
                    bias=True
                )
            )

        self.highway_input_dim = sum([x for x, y in self.filter_num_width])

        self.batch_norm = nn.BatchNorm1d(self.highway_input_dim, affine=False)

        # highway net
        self.highway1 = Highway(self.highway_input_dim)
        self.highway2 = Highway(self.highway_input_dim)

        # LSTM
        self.lstm_num_layers = 2
        self.lstm = nn.LSTM(input_size=self.highway_input_dim,
                            hidden_size=self.word_emb_dim,
                            num_layers=self.lstm_num_layers,
                            bias=True,
                            dropout=0.5,
                            batch_first=True)

        # output layer
        self.dropout = nn.Dropout(p=0.5)
        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)

        if use_gpu is True:
            for x in range(len(self.convolutions)):
                self.convolutions[x] = self.convolutions[x].cuda()
            self.highway1 = self.highway1.cuda()
            self.highway2 = self.highway2.cuda()
            self.lstm = self.lstm.cuda()
            self.dropout = self.dropout.cuda()
            self.char_embed = self.char_embed.cuda()
            self.linear = self.linear.cuda()
            self.batch_norm = self.batch_norm.cuda()

    def forward(self, x, hidden):
        # Input: Variable of Tensor with shape [num_seq, seq_len, max_word_len+2]
        # Return: Variable of Tensor with shape [num_seq*seq_len, vocab_size]
        lstm_batch_size = x.size()[0]
        lstm_seq_len = x.size()[1]

        x = x.contiguous().view(-1, x.size()[2])
        # [num_seq*seq_len, max_word_len+2]

        x = self.char_embed(x)
        # [num_seq*seq_len, max_word_len+2, char_emb_dim]

        x = torch.transpose(x.view(x.size()[0], 1, x.size()[1], -1), 2, 3)
        # [num_seq*seq_len, 1, char_emb_dim, max_word_len+2]

        x = self.conv_layers(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.batch_norm(x)
        # [num_seq*seq_len, total_num_filters]

        x = self.highway1(x)
        x = self.highway2(x)
        # [num_seq*seq_len, total_num_filters]

        x = x.contiguous().view(lstm_batch_size, lstm_seq_len, -1)
        # [num_seq, seq_len, total_num_filters]

        x, hidden = self.lstm(x, hidden)
        # [num_seq, seq_len, hidden_size]

        x = self.dropout(x)
        # [num_seq, seq_len, hidden_size]

        x = x.contiguous().view(lstm_batch_size*lstm_seq_len, -1)
        # [num_seq*seq_len, hidden_size]

        x = self.linear(x)
        # [num_seq*seq_len, vocab_size]
        return x, hidden

    def conv_layers(self, x):
        chosen_list = list()
        for conv in self.convolutions:
            feature_map = F.tanh(conv(x))
            # (batch_size, out_channel, 1, max_word_len-width+1)
            chosen = torch.max(feature_map, 3)[0]
            # (batch_size, out_channel, 1)
            chosen = chosen.squeeze()
            # (batch_size, out_channel)
            chosen_list.append(chosen)
        # (batch_size, total_num_filters)
        return torch.cat(chosen_list, 1)
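

# Illustrative usage (not part of the original file): a dummy CPU forward pass,
# assuming vocab_size=100, num_char=50, and words padded to max_word_len+2=12
# characters, as prepared by utilities.text2vec.
if __name__ == "__main__":
    net = charLM(char_emb_dim=15, word_emb_dim=300,
                 vocab_size=100, num_char=50, use_gpu=False)
    # batch of character indices: [num_seq=4, seq_len=5, max_word_len+2=12]
    x = Variable(torch.LongTensor(4, 5, 12).random_(0, 50))
    # (h_0, c_0), each of shape [lstm_num_layers=2, num_seq=4, word_emb_dim=300]
    hidden = (Variable(torch.zeros(2, 4, 300)),
              Variable(torch.zeros(2, 4, 300)))
    output, hidden = net(x, hidden)
    print(output.size())  # [num_seq*seq_len, vocab_size] = [20, 100]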
@@ -1,123 +1,123 @@
import os
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from model import charLM
from utilities import *
from collections import namedtuple


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def test(net, data, opt):
    net.eval()

    test_input = torch.from_numpy(data.test_input)
    test_label = torch.from_numpy(data.test_label)

    num_seq = test_input.size()[0] // opt.lstm_seq_len
    test_input = test_input[:num_seq*opt.lstm_seq_len, :]
    # [num_seq, seq_len, max_word_len+2]
    test_input = test_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    criterion = nn.CrossEntropyLoss()

    loss_list = []
    num_hits = 0
    total = 0
    iterations = test_input.size()[0] // opt.lstm_batch_size
    test_generator = batch_generator(test_input, opt.lstm_batch_size)
    label_generator = batch_generator(test_label, opt.lstm_batch_size*opt.lstm_seq_len)

    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    add_loss = 0.0
    for t in range(iterations):
        batch_input = next(test_generator)
        batch_label = next(label_generator)

        net.zero_grad()
        hidden = [state.detach() for state in hidden]
        test_output, hidden = net(to_var(batch_input), hidden)

        test_loss = criterion(test_output, to_var(batch_label)).data
        loss_list.append(test_loss)
        add_loss += test_loss

    print("Test Loss={0:.4f}".format(float(add_loss) / iterations))
    print("Test PPL={0:.4f}".format(float(np.exp(add_loss / iterations))))


#############################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    # test.py expects the caches produced by train.py
    if os.path.exists("cache/prep.pt") is False:
        print("Cannot find prep.pt")

    objects = torch.load("cache/prep.pt")
    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:
        test_text = read_data("./test.txt")
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word indices in word_dict, with the same length as the inputs
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"test": test_set, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        test_set = data_sets["test"]
        test_label = data_sets["tlabel"]
        train_set = data_sets["tdata"]
        train_label = data_sets["trlabel"]

    DataTuple = namedtuple("DataTuple", "test_input test_label train_input train_label")
    data = DataTuple(test_input=test_set,
                     test_label=test_label, train_label=train_label, train_input=train_set)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20

    net = torch.load("cache/net.pkl")

    Options = namedtuple("Options", ["cnn_batch_size", "lstm_seq_len",
                                     "max_word_len", "lstm_batch_size", "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start testing.")
    test(net, data, opt)
@@ -1,268 +1,268 @@
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import os
from model import charLM
from utilities import *
from collections import namedtuple
from test import test


def preprocess():
    word_dict, char_dict = create_word_char_dict("valid.txt", "train.txt", "test.txt")
    num_words = len(word_dict)
    num_char = len(char_dict)
    char_dict["BOW"] = num_char+1
    char_dict["EOW"] = num_char+2
    char_dict["PAD"] = 0

    # dict of (int, string)
    reverse_word_dict = {value: key for key, value in word_dict.items()}
    max_word_len = max([len(word) for word in word_dict])

    objects = {
        "word_dict": word_dict,
        "char_dict": char_dict,
        "reverse_word_dict": reverse_word_dict,
        "max_word_len": max_word_len
    }

    torch.save(objects, "cache/prep.pt")
    print("Preprocess done.")


def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return Variable(x)


def train(net, data, opt):
    torch.manual_seed(1024)

    train_input = torch.from_numpy(data.train_input)
    train_label = torch.from_numpy(data.train_label)
    valid_input = torch.from_numpy(data.valid_input)
    valid_label = torch.from_numpy(data.valid_label)

    # [num_seq, seq_len, max_word_len+2]
    num_seq = train_input.size()[0] // opt.lstm_seq_len
    train_input = train_input[:num_seq*opt.lstm_seq_len, :]
    train_input = train_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_seq = valid_input.size()[0] // opt.lstm_seq_len
    valid_input = valid_input[:num_seq*opt.lstm_seq_len, :]
    valid_input = valid_input.view(-1, opt.lstm_seq_len, opt.max_word_len+2)

    num_epoch = opt.epochs
    num_iter_per_epoch = train_input.size()[0] // opt.lstm_batch_size

    learning_rate = opt.init_lr
    old_PPL = 100000
    best_PPL = 100000

    # Log-SoftMax
    criterion = nn.CrossEntropyLoss()

    # word_emb_dim == hidden_size / num of hidden units
    hidden = (to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)),
              to_var(torch.zeros(2, opt.lstm_batch_size, opt.word_embed_dim)))

    for epoch in range(num_epoch):

        ################ Validation ####################
        net.eval()
        loss_batch = []
        PPL_batch = []
        iterations = valid_input.size()[0] // opt.lstm_batch_size

        valid_generator = batch_generator(valid_input, opt.lstm_batch_size)
        vlabel_generator = batch_generator(valid_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(iterations):
            batch_input = next(valid_generator)
            batch_label = next(vlabel_generator)

            hidden = [state.detach() for state in hidden]
            valid_output, hidden = net(to_var(batch_input), hidden)

            length = valid_output.size()[0]

            # [num_sample-1, len(word_dict)] vs [num_sample-1]
            valid_loss = criterion(valid_output, to_var(batch_label))

            PPL = torch.exp(valid_loss.data)

            loss_batch.append(float(valid_loss))
            PPL_batch.append(float(PPL))

        PPL = np.mean(PPL_batch)
        print("[epoch {}] valid PPL={}".format(epoch, PPL))
        print("valid loss={}".format(np.mean(loss_batch)))
        print("PPL decrease={}".format(float(old_PPL - PPL)))

        # Preserve the best model
        if best_PPL > PPL:
            best_PPL = PPL
            torch.save(net.state_dict(), "cache/model.pt")
            torch.save(net, "cache/net.pkl")

        # Adjust the learning rate: halve it if valid PPL improved by no more than 1.0
        if float(old_PPL - PPL) <= 1.0:
            learning_rate /= 2
            print("halved lr:{}".format(learning_rate))

        old_PPL = PPL

        ##################################################
        #################### Training ####################
        net.train()
        optimizer = optim.SGD(net.parameters(),
                              lr=learning_rate,
                              momentum=0.85)

        # split the first dim
        input_generator = batch_generator(train_input, opt.lstm_batch_size)
        label_generator = batch_generator(train_label, opt.lstm_batch_size*opt.lstm_seq_len)

        for t in range(num_iter_per_epoch):
            batch_input = next(input_generator)
            batch_label = next(label_generator)

            # detach hidden state of LSTM from last batch
            hidden = [state.detach() for state in hidden]

            output, hidden = net(to_var(batch_input), hidden)
            # [num_word, vocab_size]

            loss = criterion(output, to_var(batch_label))

            net.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(net.parameters(), 5, norm_type=2)
            optimizer.step()

            if (t+1) % 100 == 0:
                print("[epoch {} step {}] train loss={}, Perplexity={}".format(epoch+1,
                      t+1, float(loss.data), float(np.exp(loss.data))))

    torch.save(net.state_dict(), "cache/model.pt")
    print("Training finished.")


################################################################

if __name__ == "__main__":

    word_embed_dim = 300
    char_embedding_dim = 15

    if os.path.exists("cache/prep.pt") is False:
        preprocess()

    objects = torch.load("cache/prep.pt")

    word_dict = objects["word_dict"]
    char_dict = objects["char_dict"]
    reverse_word_dict = objects["reverse_word_dict"]
    max_word_len = objects["max_word_len"]
    num_words = len(word_dict)

    print("word/char dictionary built. Start making inputs.")

    if os.path.exists("cache/data_sets.pt") is False:

        train_text = read_data("./train.txt")
        valid_text = read_data("./valid.txt")
        test_text = read_data("./test.txt")

        train_set = np.array(text2vec(train_text, char_dict, max_word_len))
        valid_set = np.array(text2vec(valid_text, char_dict, max_word_len))
        test_set = np.array(text2vec(test_text, char_dict, max_word_len))

        # Labels are next-word indices in word_dict, with the same length as the inputs
        train_label = np.array([word_dict[w] for w in train_text[1:]] + [word_dict[train_text[-1]]])
        valid_label = np.array([word_dict[w] for w in valid_text[1:]] + [word_dict[valid_text[-1]]])
        test_label = np.array([word_dict[w] for w in test_text[1:]] + [word_dict[test_text[-1]]])

        category = {"tdata": train_set, "vdata": valid_set, "test": test_set,
                    "trlabel": train_label, "vlabel": valid_label, "tlabel": test_label}
        torch.save(category, "cache/data_sets.pt")
    else:
        data_sets = torch.load("cache/data_sets.pt")
        train_set = data_sets["tdata"]
        valid_set = data_sets["vdata"]
        test_set = data_sets["test"]
        train_label = data_sets["trlabel"]
        valid_label = data_sets["vlabel"]
        test_label = data_sets["tlabel"]

    DataTuple = namedtuple("DataTuple",
                           "train_input train_label valid_input valid_label test_input test_label")
    data = DataTuple(train_input=train_set,
                     train_label=train_label,
                     valid_input=valid_set,
                     valid_label=valid_label,
                     test_input=test_set,
                     test_label=test_label)

    print("Loaded data sets. Start building network.")

    USE_GPU = True
    cnn_batch_size = 700
    lstm_seq_len = 35
    lstm_batch_size = 20
    # cnn_batch_size == lstm_seq_len * lstm_batch_size

    net = charLM(char_embedding_dim,
                 word_embed_dim,
                 num_words,
                 len(char_dict),
                 use_gpu=USE_GPU)

    for param in net.parameters():
        nn.init.uniform(param.data, -0.05, 0.05)

    Options = namedtuple("Options", [
        "cnn_batch_size", "init_lr", "lstm_seq_len",
        "max_word_len", "lstm_batch_size", "epochs",
        "word_embed_dim"])
    opt = Options(cnn_batch_size=lstm_seq_len*lstm_batch_size,
                  init_lr=1.0,
                  lstm_seq_len=lstm_seq_len,
                  max_word_len=max_word_len,
                  lstm_batch_size=lstm_batch_size,
                  epochs=35,
                  word_embed_dim=word_embed_dim)

    print("Network built. Start training.")

    # You can stop training at any time with Ctrl+C
    try:
        train(net, data, opt)
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(net, "cache/net.pkl")
    print("save net")

    test(net, data, opt)
@@ -1,86 +1,86 @@
import re

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F


def batch_generator(x, batch_size):
    # x: [num_words, in_channel, height, width]
    # partitions x into batches along the first dimension
    num_step = x.size()[0] // batch_size
    for t in range(num_step):
        yield x[t*batch_size:(t+1)*batch_size]


def text2vec(words, char_dict, max_word_len):
    """ Return a list of lists of int: each word becomes
    [BOW] + its char indices (PAD-padded to max_word_len) + [EOW] """
    word_vec = []
    for word in words:
        vec = [char_dict[ch] for ch in word]
        if len(vec) < max_word_len:
            vec += [char_dict["PAD"] for _ in range(max_word_len - len(vec))]
        vec = [char_dict["BOW"]] + vec + [char_dict["EOW"]]
        word_vec.append(vec)
    return word_vec


def seq2vec(input_words, char_embedding, char_embedding_dim, char_table):
    """ Convert the input strings into character embeddings """
    # input_words == list of strings
    # char_embedding == torch.nn.Embedding
    # char_embedding_dim == int
    # char_table == list of unique chars
    # Returns: tensor of shape [len(input_words), char_embedding_dim, max_word_len+2]
    max_word_len = max([len(word) for word in input_words])
    print("max_word_len={}".format(max_word_len))
    tensor_list = []

    start_column = torch.ones(char_embedding_dim, 1)
    end_column = torch.ones(char_embedding_dim, 1)

    for word in input_words:
        # convert the string to a [char_embedding_dim, len(word)] encoding
        # (char_embedding_lookup is assumed to be provided elsewhere)
        word_encoding = char_embedding_lookup(word, char_embedding, char_table)
        # add start and end columns
        word_encoding = torch.cat([start_column, word_encoding, end_column], 1)
        # zero-pad right columns
        word_encoding = F.pad(word_encoding, (0, max_word_len-word_encoding.size()[1]+2)).data
        # create batch dimension
        word_encoding = word_encoding.unsqueeze(0)
        tensor_list.append(word_encoding)
    return torch.cat(tensor_list, 0)


def read_data(file_name):
    # Return: list of strings (lower-cased tokens)
    with open(file_name, 'r') as f:
        corpus = f.read().lower()
    corpus = re.sub(r"<unk>", "unk", corpus)
    return corpus.split()


def get_char_dict(vocabulary):
    # vocabulary == dict of (word, int)
    # Return: dict of (char, int), starting from 1
    char_dict = dict()
    count = 1
    for word in vocabulary:
        for ch in word:
            if ch not in char_dict:
                char_dict[ch] = count
                count += 1
    return char_dict


def create_word_char_dict(*file_name):
    text = []
    for file in file_name:
        text += read_data(file)
    word_dict = {word: ix for ix, word in enumerate(set(text))}
    char_dict = get_char_dict(word_dict)
    return word_dict, char_dict
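

# Illustrative usage (not part of the original file): what text2vec produces for
# a toy char_dict, assuming max_word_len=5, so every vector has max_word_len+2
# entries: BOW, the PAD-padded character indices, and EOW.
if __name__ == "__main__":
    toy_char_dict = {"PAD": 0, "c": 1, "a": 2, "t": 3, "s": 4, "BOW": 5, "EOW": 6}
    # -> [[5, 1, 2, 3, 0, 0, 6], [5, 1, 2, 3, 4, 0, 6]]
    print(text2vec(["cat", "cats"], toy_char_dict, max_word_len=5))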
@@ -0,0 +1,10 @@
import unittest


class MyTestCase(unittest.TestCase):
    def test_something(self):
        self.assertEqual(True, False)


if __name__ == '__main__':
    unittest.main()