From 58127d3c4ef652ffd066cd04a466c0a4b94dcdd2 Mon Sep 17 00:00:00 2001
From: FengZiYjun
Date: Wed, 30 May 2018 22:28:22 +0800
Subject: [PATCH] start building word seg (generally seq2seq) model

---
 action/action.py                         |   2 +-
 action/trainer.py                        |   8 +-
 loader/base_loader.py                    |   6 +
 model/base_model.py                      | 110 +++++++++++---
 model/word_seg_model.py                  | 135 ++++++++++++++++++
 .../CNN-sentence_classification/train.py |   9 +-
 tests/test_word_seg.py                   |  30 ++++
 7 files changed, 275 insertions(+), 25 deletions(-)
 create mode 100644 model/word_seg_model.py
 create mode 100644 tests/test_word_seg.py

diff --git a/action/action.py b/action/action.py
index 9c3f32f6..c85a74df 100644
--- a/action/action.py
+++ b/action/action.py
@@ -27,7 +27,7 @@ class Action(object):
         :return iteration:int, the number of step in each epoch
                 generator:generator, to generate batch inputs
         """
-        n_samples = X.size()[0]
+        n_samples = X.shape[0]
         num_iter = n_samples // batch_size
         if Y is None:
             generator = self._batch_generate(batch_size, num_iter, X)
diff --git a/action/trainer.py b/action/trainer.py
index 2584552b..6b0fc1f0 100644
--- a/action/trainer.py
+++ b/action/trainer.py
@@ -6,7 +6,7 @@ from .tester import Tester
 
 class Trainer(Action):
     """
-    Trainer for common training logic of all models
+    Trainer is a common training pipeline shared among all models.
     """
     TrainConfig = namedtuple("config", ["epochs", "validate", "save_when_better",
                                         "log_per_step", "log_validation", "batch_size"])
@@ -23,12 +23,12 @@ class Trainer(Action):
         self.log_validation = train_args.log_validation
         self.batch_size = train_args.batch_size
 
-    def train(self, network, train_data, dev_data):
+    def train(self, network, train_data, dev_data=None):
         """
         :param network: the model controller
         :param train_data: raw data for training
         :param dev_data: raw data for validation
-        :return:
+        This method calls all the base methods of the network (implemented in model.base_model).
         """
         train_x, train_y = network.prepare_input(train_data)
 
@@ -60,6 +60,8 @@ class Trainer(Action):
 
             #################### evaluate over dev set ###################
             if self.validate:
+                if dev_data is None:
+                    raise RuntimeError("No validation data provided.")
                 # give all controls to tester
                 evaluator.test(network, dev_data)
 
diff --git a/loader/base_loader.py b/loader/base_loader.py
index 9579a1e5..d087f9f2 100644
--- a/loader/base_loader.py
+++ b/loader/base_loader.py
@@ -14,6 +14,11 @@ class BaseLoader(object):
             text = f.read()
         return text
 
+    def load_lines(self):
+        with open(self.data_path, "r", encoding="utf-8") as f:
+            text = f.readlines()
+        return text
+
 
 class ToyLoader0(BaseLoader):
     """
@@ -29,3 +34,4 @@ class ToyLoader0(BaseLoader):
         import re
         corpus = re.sub(r"", "unk", corpus)
         return corpus.split()
+
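Note on loader/base_loader.py: load_lines() is a thin wrapper around readlines(), so every returned entry keeps its trailing newline. A minimal usage sketch (illustrative, not part of the patch), reusing the constructor arguments and data path that appear in tests/test_word_seg.py at the end of this patch:

from loader.base_loader import BaseLoader

# Same two-argument constructor call as in tests/test_word_seg.py (a name and a data path).
loader = BaseLoader("load_train", "./data_for_tests/cws_train")
lines = loader.load_lines()  # list of str, one entry per input line, newline characters kept
print(len(lines), repr(lines[0]))
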
diff --git a/model/base_model.py b/model/base_model.py
index facb82d9..07f3e6b2 100644
--- a/model/base_model.py
+++ b/model/base_model.py
@@ -2,29 +2,64 @@ import numpy as np
 
 
 class BaseModel(object):
-    """PyTorch base model for all models"""
+    """The base class of all models.
+    This class and its subclasses are actually "wrappers" of the PyTorch models.
+    They act as an interface between Trainer and the deep learning networks.
+    This interface provides the following methods to be called by Trainer.
+        - prepare_input
+        - mode
+        - define_optimizer
+        - data_forward
+        - grad_backward
+        - get_loss
+    """
 
     def __init__(self):
         pass
 
     def prepare_input(self, data):
         """
-        :param data: str, raw input vector(?)
+        Perform data transformation from raw input to vector/matrix inputs.
+        :param data: raw inputs
         :return (X, Y): tuple, input features and labels
         """
         raise NotImplementedError
 
     def mode(self, test=False):
+        """
+        Tell the underlying network whether it is in training or evaluation mode (required by PyTorch).
+        :param test: bool
+        """
+        raise NotImplementedError
+
+    def define_optimizer(self):
+        """
+        Define the PyTorch optimizer specified by the model.
+        """
         raise NotImplementedError
 
     def data_forward(self, *x):
+        """
+        Forward pass of the data.
+        :param x: input feature matrix (packed as variable arguments)
+        :return: output of the model
+        """
        # required by PyTorch nn
         raise NotImplementedError
 
     def grad_backward(self):
+        """
+        Run the backward pass and update the model parameters.
+        """
         raise NotImplementedError
 
     def get_loss(self, pred, truth):
+        """
+        Compute the loss between model prediction and ground truth, using the loss function specified by the model.
+        :param pred: prediction label vector
+        :param truth: ground truth label vector
+        :return: a scalar
+        """
         raise NotImplementedError
 
 
@@ -54,29 +89,70 @@ class ToyModel(BaseModel):
         self._loss = np.mean(np.square(pred - truth))
         return self._loss
 
+    def define_optimizer(self):
+        pass
+
 
 class Vocabulary(object):
-    """
-    A collection of lookup tables.
+    """A look-up table that allows you to access `Lexeme` objects. The `Vocab`
+    instance also provides access to the `StringStore`, and owns underlying
+    data that is shared between `Doc` objects.
     """
 
     def __init__(self):
-        self.word_set = None
-        self.word2idx = None
-        self.emb_matrix = None
-
-    def lookup(self, word):
-        if word in self.word_set:
-            return self.emb_matrix[self.word2idx[word]]
-        return LookupError("The key " + word + " does not exist.")
+        """Create the vocabulary.
+        RETURNS (Vocab): The newly constructed object.
+        """
+        self.data_frame = None
 
 
 class Document(object):
+    """A sequence of Token objects. Access sentences and named entities, export
+    annotations to numpy arrays, losslessly serialize to compressed binary
+    strings. The `Doc` object holds an array of `Token` objects. The
+    Python-level `Token` and `Span` objects are views of this array, i.e.
+    they don't own the data themselves. -- spacy
     """
-    contains a sequence of tokens
-    each token is a character with linguistic attributes
+
+    def __init__(self, vocab, words=None, spaces=None):
+        """Create a Doc object.
+        vocab (Vocab): A vocabulary object, which must match any models you
+            want to use (e.g. tokenizer, parser, entity recognizer).
+        words (list or None): A list of unicode strings, to add to the document
+            as words. If `None`, defaults to empty list.
+        spaces (list or None): A list of boolean values, of the same length as
+            words. True means that the word is followed by a space, False means
+            it is not. If `None`, defaults to `[True]*len(words)`
+        user_data (dict or None): not accepted by this constructor (present in the spaCy API only).
+        RETURNS (Doc): The newly constructed object.
+        """
+        self.vocab = vocab
+        self.spaces = spaces
+        self.words = words
+        if spaces is None:
+            self.spaces = [True] * len(self.words)
+        elif len(spaces) != len(self.words):
+            raise ValueError("mismatch between spaces and words")
+
+    def get_chunker(self, vocab):
+        return None
+
+    def push_back(self, vocab):
+        pass
+
+
+class Token(object):
+    """An individual token – i.e. a word, punctuation symbol, whitespace,
+    etc.
     """
-    def __init__(self):
-        # wrap pandas.dataframe
-        self.dataframe = None
+    def __init__(self, vocab, doc, offset):
+        """Construct a `Token` object.
+        vocab (Vocabulary): A storage container for lexical types.
+        doc (Document): The parent document.
+        offset (int): The index of the token within the document.
+        """
+        self.vocab = vocab
+        self.doc = doc
+        self.token = doc[offset]
+        self.i = offset
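To make the BaseModel contract above concrete, here is a rough sketch (illustrative only, not the actual Trainer code) of how a training pass is expected to drive these hooks; the batching mirrors the n_samples // batch_size logic in action/action.py, and the helper name is made up:

def run_one_epoch(network, train_data, batch_size):
    # raw data -> feature matrix X and label vector Y
    X, Y = network.prepare_input(train_data)
    network.mode(test=False)         # put the underlying PyTorch module in training mode
    network.define_optimizer()       # the model chooses its own optimizer
    n_batches = X.shape[0] // batch_size
    for i in range(n_batches):
        x = X[i * batch_size:(i + 1) * batch_size]
        y = Y[i * batch_size:(i + 1) * batch_size]
        pred = network.data_forward(x)    # forward pass
        loss = network.get_loss(pred, y)  # model-specific loss
        network.grad_backward()           # backward pass and parameter update
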
diff --git a/model/word_seg_model.py b/model/word_seg_model.py
new file mode 100644
index 00000000..38df7a52
--- /dev/null
+++ b/model/word_seg_model.py
@@ -0,0 +1,135 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.autograd import Variable
+
+from model.base_model import BaseModel
+
+USE_GPU = True
+
+
+def to_var(x):
+    if torch.cuda.is_available() and USE_GPU:
+        x = x.cuda()
+    return Variable(x)
+
+
+class WordSegModel(BaseModel):
+    """
+    Model controller for WordSeg
+    """
+
+    def __init__(self):
+        super(WordSegModel, self).__init__()
+        self.id2word = None
+        self.word2id = None
+        self.id2tag = None
+        self.tag2id = None
+
+        self.lstm_batch_size = 8
+        self.lstm_seq_len = 32  # Trainer batch_size == lstm_batch_size * lstm_seq_len
+        self.hidden_dim = 100
+        self.lstm_num_layers = 2
+        self.vocab_size = 100
+        self.word_emb_dim = 100
+
+        self.model = WordSeg(self.hidden_dim, self.lstm_num_layers, self.vocab_size, self.word_emb_dim)
+        self.hidden = (to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)),
+                       to_var(torch.zeros(2, self.lstm_batch_size, self.word_emb_dim)))
+
+        self.optimizer = None
+        self._loss = None
+
+    def prepare_input(self, data):
+        """
+        Perform a word-index lookup to convert strings into index arrays.
+        :param data: list of string; each line holds a character and its [B, M, E, S] tag in '#'-separated fields
+        :return (words, tags): numpy arrays of word indices and tag indices
+        """
+        word_list = []
+        tag_list = []
+        for line in data:
+            if len(line) > 2:
+                tokens = line.split("#")
+                word_list.append(tokens[0])
+                tag_list.append(tokens[2][0])
+        self.id2word = list(set(word_list))
+        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
+        self.id2tag = list(set(tag_list))
+        self.tag2id = {tag: idx for idx, tag in enumerate(self.id2tag)}
+        words = np.array([self.word2id[w] for w in word_list]).reshape(-1, 1)
+        tags = np.array([self.tag2id[t] for t in tag_list]).reshape(-1, 1)
+        return words, tags
+
+    def mode(self, test=False):
+        if test:
+            self.model.eval()
+        else:
+            self.model.train()
+
+    def data_forward(self, x):
+        """
+        :param x: a flat batch of word indices, reshaped to [lstm_batch_size, lstm_seq_len]
+        :return: output of shape [lstm_batch_size * lstm_seq_len, vocab_size]
+        """
+        x = to_var(torch.LongTensor(x.reshape(self.lstm_batch_size, self.lstm_seq_len)))
+        output, self.hidden = self.model(x, self.hidden)
+        return output
+
+    def define_optimizer(self):
+        self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85)
+
+    def get_loss(self, pred, truth):
+        # nn.CrossEntropyLoss must be instantiated before it is applied to (pred, truth)
+        self._loss = nn.CrossEntropyLoss()(pred, truth)
+        return self._loss
+
+    def grad_backward(self):
+        self.model.zero_grad()
+        self._loss.backward()
+        torch.nn.utils.clip_grad_norm(self.model.parameters(), 5, norm_type=2)
+        self.optimizer.step()
+
+
+class WordSeg(nn.Module):
+    """
+    PyTorch Network for word segmentation
+    """
+
+    def __init__(self, hidden_dim, lstm_num_layers, vocab_size, word_emb_dim=100):
+        super(WordSeg, self).__init__()
+
+        self.vocab_size = vocab_size
+        self.word_emb_dim = word_emb_dim
+        self.lstm_num_layers = lstm_num_layers
+        self.hidden_dim = hidden_dim
+
+        self.word_emb = nn.Embedding(self.vocab_size, self.word_emb_dim)
+
+        self.lstm = nn.LSTM(input_size=self.word_emb_dim,
+                            hidden_size=self.word_emb_dim,
+                            num_layers=self.lstm_num_layers,
+                            bias=True,
+                            dropout=0.5,
+                            batch_first=True)
+
+        self.linear = nn.Linear(self.word_emb_dim, self.vocab_size)
+
+    def forward(self, x, hidden):
+        """
+        :param x: tensor of shape [batch_size, seq_len], vocabulary indices
+        :param hidden: (hidden state, memory cell), each of shape [num_layers, batch_size, hidden_size]
+        :return x: unnormalized scores (logits) over the vocabulary
+                hidden: (hidden state, memory cell) returned by the LSTM
+        """
+        # [batch_size, seq_len]
+        x = self.word_emb(x)
+        # [batch_size, seq_len, word_emb_size]
+        x, hidden = self.lstm(x, hidden)
+        # [batch_size, seq_len, word_emb_size]
+        x = x.contiguous().view(x.shape[0] * x.shape[1], -1)
+        # [batch_size*seq_len, word_emb_size]
+        x = self.linear(x)
+        # [batch_size*seq_len, vocab_size]
+        return x, hidden
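For readers checking tensor shapes, the following standalone snippet (illustrative, not part of the patch) runs the WordSeg module with the same hyper-parameters that WordSegModel hard-codes; the random indices are placeholders for real word ids:

import torch
from torch.autograd import Variable

from model.word_seg_model import WordSeg

batch_size, seq_len, vocab_size, emb_dim, num_layers = 8, 32, 100, 100, 2
net = WordSeg(hidden_dim=100, lstm_num_layers=num_layers,
              vocab_size=vocab_size, word_emb_dim=emb_dim)
x = Variable(torch.LongTensor(batch_size, seq_len).random_(0, vocab_size))
hidden = (Variable(torch.zeros(num_layers, batch_size, emb_dim)),
          Variable(torch.zeros(num_layers, batch_size, emb_dim)))
out, hidden = net(x, hidden)
print(out.size())  # torch.Size([256, 100]) == (batch_size * seq_len, vocab_size)
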
diff --git a/reproduction/CNN-sentence_classification/train.py b/reproduction/CNN-sentence_classification/train.py
index ca4ea96e..763165d4 100644
--- a/reproduction/CNN-sentence_classification/train.py
+++ b/reproduction/CNN-sentence_classification/train.py
@@ -1,5 +1,6 @@
 import os
 
+import import
 import torch
 import torch.nn as nn
 
@@ -54,10 +55,10 @@ for epoch in range(num_epochs):
     cnn.train()
     for i, (sents,labels) in enumerate(train_loader):
         sents = Variable(sents)
-        labels = Variable(labels)
-        if cuda:
-            sents = sents.cuda()
-            labels = labels.cuda()
+        labels = Variable(labels)
+        if cuda:
+            sents = sents.cuda()
+            labels = labels.cuda()
         optimizer.zero_grad()
         outputs = cnn(sents)
         loss = criterion(outputs, labels)
diff --git a/tests/test_word_seg.py b/tests/test_word_seg.py
new file mode 100644
index 00000000..1c94327d
--- /dev/null
+++ b/tests/test_word_seg.py
@@ -0,0 +1,30 @@
+from action.tester import Tester
+from action.trainer import Trainer
+from loader.base_loader import BaseLoader
+from model.word_seg_model import WordSegModel
+
+
+def test_charlm():
+    train_config = Trainer.TrainConfig(epochs=5, validate=False, save_when_better=False,
+                                       log_per_step=10, log_validation=False, batch_size=256)  # == lstm_batch_size * lstm_seq_len
+    trainer = Trainer(train_config)
+
+    model = WordSegModel()
+
+    train_data = BaseLoader("load_train", "./data_for_tests/cws_train").load_lines()
+
+    trainer.train(model, train_data)
+
+    trainer.save_model(model)
+
+    test_config = Tester.TestConfig(save_output=False, validate_in_training=False,
+                                    save_dev_input=False, save_loss=False, batch_size=256)
+    tester = Tester(test_config)
+
+    test_data = BaseLoader("load_test", "./data_for_tests/cws_test").load_lines()
+
+    tester.test(model, test_data)
+
+
+if __name__ == "__main__":
+    test_charlm()
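A note on the batch size used in the test configs above: WordSegModel.data_forward reshapes every flat batch of word indices into an (lstm_batch_size, lstm_seq_len) matrix before feeding the LSTM, so the Trainer and Tester batch size has to equal their product, as the comment in word_seg_model.py states. The snippet below (illustrative, not part of the patch) only restates that arithmetic:

# Constraint restated from word_seg_model.py: each flat batch must fill an 8 x 32 matrix.
lstm_batch_size, lstm_seq_len = 8, 32
required_batch_size = lstm_batch_size * lstm_seq_len
print(required_batch_size)  # 256; a 254-element batch could not be reshaped to (8, 32)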