From d7a8217132a7c67db01fda20ee629321839750b6 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Fri, 6 Jul 2018 23:14:58 +0800 Subject: [PATCH 1/7] finished POSTrainer --- fastNLP/action/trainer.py | 39 ++++++++++- fastNLP/loader/dataset_loader.py | 1 - fastNLP/loader/preprocess.py | 66 +++++++++++-------- fastNLP/models/base_model.py | 14 ++-- fastNLP/models/sequencce_modeling.py | 98 ++++++++++++++++++++++++++++ fastNLP/modules/prototype/example.py | 48 ++++++++++++-- fastNLP/modules/utils.py | 6 ++ test/test_POS_pipeline.py | 29 ++++++++ 8 files changed, 259 insertions(+), 42 deletions(-) create mode 100644 fastNLP/models/sequencce_modeling.py create mode 100644 test/test_POS_pipeline.py diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index 437ab7d2..ac7138e5 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -7,6 +7,7 @@ import torch from fastNLP.action.action import Action from fastNLP.action.action import RandomSampler, Batchifier from fastNLP.action.tester import Tester +from fastNLP.modules.utils import seq_mask class BaseTrainer(Action): @@ -28,6 +29,7 @@ class BaseTrainer(Action): training parameters """ super(BaseTrainer, self).__init__() + self.train_args = train_args self.n_epochs = train_args.epochs self.validate = train_args.validate self.batch_size = train_args.batch_size @@ -163,8 +165,8 @@ class BaseTrainer(Action): :param data: list. Each entry is a sample, which is also a list of features and label(s). E.g. [ - [[feature_1, feature_2, feature_3], [label_1. label_2]], # sample 1 - [[feature_1, feature_2, feature_3], [label_1. label_2]], # sample 2 + [[word_11, word_12, word_13], [label_11. label_12]], # sample 1 + [[word_21, word_22, word_23], [label_21. label_22]], # sample 2 ... ] :return batch_x: list. Each entry is a list of features of a sample. 
@@ -313,6 +315,39 @@ class WordSegTrainer(BaseTrainer): self.optimizer.step() +class POSTrainer(BaseTrainer): + def __init__(self, train_args): + super(POSTrainer, self).__init__(train_args) + self.vocab_size = train_args.vocab_size + self.num_classes = train_args.num_classes + self.max_len = None + self.mask = None + self.batch_x = None + + def prepare_input(self, data_path): + """ + To do: Load pkl files of train/dev/test and embedding + """ + data_train = _pickle.load(open(data_path + "data_train.pkl", "rb")) + data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb")) + return data_train, data_dev + + def data_forward(self, network, x): + seq_len = [len(seq) for seq in x] + x = torch.LongTensor(x) + self.batch_size = x.size(0) + self.max_len = x.size(1) + self.mask = seq_mask(seq_len, self.max_len) + x = network(x) + self.batch_x = x + return x + + def get_loss(self, predict, truth): + truth = torch.LongTensor(truth) + loss, prediction = self.loss_func(self.batch_x, predict, self.mask, self.batch_size, self.max_len) + return loss + + if __name__ == "__name__": train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./") trainer = BaseTrainer(train_args) diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 7132eb3b..284be715 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -15,7 +15,6 @@ class POSDatasetLoader(DatasetLoader): def __init__(self, data_name, data_path): super(POSDatasetLoader, self).__init__(data_name, data_path) - #self.data_set = self.load() def load(self): assert os.path.exists(self.data_path) diff --git a/fastNLP/loader/preprocess.py b/fastNLP/loader/preprocess.py index b8d88c35..8b9c6d88 100644 --- a/fastNLP/loader/preprocess.py +++ b/fastNLP/loader/preprocess.py @@ -46,19 +46,17 @@ class BasePreprocess(object): class POSPreprocess(BasePreprocess): - """ This class is used to preprocess the pos datasets. - In these datasets, each line is divided by '\t' - The first Col is the vocabulary. - The second Col is the labels. + In these datasets, each line is divided by '\t' + while the first Col is the word and the second + Col is the label. Different sentences are divided by an empty line. e.g: Tom label1 and label2 Jerry label1 . label3 - Hello label4 world label5 !
label3 @@ -71,11 +69,13 @@ class POSPreprocess(BasePreprocess): super(POSPreprocess, self).__init__(data, pickle_path) self.word_dict = None self.label_dict = None + self.data = data + self.pickle_path = pickle_path self.build_dict() self.word2id() - self.id2word() + self.vocab_size = self.id2word() self.class2id() - self.id2class() + self.num_classes = self.id2class() self.embedding() self.data_train() self.data_dev() @@ -87,7 +87,8 @@ class POSPreprocess(BasePreprocess): DEFAULT_RESERVED_LABEL[2]: 4} self.label_dict = {} for w in self.data: - if len(w) == 0: + w = w.strip() + if len(w) <= 1: continue word = w.split('\t') @@ -95,10 +96,11 @@ class POSPreprocess(BasePreprocess): index = len(self.word_dict) self.word_dict[word[0]] = index - for label in word[1: ]: - if label not in self.label_dict: - index = len(self.label_dict) - self.label_dict[label] = index + # for label in word[1: ]: + label = word[1] + if label not in self.label_dict: + index = len(self.label_dict) + self.label_dict[label] = index def pickle_exist(self, pickle_name): """ @@ -107,7 +109,7 @@ class POSPreprocess(BasePreprocess): """ if not os.path.exists(self.pickle_path): os.makedirs(self.pickle_path) - file_name = self.pickle_path + pickle_name + file_name = os.path.join(self.pickle_path, pickle_name) if os.path.exists(file_name): return True else: @@ -118,42 +120,48 @@ class POSPreprocess(BasePreprocess): return # nothing will be done if word2id.pkl exists - file_name = self.pickle_path + "word2id.pkl" - with open(file_name, "wb", encoding='utf-8') as f: + file_name = os.path.join(self.pickle_path, "word2id.pkl") + with open(file_name, "wb") as f: _pickle.dump(self.word_dict, f) def id2word(self): if self.pickle_exist("id2word.pkl"): - return + file_name = os.path.join(self.pickle_path, "id2word.pkl") + id2word_dict = _pickle.load(open(file_name, "rb")) + return len(id2word_dict) # nothing will be done if id2word.pkl exists id2word_dict = {} for word in self.word_dict: id2word_dict[self.word_dict[word]] = word - file_name = self.pickle_path + "id2word.pkl" - with open(file_name, "wb", encoding='utf-8') as f: + file_name = os.path.join(self.pickle_path, "id2word.pkl") + with open(file_name, "wb") as f: _pickle.dump(id2word_dict, f) + return len(id2word_dict) def class2id(self): if self.pickle_exist("class2id.pkl"): return # nothing will be done if class2id.pkl exists - file_name = self.pickle_path + "class2id.pkl" - with open(file_name, "wb", encoding='utf-8') as f: + file_name = os.path.join(self.pickle_path, "class2id.pkl") + with open(file_name, "wb") as f: _pickle.dump(self.label_dict, f) def id2class(self): if self.pickle_exist("id2class.pkl"): - return + file_name = os.path.join(self.pickle_path, "id2class.pkl") + id2class_dict = _pickle.load(open(file_name, "rb")) + return len(id2class_dict) # nothing will be done if id2class.pkl exists id2class_dict = {} for label in self.label_dict: id2class_dict[self.label_dict[label]] = label - file_name = self.pickle_path + "id2class.pkl" - with open(file_name, "wb", encoding='utf-8') as f: + file_name = os.path.join(self.pickle_path, "id2class.pkl") + with open(file_name, "wb") as f: _pickle.dump(id2class_dict, f) + return len(id2class_dict) def embedding(self): if self.pickle_exist("embedding.pkl"): @@ -168,22 +176,26 @@ class POSPreprocess(BasePreprocess): data_train = [] sentence = [] for w in self.data: - if len(w) == 0: + w = w.strip() + if len(w) <= 1: wid = [] lid = [] for i in range(len(sentence)): + # if sentence[i][0]=="": + # print("") 
wid.append(self.word_dict[sentence[i][0]]) lid.append(self.label_dict[sentence[i][1]]) data_train.append((wid, lid)) sentence = [] + continue sentence.append(w.split('\t')) - file_name = self.pickle_path + "data_train.pkl" - with open(file_name, "wb", encoding='utf-8') as f: + file_name = os.path.join(self.pickle_path, "data_train.pkl") + with open(file_name, "wb") as f: _pickle.dump(data_train, f) def data_dev(self): pass def data_test(self): - pass + pass \ No newline at end of file diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 9249e2e3..54e28687 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -4,9 +4,9 @@ import torch class BaseModel(torch.nn.Module): """Base PyTorch model for all models. Three network modules presented: - - embedding module + - encoder module - aggregation module - - output module + - decoder module Subclasses must implement these three modules with "components". """ @@ -15,21 +15,20 @@ class BaseModel(torch.nn.Module): def forward(self, *inputs): x = self.encode(*inputs) - x = self.aggregation(x) - x = self.output(x) + x = self.aggregate(x) + x = self.decode(x) return x def encode(self, x): raise NotImplementedError - def aggregation(self, x): + def aggregate(self, x): raise NotImplementedError - def output(self, x): + def decode(self, x): raise NotImplementedError - class Vocabulary(object): """A look-up table that allows you to access `Lexeme` objects. The `Vocab` instance also provides access to the `StringStore`, and owns underlying @@ -93,3 +92,4 @@ class Token(object): self.doc = doc self.token = doc[offset] self.i = offset + diff --git a/fastNLP/models/sequencce_modeling.py b/fastNLP/models/sequencce_modeling.py new file mode 100644 index 00000000..af6931e4 --- /dev/null +++ b/fastNLP/models/sequencce_modeling.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn +from torch.nn import functional as F + +from fastNLP.models.base_model import BaseModel +from fastNLP.modules.CRF import ContionalRandomField + + +class SeqLabeling(BaseModel): + """ + PyTorch Network for sequence labeling + """ + + def __init__(self, hidden_dim, + rnn_num_layerd, + num_classes, + vocab_size, + word_emb_dim=100, + init_emb=None, + rnn_mode="gru", + bi_direction=False, + dropout=0.5, + use_crf=True): + super(SeqLabeling, self).__init__() + + self.Emb = nn.Embedding(vocab_size, word_emb_dim) + if init_emb: + self.Emb.weight = nn.Parameter(init_emb) + + self.num_classes = num_classes + self.input_dim = word_emb_dim + self.layers = rnn_num_layerd + self.hidden_dim = hidden_dim + self.bi_direction = bi_direction + self.dropout = dropout + self.mode = rnn_mode + + if self.mode == "lstm": + self.rnn = nn.LSTM(self.input_dim, self.hidden_dim, self.layers, batch_first=True, + bidirectional=self.bi_direction, dropout=self.dropout) + elif self.mode == "gru": + self.rnn = nn.GRU(self.input_dim, self.hidden_dim, self.layers, batch_first=True, + bidirectional=self.bi_direction, dropout=self.dropout) + elif self.mode == "rnn": + self.rnn = nn.RNN(self.input_dim, self.hidden_dim, self.layers, batch_first=True, + bidirectional=self.bi_direction, dropout=self.dropout) + else: + raise Exception + if bi_direction: + self.linear = nn.Linear(self.hidden_dim * 2, self.num_classes) + else: + self.linear = nn.Linear(self.hidden_dim, self.num_classes) + self.use_crf = use_crf + if self.use_crf: + self.crf = ContionalRandomField(num_classes) + + def forward(self, x): + + x = self.embedding(x) + x, hidden = self.encode(x) + x = 
self.aggregation(x) x = self.output(x) return x + + def embedding(self, x): + return self.Emb(x) + + def encode(self, x): + return self.rnn(x) + + def aggregate(self, x): + return x + + def decode(self, x): + x = self.linear(x) + return x + + def loss(self, x, y, mask, batch_size, max_len): + """ + Negative log likelihood loss. + :param x: + :param y: + :param seq_len: + :return loss: + prediction: + """ + if self.use_crf: + total_loss = self.crf(x, y, mask) + tag_seq = self.crf.viterbi_decode(x, mask) + else: + # error + loss_function = nn.NLLLoss(ignore_index=0, size_average=False) + x = x.view(batch_size * max_len, -1) + score = F.log_softmax(x) + total_loss = loss_function(score, y.view(batch_size * max_len)) + _, tag_seq = torch.max(score) + tag_seq = tag_seq.view(batch_size, max_len) + return torch.mean(total_loss), tag_seq diff --git a/fastNLP/modules/prototype/example.py b/fastNLP/modules/prototype/example.py index a19898c6..d23a0ec2 100644 --- a/fastNLP/modules/prototype/example.py +++ b/fastNLP/modules/prototype/example.py @@ -1,12 +1,13 @@ -import torch -import torch.nn as nn -import encoder +import time + import aggregation +import dataloader import embedding +import encoder import predict +import torch +import torch.nn as nn import torch.optim as optim -import time -import dataloader WORD_NUM = 357361 WORD_SIZE = 100 @@ -16,6 +17,30 @@ R = 10 MLP_HIDDEN = 2000 CLASSES_NUM = 5 +from fastNLP.models.base_model import BaseModel +from fastNLP.action.trainer import BaseTrainer + + +class MyNet(BaseModel): + def __init__(self): + super(MyNet, self).__init__() + self.embedding = embedding.Lookuptable(WORD_NUM, WORD_SIZE) + self.encoder = encoder.Lstm(WORD_SIZE, HIDDEN_SIZE, 1, 0.5, True) + self.aggregation = aggregation.Selfattention(2 * HIDDEN_SIZE, D_A, R) + self.predict = predict.MLP(R * HIDDEN_SIZE * 2, MLP_HIDDEN, CLASSES_NUM) + self.penalty = None + + def encode(self, x): + return self.encoder(self.embedding(x)) + + def aggregate(self, x): + x, self.penalty = self.aggregation(x) + return x + + def decode(self, x): + return [self.predict(x), self.penalty] + + class Net(nn.Module): """ A model for sentiment analysis using lstm and self-attention @@ -34,6 +59,19 @@ class Net(nn.Module): x = self.predict(x) return x, penalty + +class MyTrainer(BaseTrainer): + def __init__(self, args): + super(MyTrainer, self).__init__(args) + self.optimizer = None + + def define_optimizer(self): + self.optimizer = optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9) + + def define_loss(self): + self.loss_func = nn.CrossEntropyLoss() + + def train(model_dict=None, using_cuda=True, learning_rate=0.06,\ momentum=0.3, batch_size=32, epochs=5, coef=1.0, interval=10): """ diff --git a/fastNLP/modules/utils.py b/fastNLP/modules/utils.py index 15afe883..a6b31a20 100644 --- a/fastNLP/modules/utils.py +++ b/fastNLP/modules/utils.py @@ -7,3 +7,9 @@ def mask_softmax(matrix, mask): else: raise NotImplementedError return result + + +def seq_mask(seq_len, max_len): + mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)] + mask = torch.stack(mask, 1) + return mask diff --git a/test/test_POS_pipeline.py b/test/test_POS_pipeline.py new file mode 100644 index 00000000..db4232e7 --- /dev/null +++ b/test/test_POS_pipeline.py @@ -0,0 +1,29 @@ +from fastNLP.action.trainer import POSTrainer +from fastNLP.loader.dataset_loader import POSDatasetLoader +from fastNLP.loader.preprocess import POSPreprocess +from fastNLP.models.sequencce_modeling import SeqLabeling + +data_name = "people" +data_path =
"data/people.txt" +pickle_path = "data" + +if __name__ == "__main__": + # Data Loader + pos = POSDatasetLoader(data_name, data_path) + train_data = pos.load_lines() + + # Preprocessor + p = POSPreprocess(train_data, pickle_path) + vocab_size = p.vocab_size + num_classes = p.num_classes + + # Trainer + train_args = POSTrainer.TrainConfig(epochs=20, batch_size=1, num_classes=num_classes, + vocab_size=vocab_size, pickle_path=pickle_path) + trainer = POSTrainer(train_args) + + # Model + model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True) + + # Start training. + trainer.train(model) From cca276b8c09add219bbbcaa8cbf78d786358cea3 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 7 Jul 2018 16:57:57 +0800 Subject: [PATCH 2/7] - optimize package calling from test files - add people.txt in data_for_tests - To do: incorrect CRF param in POS_pipeline --- fastNLP/action/trainer.py | 35 ++++++++++++--- fastNLP/loader/dataset_loader.py | 2 +- fastNLP/models/sequencce_modeling.py | 9 +++- test/data_for_tests/people.txt | 67 ++++++++++++++++++++++++++++ test/test_POS_pipeline.py | 11 +++-- 5 files changed, 111 insertions(+), 13 deletions(-) create mode 100644 test/data_for_tests/people.txt diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index ac7138e5..94a704f9 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -31,12 +31,13 @@ class BaseTrainer(Action): super(BaseTrainer, self).__init__() self.train_args = train_args self.n_epochs = train_args.epochs - self.validate = train_args.validate + # self.validate = train_args.validate self.batch_size = train_args.batch_size self.pickle_path = train_args.pickle_path self.model = None self.iterator = None self.loss_func = None + self.optimizer = None def train(self, network): """General training loop. @@ -316,6 +317,8 @@ class WordSegTrainer(BaseTrainer): class POSTrainer(BaseTrainer): + TrainConfig = namedtuple("config", ["epochs", "batch_size", "pickle_path", "num_classes", "vocab_size"]) + def __init__(self, train_args): super(POSTrainer, self).__init__(train_args) self.vocab_size = train_args.vocab_size @@ -328,9 +331,9 @@ class POSTrainer(BaseTrainer): """ To do: Load pkl files of train/dev/test and embedding """ - data_train = _pickle.load(open(data_path + "data_train.pkl", "rb")) - data_dev = _pickle.load(open(data_path + "data_dev.pkl", "rb")) - return data_train, data_dev + data_train = _pickle.load(open(data_path + "/data_train.pkl", "rb")) + data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb")) + return data_train, data_dev, 0, 1 def data_forward(self, network, x): seq_len = [len(seq) for seq in x] @@ -342,10 +345,28 @@ class POSTrainer(BaseTrainer): self.batch_x = x return x + def mode(self, test=False): + if test: + self.model.eval() + else: + self.model.train() + + def define_optimizer(self): + self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9) + def get_loss(self, predict, truth): - truth = torch.LongTensor(truth) - loss, prediction = self.loss_func(self.batch_x, predict, self.mask, self.batch_size, self.max_len) - return loss + """ + Compute loss given prediction and ground truth. 
+ :param predict: prediction label vector + :param truth: ground truth label vector + :return: a scalar + """ + if self.loss_func is None: + if hasattr(self.model, "loss"): + self.loss_func = self.model.loss + else: + self.define_loss() + return self.loss_func(self.batch_x, predict, self.mask, self.batch_size, self.max_len) if __name__ == "__name__": diff --git a/fastNLP/loader/dataset_loader.py b/fastNLP/loader/dataset_loader.py index 284be715..d57a48db 100644 --- a/fastNLP/loader/dataset_loader.py +++ b/fastNLP/loader/dataset_loader.py @@ -23,7 +23,7 @@ class POSDatasetLoader(DatasetLoader): return line def load_lines(self): - assert os.path.exists(self.data_path) + assert (os.path.exists(self.data_path)) with open(self.data_path, "r", encoding="utf-8") as f: lines = f.readlines() return lines diff --git a/fastNLP/models/sequencce_modeling.py b/fastNLP/models/sequencce_modeling.py index af6931e4..ba96d4b6 100644 --- a/fastNLP/models/sequencce_modeling.py +++ b/fastNLP/models/sequencce_modeling.py @@ -58,8 +58,8 @@ class SeqLabeling(BaseModel): x = self.embedding(x) x, hidden = self.encode(x) - x = self.aggregation(x) - x = self.output(x) + x = self.aggregate(x) + x = self.decode(x) return x def embedding(self, x): @@ -84,6 +84,11 @@ class SeqLabeling(BaseModel): :return loss: prediction: """ + x = x.float() + y = y.long() + mask = mask.byte() + print(x.shape, y.shape, mask.shape) + if self.use_crf: total_loss = self.crf(x, y, mask) tag_seq = self.crf.viterbi_decode(x, mask) diff --git a/test/data_for_tests/people.txt b/test/data_for_tests/people.txt new file mode 100644 index 00000000..f34c85cb --- /dev/null +++ b/test/data_for_tests/people.txt @@ -0,0 +1,67 @@ +迈 B-v +向 E-v +充 B-v +满 E-v +希 B-n +望 E-n +的 S-u +新 S-a +世 B-n +纪 E-n +— B-w +— E-w +一 B-t +九 M-t +九 M-t +八 M-t +年 E-t +新 B-t +年 E-t +讲 B-n +话 E-n +( S-w +附 S-v +图 B-n +片 E-n +1 S-m +张 S-q +) S-w + +中 B-nt +共 M-nt +中 M-nt +央 E-nt +总 B-n +书 M-n +记 E-n +、 S-w +国 B-n +家 E-n +主 B-n +席 E-n +江 B-nr +泽 M-nr +民 E-nr + +( S-w +一 B-t +九 M-t +九 M-t +七 M-t +年 E-t +十 B-t +二 M-t +月 E-t +三 B-t +十 M-t +一 M-t +日 E-t +) S-w + +1 B-t +2 M-t +月 E-t +3 B-t +1 M-t +日 E-t +, S-w \ No newline at end of file diff --git a/test/test_POS_pipeline.py b/test/test_POS_pipeline.py index db4232e7..66e418c6 100644 --- a/test/test_POS_pipeline.py +++ b/test/test_POS_pipeline.py @@ -1,11 +1,15 @@ +import sys + +sys.path.append("..") + from fastNLP.action.trainer import POSTrainer from fastNLP.loader.dataset_loader import POSDatasetLoader from fastNLP.loader.preprocess import POSPreprocess from fastNLP.models.sequencce_modeling import SeqLabeling -data_name = "people" -data_path = "data/people.txt" -pickle_path = "data" +data_name = "people.txt" +data_path = "data_for_tests/people.txt" +pickle_path = "data_for_tests" if __name__ == "__main__": # Data Loader @@ -27,3 +31,4 @@ if __name__ == "__main__": # Start training. trainer.train(model) + From 4c9c791304d29f4289c87d6fe6b67ff40e5bbdc0 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Sat, 7 Jul 2018 16:59:59 +0800 Subject: [PATCH 3/7] cancel restriction for base model --- fastNLP/models/base_model.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/fastNLP/models/base_model.py b/fastNLP/models/base_model.py index 54e28687..24dfdb85 100644 --- a/fastNLP/models/base_model.py +++ b/fastNLP/models/base_model.py @@ -3,31 +3,12 @@ import torch class BaseModel(torch.nn.Module): """Base PyTorch model for all models. 
- Three network modules presented: - - encoder module - - aggregation module - - decoder module - Subclasses must implement these three modules with "components". + To do: add some useful common features """ def __init__(self): super(BaseModel, self).__init__() - def forward(self, *inputs): - x = self.encode(*inputs) - x = self.aggregate(x) - x = self.decode(x) - return x - - def encode(self, x): - raise NotImplementedError - - def aggregate(self, x): - raise NotImplementedError - - def decode(self, x): - raise NotImplementedError - class Vocabulary(object): """A look-up table that allows you to access `Lexeme` objects. The `Vocab` From 83c032df5d661e0860695d80c37296480555b833 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 10 Jul 2018 18:51:42 +0800 Subject: [PATCH 4/7] fix bug in CRF comments; optimize PyTorch type conversion. --- fastNLP/action/trainer.py | 23 ++++++++------- fastNLP/models/sequencce_modeling.py | 42 ++++++++++++---------------- fastNLP/modules/CRF.py | 4 +-- 3 files changed, 33 insertions(+), 36 deletions(-) diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index 94a704f9..1f22ef28 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -170,8 +170,8 @@ class BaseTrainer(Action): [[word_21, word_22, word_23], [label_21. label_22]], # sample 2 ... ] - :return batch_x: list. Each entry is a list of features of a sample. - batch_y: list. Each entry is a list of labels of a sample. + :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] + batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] """ if self.iterator is None: self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True)) @@ -325,7 +325,6 @@ class POSTrainer(BaseTrainer): self.num_classes = train_args.num_classes self.max_len = None self.mask = None - self.batch_x = None def prepare_input(self, data_path): """ @@ -336,14 +335,18 @@ class POSTrainer(BaseTrainer): return data_train, data_dev, 0, 1 def data_forward(self, network, x): + """ + :param network: the PyTorch model + :param x: list of list, [batch_size, max_len] + :return y: [batch_size, num_classes] + """ seq_len = [len(seq) for seq in x] - x = torch.LongTensor(x) + x = torch.Tensor(x).long() self.batch_size = x.size(0) self.max_len = x.size(1) self.mask = seq_mask(seq_len, self.max_len) - x = network(x) - self.batch_x = x - return x + y = network(x) + return y def mode(self, test=False): if test: @@ -357,8 +360,8 @@ class POSTrainer(BaseTrainer): def get_loss(self, predict, truth): """ Compute loss given prediction and ground truth. 
- :param predict: prediction label vector + :param predict: prediction label vector, [batch_size, num_classes] - :param truth: ground truth label vector + :param truth: ground truth label vector, [batch_size, max_len] :return: a scalar """ if self.loss_func is None: @@ -366,7 +369,7 @@ class POSTrainer(BaseTrainer): self.loss_func = self.model.loss else: self.define_loss() - return self.loss_func(self.batch_x, predict, self.mask, self.batch_size, self.max_len) + return self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len) if __name__ == "__name__": diff --git a/fastNLP/models/sequencce_modeling.py b/fastNLP/models/sequencce_modeling.py index ba96d4b6..96f09f80 100644 --- a/fastNLP/models/sequencce_modeling.py +++ b/fastNLP/models/sequencce_modeling.py @@ -12,7 +12,7 @@ class SeqLabeling(BaseModel): """ def __init__(self, hidden_dim, - rnn_num_layerd, + rnn_num_layer, num_classes, vocab_size, word_emb_dim=100, @@ -29,7 +29,7 @@ class SeqLabeling(BaseModel): self.num_classes = num_classes self.input_dim = word_emb_dim - self.layers = rnn_num_layerd + self.layers = rnn_num_layer self.hidden_dim = hidden_dim self.bi_direction = bi_direction self.dropout = dropout @@ -55,32 +55,26 @@ class SeqLabeling(BaseModel): self.crf = ContionalRandomField(num_classes) def forward(self, x): - - x = self.embedding(x) - x, hidden = self.encode(x) - x = self.aggregate(x) - x = self.decode(x) - return x - - def embedding(self, x): - return self.Emb(x) - - def encode(self, x): - return self.rnn(x) - - def aggregate(self, x): - return x - - def decode(self, x): - x = self.linear(x) - return x + """ + :param x: LongTensor, [batch_size, max_len] + :return y: [batch_size, max_len, num_classes] + """ + x = self.Emb(x) + # [batch_size, max_len, word_emb_dim] + x, hidden = self.rnn(x) + # [batch_size, max_len, hidden_size * direction] + y = self.linear(x) + # [batch_size, max_len, num_classes] + return y def loss(self, x, y, mask, batch_size, max_len): """ Negative log likelihood loss. - :param x: - :param y: - :param seq_len: + :param x: FloatTensor, [batch_size, max_len, tag_size] + :param y: LongTensor, [batch_size, max_len] + :param mask: ByteTensor, [batch_size, max_len] + :param batch_size: int + :param max_len: int :return loss: prediction: """ diff --git a/fastNLP/modules/CRF.py b/fastNLP/modules/CRF.py index 6361b93d..96c84dca 100644 --- a/fastNLP/modules/CRF.py +++ b/fastNLP/modules/CRF.py @@ -82,7 +82,7 @@ class ContionalRandomField(nn.Module): def _glod_score(self, feats, tags, masks): """ Compute the score for the gold path. - :param feats: FloatTensor, batch_size x tag_size x tag_size + :param feats: FloatTensor, batch_size x max_len x tag_size :param tags: LongTensor, batch_size x max_len :param masks: ByteTensor, batch_size x max_len :return:FloatTensor, batch_size @@ -118,7 +118,7 @@ class ContionalRandomField(nn.Module): def forward(self, feats, tags, masks): """ Calculate the neg log likelihood - :param feats:FloatTensor, batch_size x tag_size x tag_size + :param feats:FloatTensor, batch_size x max_len x tag_size :param tags:LongTensor, batch_size x max_len :param masks:ByteTensor batch_size x max_len :return:FloatTensor, batch_size From c98d5924b585a7bfdc127e017d8cc2ff444d7e25 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 10 Jul 2018 20:46:35 +0800 Subject: [PATCH 5/7] sequence labeling ready to Train!
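The namedtuple-based TrainConfig is replaced by a plain dict of training arguments, so every trainer subclass can read exactly the keys it needs. A minimal sketch of the intended call site, mirroring test/test_POS_pipeline.py as updated in this patch (the argument values are illustrative only):

    train_args = {"epochs": 20, "batch_size": 1, "pickle_path": pickle_path,
                  "validate": False, "num_classes": num_classes, "vocab_size": vocab_size}
    trainer = POSTrainer(train_args)
    model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True)
    trainer.train(model)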
--- fastNLP/action/trainer.py | 47 ++++++++++++------- ...encce_modeling.py => sequence_modeling.py} | 2 +- requirements.txt | 4 +- test/test_POS_pipeline.py | 9 ++-- 4 files changed, 39 insertions(+), 23 deletions(-) rename fastNLP/models/{sequencce_modeling.py => sequence_modeling.py} (98%) diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index 1f22ef28..6f51435a 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -1,5 +1,4 @@ import _pickle -from collections import namedtuple import numpy as np import torch @@ -22,18 +21,22 @@ class BaseTrainer(Action): - grad_backward - get_loss """ - TrainConfig = namedtuple("config", ["epochs", "validate", "batch_size", "pickle_path"]) def __init__(self, train_args): """ - training parameters + :param train_args: dict of (key, value) + + The base trainer requires the following keys: + - epochs: int, the number of epochs in training + - validate: bool, whether or not to validate on dev set + - batch_size: int + - pickle_path: str, the path to pickle files for pre-processing """ super(BaseTrainer, self).__init__() - self.train_args = train_args - self.n_epochs = train_args.epochs - # self.validate = train_args.validate - self.batch_size = train_args.batch_size - self.pickle_path = train_args.pickle_path + self.n_epochs = train_args["epochs"] + self.validate = train_args["validate"] + self.batch_size = train_args["batch_size"] + self.pickle_path = train_args["pickle_path"] self.model = None self.iterator = None self.loss_func = None @@ -66,8 +69,9 @@ class BaseTrainer(Action): for epoch in range(self.n_epochs): self.mode(test=False) - self.define_optimizer() + self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True)) + for step in range(iterations): batch_x, batch_y = self.batchify(self.batch_size, data_train) @@ -173,8 +177,6 @@ class BaseTrainer(Action): :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] """ - if self.iterator is None: - self.iterator = iter(Batchifier(RandomSampler(data), batch_size, drop_last=True)) indices = next(self.iterator) batch = [data[idx] for idx in indices] batch_x = [sample[0] for sample in batch] @@ -304,6 +306,7 @@ class WordSegTrainer(BaseTrainer): self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.85) def get_loss(self, predict, truth): + truth = torch.Tensor(truth) self._loss = torch.nn.CrossEntropyLoss(predict, truth) return self._loss @@ -316,13 +319,16 @@ class WordSegTrainer(BaseTrainer): self.optimizer.step() + class POSTrainer(BaseTrainer): - TrainConfig = namedtuple("config", ["epochs", "batch_size", "pickle_path", "num_classes", "vocab_size"]) + """ + Trainer for Sequence Modeling + """ def __init__(self, train_args): super(POSTrainer, self).__init__(train_args) - self.vocab_size = train_args.vocab_size - self.num_classes = train_args.num_classes + self.vocab_size = train_args["vocab_size"] + self.num_classes = train_args["num_classes"] self.max_len = None self.mask = None @@ -357,6 +363,13 @@ class POSTrainer(BaseTrainer): def define_optimizer(self): self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9) + def grad_backward(self, loss): + self.model.zero_grad() + loss.backward() + + def update(self): + self.optimizer.step() + def get_loss(self, predict, truth): """ Compute loss given prediction and ground truth. 
@@ -364,16 +377,18 @@ class POSTrainer(BaseTrainer): :param truth: ground truth label vector, [batch_size, max_len] :return: a scalar """ + truth = torch.Tensor(truth) if self.loss_func is None: if hasattr(self.model, "loss"): self.loss_func = self.model.loss else: self.define_loss() - return self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len) + loss, prediction = self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len) + return loss if __name__ == "__name__": - train_args = BaseTrainer.TrainConfig(epochs=1, validate=False, batch_size=3, pickle_path="./") + train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"} trainer = BaseTrainer(train_args) data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10] trainer.batchify(batch_size=3, data=data_train) diff --git a/fastNLP/models/sequencce_modeling.py b/fastNLP/models/sequence_modeling.py similarity index 98% rename from fastNLP/models/sequencce_modeling.py rename to fastNLP/models/sequence_modeling.py index 96f09f80..80d13cf3 100644 --- a/fastNLP/models/sequencce_modeling.py +++ b/fastNLP/models/sequence_modeling.py @@ -81,7 +81,7 @@ class SeqLabeling(BaseModel): x = x.float() y = y.long() mask = mask.byte() - print(x.shape, y.shape, mask.shape) + # print(x.shape, y.shape, mask.shape) if self.use_crf: total_loss = self.crf(x, y, mask) diff --git a/requirements.txt b/requirements.txt index 0fc94538..d961dd92 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -numpy==1.14.2 +numpy>=1.14.2 torch==0.4.0 -torchvision==0.1.8 +torchvision>=0.1.8 diff --git a/test/test_POS_pipeline.py b/test/test_POS_pipeline.py index 66e418c6..c6e3fd83 100644 --- a/test/test_POS_pipeline.py +++ b/test/test_POS_pipeline.py @@ -5,7 +5,7 @@ sys.path.append("..") from fastNLP.action.trainer import POSTrainer from fastNLP.loader.dataset_loader import POSDatasetLoader from fastNLP.loader.preprocess import POSPreprocess -from fastNLP.models.sequencce_modeling import SeqLabeling +from fastNLP.models.sequence_modeling import SeqLabeling data_name = "people.txt" data_path = "data_for_tests/people.txt" @@ -22,13 +22,14 @@ if __name__ == "__main__": num_classes = p.num_classes # Trainer - train_args = POSTrainer.TrainConfig(epochs=20, batch_size=1, num_classes=num_classes, - vocab_size=vocab_size, pickle_path=pickle_path) + train_args = {"epochs": 20, "batch_size": 1, "num_classes": num_classes, + "vocab_size": vocab_size, "pickle_path": pickle_path, "validate": False} trainer = POSTrainer(train_args) # Model model = SeqLabeling(100, 1, num_classes, vocab_size, bi_direction=True) - # Start training. 
+ # Start training trainer.train(model) + print("Training finished!") From a73087e913ea6c7faad53a104983f87b0a8b2bef Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Tue, 10 Jul 2018 22:00:24 +0800 Subject: [PATCH 6/7] refactor Tester; Tester + Trainer for seq modeling work --- fastNLP/action/tester.py | 161 +++++++++++++++++++++++++------------- fastNLP/action/trainer.py | 36 +++++---- test/test_POS_pipeline.py | 2 +- 3 files changed, 125 insertions(+), 74 deletions(-) diff --git a/fastNLP/action/tester.py b/fastNLP/action/tester.py index 7f660bb0..2a71cf4d 100644 --- a/fastNLP/action/tester.py +++ b/fastNLP/action/tester.py @@ -1,87 +1,136 @@ -from collections import namedtuple +import _pickle -import numpy as np +import torch from fastNLP.action.action import Action +from fastNLP.action.action import RandomSampler, Batchifier +from fastNLP.modules.utils import seq_mask -class Tester(Action): +class BaseTester(Action): """docstring for Tester""" - TestConfig = namedtuple("config", ["validate_in_training", "save_dev_input", "save_output", - "save_loss", "batch_size"]) - def __init__(self, test_args): """ :param test_args: named tuple """ - super(Tester, self).__init__() - self.validate_in_training = test_args.validate_in_training - self.save_dev_input = test_args.save_dev_input + super(BaseTester, self).__init__() + self.validate_in_training = test_args["validate_in_training"] self.valid_x = None self.valid_y = None - self.save_output = test_args.save_output + self.save_output = test_args["save_output"] self.output = None - self.save_loss = test_args.save_loss + self.save_loss = test_args["save_loss"] self.mean_loss = None - self.batch_size = test_args.batch_size - - def test(self, network, data): - print("testing") - network.mode(test=True) # turn on the testing mode - if self.save_dev_input: - if self.valid_x is None: - valid_x, valid_y = network.prepare_input(data) - self.valid_x = valid_x - self.valid_y = valid_y - else: - valid_x = self.valid_x - valid_y = self.valid_y - else: - valid_x, valid_y = network.prepare_input(data) + self.batch_size = test_args["batch_size"] + self.pickle_path = test_args["pickle_path"] + self.iterator = None - # split into batches by self.batch_size - iterations, test_batch_generator = self.batchify(self.batch_size, valid_x, valid_y) + def test(self, network): + # print("--------------testing----------------") + self.mode(network, test=True) - batch_output = list() - loss_history = list() - # turn on the testing mode of the network - network.mode(test=True) + dev_data = self.prepare_input(self.pickle_path) - for step in range(iterations): - batch_x, batch_y = test_batch_generator.__next__() + self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) - # forward pass from test input to predicted output - prediction = network.data_forward(batch_x) + batch_output = list() + eval_history = list() + num_iter = len(dev_data) // self.batch_size + + for step in range(num_iter): + batch_x, batch_y = self.batchify(dev_data) - loss = network.get_loss(prediction, batch_y) + prediction = self.data_forward(network, batch_x) + eval_results = self.evaluate(prediction, batch_y) if self.save_output: - batch_output.append(prediction.data) + batch_output.append(prediction) if self.save_loss: - loss_history.append(loss) - self.log(self.make_log(step, loss)) - - if self.save_loss: - self.mean_loss = np.mean(np.array(loss_history)) - if self.save_output: - self.output = self.make_output(batch_output) + eval_history.append(eval_results) - @property 
- def loss(self): - return self.mean_loss + def prepare_input(self, data_path): + data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb")) + return data_dev - @property - def result(self): - return self.output + def batchify(self, data): + """ + 1. Perform batching from data and produce a batch of training data. + 2. Add padding. + :param data: list. Each entry is a sample, which is also a list of features and label(s). + E.g. + [ + [[word_11, word_12, word_13], [label_11, label_12]], # sample 1 + [[word_21, word_22, word_23], [label_21, label_22]], # sample 2 + ... + ] + :return batch_x: list. Each entry is a list of features of a sample. [batch_size, max_len] + batch_y: list. Each entry is a list of labels of a sample. [batch_size, num_labels] + """ + indices = next(self.iterator) + batch = [data[idx] for idx in indices] + batch_x = [sample[0] for sample in batch] + batch_y = [sample[1] for sample in batch] + batch_x = self.pad(batch_x) + return batch_x, batch_y @staticmethod - def make_output(batch_outputs): - # construct full prediction with batch outputs - return np.concatenate(batch_outputs, axis=0) + def pad(batch, fill=0): + """ + Pad a batch of samples to maximum length. + :param batch: list of list + :param fill: word index to pad, default 0. + :return: a padded batch + """ + max_length = max([len(x) for x in batch]) + for idx, sample in enumerate(batch): + if len(sample) < max_length: + batch[idx] = sample + [fill] * (max_length - len(sample)) + return batch - def load_config(self, args): + def data_forward(self, network, data): raise NotImplementedError - def load_dataset(self, args): + def evaluate(self, predict, truth): raise NotImplementedError + + @property + def matrices(self): + raise NotImplementedError + + def mode(self, model, test=True): + """To do: combine this function with Trainer""" + if test: + model.eval() + else: + model.train() + + +class POSTester(BaseTester): + """ + Tester for sequence labeling. + """ + + def __init__(self, test_args): + super(POSTester, self).__init__(test_args) + self.max_len = None + self.mask = None + + def data_forward(self, network, x): + """To Do: combine with Trainer + + :param network: the PyTorch model + :param x: list of list, [batch_size, max_len] + :return y: [batch_size, num_classes] + """ + seq_len = [len(seq) for seq in x] + x = torch.Tensor(x).long() + self.batch_size = x.size(0) + self.max_len = x.size(1) + self.mask = seq_mask(seq_len, self.max_len) + y = network(x) + return y + + def evaluate(self, predict, truth): + """To Do: """ + return 0 diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index 94a704f9..034b46ca 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -5,7 +5,7 @@ import torch from fastNLP.action.action import Action from fastNLP.action.action import RandomSampler, Batchifier -from fastNLP.action.tester import Tester +from fastNLP.action.tester import POSTester from fastNLP.modules.utils import seq_mask @@ -43,7 +43,7 @@ class BaseTrainer(Action): self.optimizer = None def train(self, network): - """General training loop. + """General Training Steps :param network: a model The method is framework independent. @@ -57,23 +57,27 @@ class BaseTrainer(Action): - update Subclasses must implement these methods with a specific framework.
""" + # prepare model and data self.model = network data_train, data_dev, data_test, embedding = self.prepare_input(self.pickle_path) - test_args = Tester.TestConfig(save_output=True, validate_in_training=True, - save_dev_input=True, save_loss=True, batch_size=self.batch_size) - evaluator = Tester(test_args) + # define tester over dev data + valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True, + "save_loss": True, "batch_size": self.batch_size, "pickle_path": self.pickle_path} + validator = POSTester(valid_args) - best_loss = 1e10 + # main training epochs iterations = len(data_train) // self.batch_size - for epoch in range(self.n_epochs): + + # turn on network training mode; define optimizer; prepare batch iterator self.mode(test=False) self.define_optimizer() self.iterator = iter(Batchifier(RandomSampler(data_train), self.batch_size, drop_last=True)) + # training iterations in one epoch for step in range(iterations): - batch_x, batch_y = self.batchify(self.batch_size, data_train) + batch_x, batch_y = self.batchify(data_train) prediction = self.data_forward(network, batch_x) @@ -84,9 +88,7 @@ class BaseTrainer(Action): if self.validate: if data_dev is None: raise RuntimeError("No validation data provided.") - evaluator.test(network, data_dev) - if evaluator.loss < best_loss: - best_loss = evaluator.loss + validator.test(network) # finish training @@ -162,11 +164,10 @@ class BaseTrainer(Action): """ raise NotImplementedError - def batchify(self, batch_size, data): + def batchify(self, data): """ 1. Perform batching from data and produce a batch of training data. 2. Add padding. - :param batch_size: int, the size of a batch :param data: list. Each entry is a sample, which is also a list of features and label(s). E.g. 
[ @@ -200,7 +201,9 @@ class BaseTrainer(Action): class ToyTrainer(BaseTrainer): - """A simple trainer for a PyTorch model.""" + """ + deprecated + """ def __init__(self, train_args): super(ToyTrainer, self).__init__(train_args) @@ -235,7 +238,7 @@ class ToyTrainer(BaseTrainer): class WordSegTrainer(BaseTrainer): """ - reserve for changes + deprecated """ def __init__(self, train_args): @@ -319,7 +322,6 @@ class WordSegTrainer(BaseTrainer): self.optimizer.step() - class POSTrainer(BaseTrainer): """ Trainer for Sequence Modeling @@ -391,4 +393,4 @@ if __name__ == "__name__": train_args = {"epochs": 1, "validate": False, "batch_size": 3, "pickle_path": "./"} trainer = BaseTrainer(train_args) data_train = [[[1, 2, 3, 4], [0]] * 10] + [[[1, 3, 5, 2], [1]] * 10] - trainer.batchify(batch_size=3, data=data_train) + trainer.batchify(data=data_train) diff --git a/test/test_POS_pipeline.py b/test/test_POS_pipeline.py index c6e3fd83..af22e3b9 100644 --- a/test/test_POS_pipeline.py +++ b/test/test_POS_pipeline.py @@ -23,7 +23,7 @@ if __name__ == "__main__": # Trainer train_args = {"epochs": 20, "batch_size": 1, "num_classes": num_classes, - "vocab_size": vocab_size, "pickle_path": pickle_path, "validate": False} + "vocab_size": vocab_size, "pickle_path": pickle_path, "validate": True} trainer = POSTrainer(train_args) # Model From 7514be6f30cafe6e7e16a1477ad61019985796f0 Mon Sep 17 00:00:00 2001 From: FengZiYjun Date: Wed, 11 Jul 2018 21:51:35 +0800 Subject: [PATCH 7/7] - add validation loss into trainer.train - restructure: move reproduction outside - add evaluate in tester --- fastNLP/action/tester.py | 36 +++++++++++++----- fastNLP/action/trainer.py | 2 + fastNLP/reproduction/__init__.py | 0 .../CNN-sentence_classification/.gitignore | 0 .../CNN-sentence_classification/README.md | 0 .../CNN-sentence_classification/__init__.py | 0 .../CNN-sentence_classification/dataset.py | 0 .../CNN-sentence_classification/model.py | 0 .../rt-polaritydata/rt-polarity.neg | 0 .../rt-polaritydata/rt-polarity.pos | 0 .../CNN-sentence_classification/train.py | 0 .../Char-aware_NLM/LICENSE | 0 .../Char-aware_NLM/README.md | 0 .../Char-aware_NLM/__init__.py | 0 .../Char-aware_NLM/model.py | 0 .../Char-aware_NLM/test.py | 0 .../Char-aware_NLM/test.txt | 0 .../Char-aware_NLM/train.py | 0 .../Char-aware_NLM/train.txt | 0 .../Char-aware_NLM/utilities.py | 0 .../Char-aware_NLM/valid.txt | 0 .../HAN-document_classification/README.md | 0 .../HAN-document_classification/__init__.py | 0 .../data/test_samples.pkl | Bin .../data/train_samples.pkl | Bin .../data/yelp.word2vec | Bin .../HAN-document_classification/evaluate.py | 0 .../HAN-document_classification/model.py | 0 .../HAN-document_classification/preprocess.py | 0 .../HAN-document_classification/train.py | 0 30 files changed, 29 insertions(+), 9 deletions(-) delete mode 100644 fastNLP/reproduction/__init__.py rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/.gitignore (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/README.md (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/__init__.py (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/dataset.py (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/model.py (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg (100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos 
(100%) rename {fastNLP/reproduction => reproduction}/CNN-sentence_classification/train.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/LICENSE (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/README.md (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/__init__.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/model.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/test.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/test.txt (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/train.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/train.txt (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/utilities.py (100%) rename {fastNLP/reproduction => reproduction}/Char-aware_NLM/valid.txt (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/README.md (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/__init__.py (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/data/test_samples.pkl (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/data/train_samples.pkl (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/data/yelp.word2vec (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/evaluate.py (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/model.py (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/preprocess.py (100%) rename {fastNLP/reproduction => reproduction}/HAN-document_classification/train.py (100%) diff --git a/fastNLP/action/tester.py b/fastNLP/action/tester.py index 2a71cf4d..9d32ec40 100644 --- a/fastNLP/action/tester.py +++ b/fastNLP/action/tester.py @@ -1,5 +1,6 @@ import _pickle +import numpy as np import torch from fastNLP.action.action import Action @@ -16,8 +17,7 @@ class BaseTester(Action): """ super(BaseTester, self).__init__() self.validate_in_training = test_args["validate_in_training"] - self.valid_x = None - self.valid_y = None + self.save_dev_data = None self.save_output = test_args["save_output"] self.output = None self.save_loss = test_args["save_loss"] @@ -26,8 +26,14 @@ class BaseTester(Action): self.pickle_path = test_args["pickle_path"] self.iterator = None + self.model = None + self.eval_history = [] + def test(self, network): # print("--------------testing----------------") + self.model = network + + # turn on the testing mode; clean up the history self.mode(network, test=True) dev_data = self.prepare_input(self.pickle_path) @@ -35,7 +41,6 @@ class BaseTester(Action): self.iterator = iter(Batchifier(RandomSampler(dev_data), self.batch_size, drop_last=True)) batch_output = list() - eval_history = list() num_iter = len(dev_data) // self.batch_size for step in range(num_iter): @@ -47,11 +52,18 @@ class BaseTester(Action): if self.save_output: batch_output.append(prediction) if self.save_loss: - eval_history.append(eval_results) + self.eval_history.append(eval_results) def prepare_input(self, data_path): - data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb")) - return data_dev + """ + Save the dev data once it is loaded. Can return directly next time. + :param data_path: str, the path to the pickle data for dev + :return save_dev_data: list. Each entry is a sample, which is also a list of features and label(s). 
+ """ + if self.save_dev_data is None: + data_dev = _pickle.load(open(data_path + "/data_train.pkl", "rb")) + self.save_dev_data = data_dev + return self.save_dev_data def batchify(self, data): """ @@ -99,11 +111,12 @@ class BaseTester(Action): raise NotImplementedError def mode(self, model, test=True): - """To do: combine this function with Trainer""" + """To do: combine this function with Trainer ?? """ if test: model.eval() else: model.train() + self.eval_history.clear() class POSTester(BaseTester): @@ -115,6 +128,7 @@ class POSTester(BaseTester): super(POSTester, self).__init__(test_args) self.max_len = None self.mask = None + self.batch_result = None def data_forward(self, network, x): """To Do: combine with Trainer @@ -132,5 +146,9 @@ class POSTester(BaseTester): return y def evaluate(self, predict, truth): - """To Do: """ - return 0 + truth = torch.Tensor(truth) + loss, prediction = self.model.loss(predict, truth, self.mask, self.batch_size, self.max_len) + return loss.data + + def matrices(self): + return np.mean(self.eval_history) diff --git a/fastNLP/action/trainer.py b/fastNLP/action/trainer.py index 034b46ca..0ab9fee7 100644 --- a/fastNLP/action/trainer.py +++ b/fastNLP/action/trainer.py @@ -89,6 +89,7 @@ class BaseTrainer(Action): if data_dev is None: raise RuntimeError("No validation data provided.") validator.test(network) + print("[epoch {}] dev loss={:.2f}".format(epoch, validator.matrices())) # finish training @@ -386,6 +387,7 @@ class POSTrainer(BaseTrainer): else: self.define_loss() loss, prediction = self.loss_func(predict, truth, self.mask, self.batch_size, self.max_len) + # print("loss={:.2f}".format(loss.data)) return loss diff --git a/fastNLP/reproduction/__init__.py b/fastNLP/reproduction/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/fastNLP/reproduction/CNN-sentence_classification/.gitignore b/reproduction/CNN-sentence_classification/.gitignore similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/.gitignore rename to reproduction/CNN-sentence_classification/.gitignore diff --git a/fastNLP/reproduction/CNN-sentence_classification/README.md b/reproduction/CNN-sentence_classification/README.md similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/README.md rename to reproduction/CNN-sentence_classification/README.md diff --git a/fastNLP/reproduction/CNN-sentence_classification/__init__.py b/reproduction/CNN-sentence_classification/__init__.py similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/__init__.py rename to reproduction/CNN-sentence_classification/__init__.py diff --git a/fastNLP/reproduction/CNN-sentence_classification/dataset.py b/reproduction/CNN-sentence_classification/dataset.py similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/dataset.py rename to reproduction/CNN-sentence_classification/dataset.py diff --git a/fastNLP/reproduction/CNN-sentence_classification/model.py b/reproduction/CNN-sentence_classification/model.py similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/model.py rename to reproduction/CNN-sentence_classification/model.py diff --git a/fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg b/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg rename to 
reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.neg diff --git a/fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos b/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos rename to reproduction/CNN-sentence_classification/rt-polaritydata/rt-polarity.pos diff --git a/fastNLP/reproduction/CNN-sentence_classification/train.py b/reproduction/CNN-sentence_classification/train.py similarity index 100% rename from fastNLP/reproduction/CNN-sentence_classification/train.py rename to reproduction/CNN-sentence_classification/train.py diff --git a/fastNLP/reproduction/Char-aware_NLM/LICENSE b/reproduction/Char-aware_NLM/LICENSE similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/LICENSE rename to reproduction/Char-aware_NLM/LICENSE diff --git a/fastNLP/reproduction/Char-aware_NLM/README.md b/reproduction/Char-aware_NLM/README.md similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/README.md rename to reproduction/Char-aware_NLM/README.md diff --git a/fastNLP/reproduction/Char-aware_NLM/__init__.py b/reproduction/Char-aware_NLM/__init__.py similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/__init__.py rename to reproduction/Char-aware_NLM/__init__.py diff --git a/fastNLP/reproduction/Char-aware_NLM/model.py b/reproduction/Char-aware_NLM/model.py similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/model.py rename to reproduction/Char-aware_NLM/model.py diff --git a/fastNLP/reproduction/Char-aware_NLM/test.py b/reproduction/Char-aware_NLM/test.py similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/test.py rename to reproduction/Char-aware_NLM/test.py diff --git a/fastNLP/reproduction/Char-aware_NLM/test.txt b/reproduction/Char-aware_NLM/test.txt similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/test.txt rename to reproduction/Char-aware_NLM/test.txt diff --git a/fastNLP/reproduction/Char-aware_NLM/train.py b/reproduction/Char-aware_NLM/train.py similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/train.py rename to reproduction/Char-aware_NLM/train.py diff --git a/fastNLP/reproduction/Char-aware_NLM/train.txt b/reproduction/Char-aware_NLM/train.txt similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/train.txt rename to reproduction/Char-aware_NLM/train.txt diff --git a/fastNLP/reproduction/Char-aware_NLM/utilities.py b/reproduction/Char-aware_NLM/utilities.py similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/utilities.py rename to reproduction/Char-aware_NLM/utilities.py diff --git a/fastNLP/reproduction/Char-aware_NLM/valid.txt b/reproduction/Char-aware_NLM/valid.txt similarity index 100% rename from fastNLP/reproduction/Char-aware_NLM/valid.txt rename to reproduction/Char-aware_NLM/valid.txt diff --git a/fastNLP/reproduction/HAN-document_classification/README.md b/reproduction/HAN-document_classification/README.md similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/README.md rename to reproduction/HAN-document_classification/README.md diff --git a/fastNLP/reproduction/HAN-document_classification/__init__.py b/reproduction/HAN-document_classification/__init__.py similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/__init__.py rename to reproduction/HAN-document_classification/__init__.py diff --git 
a/fastNLP/reproduction/HAN-document_classification/data/test_samples.pkl b/reproduction/HAN-document_classification/data/test_samples.pkl similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/data/test_samples.pkl rename to reproduction/HAN-document_classification/data/test_samples.pkl diff --git a/fastNLP/reproduction/HAN-document_classification/data/train_samples.pkl b/reproduction/HAN-document_classification/data/train_samples.pkl similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/data/train_samples.pkl rename to reproduction/HAN-document_classification/data/train_samples.pkl diff --git a/fastNLP/reproduction/HAN-document_classification/data/yelp.word2vec b/reproduction/HAN-document_classification/data/yelp.word2vec similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/data/yelp.word2vec rename to reproduction/HAN-document_classification/data/yelp.word2vec diff --git a/fastNLP/reproduction/HAN-document_classification/evaluate.py b/reproduction/HAN-document_classification/evaluate.py similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/evaluate.py rename to reproduction/HAN-document_classification/evaluate.py diff --git a/fastNLP/reproduction/HAN-document_classification/model.py b/reproduction/HAN-document_classification/model.py similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/model.py rename to reproduction/HAN-document_classification/model.py diff --git a/fastNLP/reproduction/HAN-document_classification/preprocess.py b/reproduction/HAN-document_classification/preprocess.py similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/preprocess.py rename to reproduction/HAN-document_classification/preprocess.py diff --git a/fastNLP/reproduction/HAN-document_classification/train.py b/reproduction/HAN-document_classification/train.py similarity index 100% rename from fastNLP/reproduction/HAN-document_classification/train.py rename to reproduction/HAN-document_classification/train.py
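The seq_mask utility added in the first patch is what ties the trainer, the tester and the CRF loss together: POSTrainer.data_forward and POSTester.data_forward both build the mask from the raw sequence lengths before calling the model. A short standalone check of its behaviour, repeating the function exactly as it appears in fastNLP/modules/utils.py (the sample lengths below are illustrative):

    import torch

    def seq_mask(seq_len, max_len):
        # column i is 1 for every sequence whose length is at least i + 1
        mask = [torch.ge(torch.LongTensor(seq_len), i + 1) for i in range(max_len)]
        mask = torch.stack(mask, 1)
        return mask

    # two sequences of lengths 3 and 2, padded out to max_len = 3
    print(seq_mask([3, 2], 3))
    # tensor([[1, 1, 1],
    #         [1, 1, 0]], dtype=torch.uint8)

The resulting ByteTensor is what SeqLabeling.loss converts with mask.byte() before handing it to ContionalRandomField.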