diff --git a/fastNLP/core/inference.py b/fastNLP/core/inference.py
index 11a3ba48..c1085554 100644
--- a/fastNLP/core/inference.py
+++ b/fastNLP/core/inference.py
@@ -63,7 +63,7 @@ class Inference(object):
         """
         Perform inference.
         :param network:
-        :param data: multi-level lists of strings
+        :param data: two-level lists of strings
         :return result: the model outputs
         """
         # transform strings into indices
@@ -97,7 +97,7 @@ class Inference(object):

     def prepare_input(self, data):
         """
-        Transform three-level list of strings into that of index.
+        Transform a two-level list of strings into a two-level list of indices.
         :param data:
             [
                 [word_11, word_12, ...],
@@ -140,7 +140,7 @@ class SeqLabelInfer(Inference):
         mask = mask.byte().view(batch_size, max_len)
         y = network(x)
         prediction = network.prediction(y, mask)
-        return torch.Tensor(prediction, required_grad=False)
+        return torch.Tensor(prediction)

     def make_batch(self, iterator, data, use_cuda):
         return make_batch(iterator, data, use_cuda, output_length=True)
diff --git a/fastNLP/core/tester.py b/fastNLP/core/tester.py
index 425f2029..3799eed1 100644
--- a/fastNLP/core/tester.py
+++ b/fastNLP/core/tester.py
@@ -37,10 +37,6 @@ class BaseTester(object):
         else:
             self.model = network

-        # no backward setting for model
-        for param in network.parameters():
-            param.requires_grad = False
-
         # turn on the testing mode; clean up the history
         self.mode(network, test=True)
         self.eval_history.clear()
@@ -112,6 +108,7 @@ class SeqLabelTester(BaseTester):
         super(SeqLabelTester, self).__init__(test_args)
         self.max_len = None
         self.mask = None
+        self.seq_len = None
         self.batch_result = None

     def data_forward(self, network, inputs):
@@ -125,7 +122,7 @@ class SeqLabelTester(BaseTester):
         if torch.cuda.is_available() and self.use_cuda:
             mask = mask.cuda()
         self.mask = mask
-
+        self.seq_len = seq_len
         y = network(x)
         return y

diff --git a/fastNLP/models/sequence_modeling.py b/fastNLP/models/sequence_modeling.py
index b28ef604..5addc73e 100644
--- a/fastNLP/models/sequence_modeling.py
+++ b/fastNLP/models/sequence_modeling.py
@@ -56,3 +56,49 @@ class SeqLabeling(BaseModel):
         """
         tag_seq = self.Crf.viterbi_decode(x, mask)
         return tag_seq
+
+
+class AdvSeqLabel(SeqLabeling):
+    """
+    Advanced sequence labeling model: 3-layer bidirectional LSTM + batch-normalized MLP + CRF.
+    """
+
+    def __init__(self, args, emb=None):
+        super(AdvSeqLabel, self).__init__(args)
+
+        vocab_size = args["vocab_size"]
+        word_emb_dim = args["word_emb_dim"]
+        hidden_dim = args["rnn_hidden_units"]
+        num_classes = args["num_classes"]
+
+        self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
+        self.Rnn = encoder.lstm.Lstm(word_emb_dim, hidden_dim, num_layers=3, dropout=0.3, bidirectional=True)
+        self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
+        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
+        self.relu = torch.nn.ReLU()
+        self.drop = torch.nn.Dropout(0.3)
+        self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)
+
+        self.Crf = decoder.CRF.ConditionalRandomField(num_classes)
+
+    def forward(self, x):
+        """
+        :param x: LongTensor, [batch_size, max_len]
+        :return y: [batch_size, max_len, tag_size]
+        """
+        batch_size = x.size(0)
+        max_len = x.size(1)
+        x = self.Embedding(x)
+        # [batch_size, max_len, word_emb_dim]
+        x = self.Rnn(x)
+        # [batch_size, max_len, hidden_dim * 2]
+        x = x.contiguous()
+        x = x.view(batch_size * max_len, -1)
+        x = self.Linear1(x)
+        x = self.batch_norm(x)
+        x = self.relu(x)
+        x = self.drop(x)
+        x = self.Linear2(x)
+        x = x.view(batch_size, max_len, -1)
+        # [batch_size, max_len, num_classes]
+        return x
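A note on the new AdvSeqLabel: the LSTM output is flattened to two dimensions because torch.nn.BatchNorm1d normalizes over [N, C], then reshaped back to [batch, len, classes] before the CRF. Below is a minimal plain-PyTorch sketch of that shape flow, with sizes invented for illustration; fastNLP's encoder/decoder wrappers, dropout, and the CRF itself are omitted.

import torch
import torch.nn as nn

# Arbitrary example sizes; the real values come from the args dict.
vocab_size, emb_dim, hidden_dim, num_classes = 100, 50, 100, 10

embed = nn.Embedding(vocab_size, emb_dim)
lstm = nn.LSTM(emb_dim, hidden_dim, num_layers=3, dropout=0.3,
               bidirectional=True, batch_first=True)
linear1 = nn.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
bn = nn.BatchNorm1d(hidden_dim * 2 // 3)   # expects [N, C], hence the flatten below
linear2 = nn.Linear(hidden_dim * 2 // 3, num_classes)

x = torch.randint(0, vocab_size, (4, 7))   # [batch_size=4, max_len=7]
h, _ = lstm(embed(x))                      # [4, 7, hidden_dim * 2]
flat = h.contiguous().view(4 * 7, -1)      # [batch * len, features] for BatchNorm1d
scores = linear2(torch.relu(bn(linear1(flat)))).view(4, 7, -1)
print(scores.shape)                        # torch.Size([4, 7, 10])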
diff --git a/test/data_for_tests/people.txt b/test/data_for_tests/people.txt
index e4909679..9ef0de6d 100644
--- a/test/data_for_tests/people.txt
+++ b/test/data_for_tests/people.txt
@@ -123,6 +123,160 @@
 张 S-q
 ) S-w

+迈 B-v
+向 E-v
+充 B-v
+满 E-v
+希 B-n
+望 E-n
+的 S-u
+新 S-a
+世 B-n
+纪 E-n
+— B-w
+— E-w
+一 B-t
+九 M-t
+九 M-t
+八 M-t
+年 E-t
+新 B-t
+年 E-t
+讲 B-n
+话 E-n
+( S-w
+附 S-v
+图 B-n
+片 E-n
+1 S-m
+张 S-q
+) S-w
+
+迈 B-v
+向 E-v
+充 B-v
+满 E-v
+希 B-n
+望 E-n
+的 S-u
+新 S-a
+世 B-n
+纪 E-n
+— B-w
+— E-w
+一 B-t
+九 M-t
+九 M-t
+八 M-t
+年 E-t
+新 B-t
+年 E-t
+讲 B-n
+话 E-n
+( S-w
+附 S-v
+图 B-n
+片 E-n
+1 S-m
+张 S-q
+) S-w
+
+中 B-nt
+共 M-nt
+中 M-nt
+央 E-nt
+总 B-n
+书 M-n
+记 E-n
+、 S-w
+国 B-n
+家 E-n
+主 B-n
+席 E-n
+江 B-nr
+泽 M-nr
+民 E-nr
+
+( S-w
+一 B-t
+九 M-t
+九 M-t
+七 M-t
+年 E-t
+十 B-t
+二 M-t
+月 E-t
+三 B-t
+十 M-t
+一 M-t
+日 E-t
+) S-w
+
+1 B-t
+2 M-t
+月 E-t
+3 B-t
+1 M-t
+日 E-t
+, S-w
+迈 B-v
+向 E-v
+充 B-v
+满 E-v
+希 B-n
+望 E-n
+的 S-u
+新 S-a
+世 B-n
+纪 E-n
+— B-w
+— E-w
+一 B-t
+九 M-t
+九 M-t
+八 M-t
+年 E-t
+新 B-t
+年 E-t
+讲 B-n
+话 E-n
+( S-w
+附 S-v
+图 B-n
+片 E-n
+1 S-m
+张 S-q
+) S-w
+
+迈 B-v
+向 E-v
+充 B-v
+满 E-v
+希 B-n
+望 E-n
+的 S-u
+新 S-a
+世 B-n
+纪 E-n
+— B-w
+— E-w
+一 B-t
+九 M-t
+九 M-t
+八 M-t
+年 E-t
+新 B-t
+年 E-t
+讲 B-n
+话 E-n
+( S-w
+附 S-v
+图 B-n
+片 E-n
+1 S-m
+张 S-q
+) S-w
+
 迈 B-v
 向 E-v
 充 B-v
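The added training data uses one character per line in the form "char BMES-prefix-POS-tag", with a blank line closing each sentence; the repeated paragraphs simply enlarge the toy corpus. The data_load function in test/ner.py below parses this format; for reference, a self-contained sketch under the same assumptions (read_bmes is a hypothetical helper, not part of this diff):

def read_bmes(path):
    """Parse 'char tag' lines; blank lines separate sentences."""
    sentences, chars, tags = [], [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                chars.append(parts[0])
                tags.append(parts[1])
            elif chars:  # blank line: close the current sentence
                sentences.append((chars, tags))
                chars, tags = [], []
    if chars:  # the file may not end with a blank line
        sentences.append((chars, tags))
    return sentences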
diff --git a/test/ner.py b/test/ner.py
new file mode 100644
index 00000000..beaac1d6
--- /dev/null
+++ b/test/ner.py
@@ -0,0 +1,137 @@
+import _pickle
+import os
+
+import numpy as np
+import torch
+
+from fastNLP.core.tester import SeqLabelTester
+from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.loader.preprocess import POSPreprocess
+from fastNLP.models.sequence_modeling import AdvSeqLabel
+
+
+class MyNERTrainer(SeqLabelTrainer):
+    def __init__(self, train_args):
+        super(MyNERTrainer, self).__init__(train_args)
+        self.scheduler = None
+
+    def define_optimizer(self):
+        """
+        Override: Adam, plus a StepLR schedule that halves the learning rate every 3000 steps.
+        """
+        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
+        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)
+
+    def update(self):
+        """
+        Override: step the optimizer, then the scheduler.
+        """
+        self.optimizer.step()
+        self.scheduler.step()
+
+    def _create_validator(self, valid_args):
+        return MyNERTester(valid_args)
+
+    def best_eval_result(self, validator):
+        accuracy = validator.metrics()
+        if accuracy > self.best_accuracy:
+            self.best_accuracy = accuracy
+            return True
+        else:
+            return False
+
+
+class MyNERTester(SeqLabelTester):
+    def __init__(self, test_args):
+        super(MyNERTester, self).__init__(test_args)
+
+    def _evaluate(self, prediction, batch_y, seq_len):
+        """
+        Compute token accuracy over the unpadded positions only.
+        :param prediction: [batch_size, seq_len, num_classes]
+        :param batch_y: [batch_size, seq_len]
+        :param seq_len: [batch_size]
+        :return: accuracy, a float
+        """
+        summ = 0
+        correct = 0
+        _, indices = torch.max(prediction, 2)
+        for p, y, l in zip(indices, batch_y, seq_len):
+            summ += l
+            correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
+        return float(correct / summ)
+
+    def evaluate(self, predict, truth):
+        return self._evaluate(predict, truth, self.seq_len)
+
+    def metrics(self):
+        return np.mean(self.eval_history)
+
+    def show_matrices(self):  # (sic) name kept to match the method it overrides in BaseTester
+        return "dev accuracy={:.2f}".format(float(self.metrics()))
+
+
+def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
+    """Load pretrained embeddings for the words in word_dict; cache the matrix as a pickle."""
+    if os.path.exists(emb_pkl):
+        with open(emb_pkl, "rb") as f:
+            embedding_np = _pickle.load(f)
+        return embedding_np
+    with open(emb_file, "r", encoding="utf-8") as f:
+        embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
+        for line in f:
+            line = line.strip().split()
+            if len(line) != emb_dim + 1:
+                continue
+            if line[0] in word_dict:
+                embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
+    with open(emb_pkl, "wb") as f:
+        _pickle.dump(embedding_np, f)
+    return embedding_np
+
+
+def data_load(data_file):
+    """Read character-per-line BMES data; blank lines separate sentences."""
+    with open(data_file, "r", encoding="utf-8") as f:
+        all_data = []
+        sent = []
+        label = []
+        for line in f:
+            line = line.strip().split()
+            if len(line) > 1:
+                sent.append(line[0])
+                label.append(line[1])
+            elif sent:  # blank line marks the end of a sentence
+                all_data.append([sent, label])
+                sent = []
+                label = []
+    if sent:  # keep the last sentence if the file does not end with a blank line
+        all_data.append([sent, label])
+    return all_data
+
+
+data_path = "data_for_tests/people.txt"
+pick_path = "data_for_tests/"
+emb_path = "data_for_tests/emb50.txt"
+save_path = "data_for_tests/"
+
+if __name__ == "__main__":
+    data = data_load(data_path)
+    p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3)
+    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
+    emb = None
+    args = {"epochs": 20,
+            "batch_size": 1,
+            "pickle_path": pick_path,
+            "validate": True,
+            "save_best_dev": True,
+            "model_saved_path": save_path,
+            "use_cuda": True,
+
+            "vocab_size": p.vocab_size,
+            "num_classes": p.num_classes,
+            "word_emb_dim": 50,
+            "rnn_hidden_units": 100
+            }
+    # emb = torch.Tensor(emb).float().cuda()
+    network = AdvSeqLabel(args, emb)
+    trainer = MyNERTrainer(args)
+    trainer.train(network=network)
+    print("Training finished!")
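MyNERTester._evaluate above counts accuracy only over the first seq_len positions of each sequence, so padded positions never inflate the score. A standalone check of that masking logic, with tensors invented for illustration:

import numpy as np
import torch

prediction = torch.randn(2, 4, 3)          # [batch, seq_len, num_classes], random scores
batch_y = torch.randint(0, 3, (2, 4))      # made-up gold tags
seq_len = [4, 2]                           # second sequence has 2 padded positions

_, indices = torch.max(prediction, 2)      # predicted tag ids, [batch, seq_len]
summ, correct = 0, 0
for p, y, l in zip(indices, batch_y, seq_len):
    summ += l                              # count only real tokens
    correct += np.sum(p[:l].numpy() == y[:l].numpy())
print(correct / summ)                      # accuracy over the 6 unpadded positions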
"id2class.pkl"), "rb") as f: + id2class = _pickle.load(f) + + sent = ["中共中央总书记、国家主席江泽民", + "逆向处理输入序列并返回逆序后的序列"] # here is input + + args = {"epochs": 1, + "batch_size": 1, + "pickle_path": "data_for_tests/", + "validate": True, + "save_best_dev": True, + "model_saved_path": "data_for_tests/", + "use_cuda": False, + + "vocab_size": len(word2id), + "num_classes": len(id2class), + "word_emb_dim": 50, + "rnn_hidden_units": 100, + } + """ + network = AdvSeqLabel(args, None) + decoder_ = Decode(args) + tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path) + output_seg, output_pn = process_tag(sent, tags_num, id2class) # here is output + print(output_seg) + print(output_pn) + """ + # Define the same model + model = AdvSeqLabel(args, None) + + # Dump trained parameters into the model + ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl") + print("model loaded!") + + # Inference interface + infer = SeqLabelInfer(pickle_path) + sent = [[ch for ch in s] for s in sent] + results = infer.predict(model, sent) + + for res in results: + print(res) + print("Inference finished!")