@@ -63,7 +63,7 @@ class Inference(object):
        """
        Perform inference.
        :param network:
-       :param data: multi-level lists of strings
+       :param data: a two-level list of strings
        :return result: the model outputs
        """
        # transform strings into indices
@@ -97,7 +97,7 @@ class Inference(object):
    def prepare_input(self, data):
        """
-       Transform three-level list of strings into that of index.
+       Transform a two-level list of strings into one of indices.
        :param data:
            [
                [word_11, word_12, ...],
@@ -140,7 +140,7 @@ class SeqLabelInfer(Inference):
        mask = mask.byte().view(batch_size, max_len)
        y = network(x)
        prediction = network.prediction(y, mask)
-       return torch.Tensor(prediction, required_grad=False)
+       return torch.Tensor(prediction)

    def make_batch(self, iterator, data, use_cuda):
        return make_batch(iterator, data, use_cuda, output_length=True)
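The dropped keyword deserves a note: `required_grad` is a misspelling of `requires_grad`, and the legacy `torch.Tensor(...)` constructor accepts no such keyword in any case, so the original call raised a TypeError. A minimal sketch of a gradient-free conversion (the `prediction` value here is made up for illustration):

import torch

prediction = [[1, 2, 0], [3, 1, 0]]  # hypothetical decoder output: tag ids per token
with torch.no_grad():
    result = torch.tensor(prediction)
print(result.requires_grad)  # False -- factory tensors default to no gradient tracking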
@@ -37,10 +37,6 @@ class BaseTester(object): | |||
else: | |||
self.model = network | |||
# no backward setting for model | |||
for param in network.parameters(): | |||
param.requires_grad = False | |||
# turn on the testing mode; clean up the history | |||
self.mode(network, test=True) | |||
self.eval_history.clear() | |||
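For context, freezing parameters is unnecessary at test time when no backward pass is run; the idiom the deletion relies on is an explicit no-grad context. A sketch of that pattern (not fastNLP's actual control flow, just the standard replacement):

import torch

def evaluate(model, batches):
    model.eval()  # switch dropout / batch-norm to inference behavior
    outputs = []
    with torch.no_grad():  # no graph is built, so nothing needs requires_grad=False
        for x in batches:
            outputs.append(model(x))
    return outputs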
@@ -112,6 +108,7 @@ class SeqLabelTester(BaseTester): | |||
super(SeqLabelTester, self).__init__(test_args) | |||
self.max_len = None | |||
self.mask = None | |||
self.seq_len = None | |||
self.batch_result = None | |||
def data_forward(self, network, inputs): | |||
@@ -125,7 +122,7 @@ class SeqLabelTester(BaseTester):
        if torch.cuda.is_available() and self.use_cuda:
            mask = mask.cuda()
        self.mask = mask
+       self.seq_len = seq_len
        y = network(x)
        return y
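`data_forward` now also stashes `seq_len` so the tester's `evaluate` can ignore padded positions. The mask construction itself is elided above; one common way to build it from the lengths (an assumption, not necessarily fastNLP's exact code) is:

import torch

seq_len = torch.tensor([3, 5, 2])   # lengths of each sequence in the batch
max_len = int(seq_len.max())
# position j is valid for row i iff j < seq_len[i]
mask = torch.arange(max_len)[None, :] < seq_len[:, None]
print(mask.byte())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 1],
#         [1, 1, 0, 0, 0]], dtype=torch.uint8)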
@@ -56,3 +56,49 @@ class SeqLabeling(BaseModel):
        """
        tag_seq = self.Crf.viterbi_decode(x, mask)
        return tag_seq


class AdvSeqLabel(SeqLabeling):
    """
    Advanced Sequence Labeling Model
    """

    def __init__(self, args, emb=None):
        super(AdvSeqLabel, self).__init__(args)

        vocab_size = args["vocab_size"]
        word_emb_dim = args["word_emb_dim"]
        hidden_dim = args["rnn_hidden_units"]
        num_classes = args["num_classes"]

        self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
        self.Rnn = encoder.lstm.Lstm(word_emb_dim, hidden_dim, num_layers=3, dropout=0.3, bidirectional=True)
        self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
        self.relu = torch.nn.ReLU()
        self.drop = torch.nn.Dropout(0.3)
        self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)
        self.Crf = decoder.CRF.ConditionalRandomField(num_classes)

    def forward(self, x):
        """
        :param x: LongTensor, [batch_size, max_len]
        :return y: [batch_size, max_len, num_classes]
        """
        batch_size = x.size(0)
        max_len = x.size(1)
        x = self.Embedding(x)
        # [batch_size, max_len, word_emb_dim]
        x = self.Rnn(x)
        # [batch_size, max_len, hidden_size * direction]
        x = x.contiguous()
        x = x.view(batch_size * max_len, -1)
        x = self.Linear1(x)
        x = self.batch_norm(x)
        x = self.relu(x)
        x = self.drop(x)
        x = self.Linear2(x)
        x = x.view(batch_size, max_len, -1)
        # [batch_size, max_len, num_classes]
        return x
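A quick shape check of `AdvSeqLabel.forward`, with made-up hyperparameter values (assumes fastNLP and its `encoder`/`decoder` modules are importable):

import torch
from fastNLP.models.sequence_modeling import AdvSeqLabel

args = {"vocab_size": 100, "word_emb_dim": 50,
        "rnn_hidden_units": 100, "num_classes": 10}
model = AdvSeqLabel(args, emb=None)

x = torch.randint(0, 100, (4, 20)).long()  # [batch_size=4, max_len=20]
y = model(x)
print(y.size())  # torch.Size([4, 20, 10])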
@@ -123,6 +123,160 @@
张 S-q
) S-w
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w
中 B-nt
共 M-nt
中 M-nt
央 E-nt
总 B-n
书 M-n
记 E-n
、 S-w
国 B-n
家 E-n
主 B-n
席 E-n
江 B-nr
泽 M-nr
民 E-nr
( S-w
一 B-t
九 M-t
九 M-t
七 M-t
年 E-t
十 B-t
二 M-t
月 E-t
三 B-t
十 M-t
一 M-t
日 E-t
) S-w
1 B-t
2 M-t
月 E-t
3 B-t
1 M-t
日 E-t
, S-w
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w
迈 B-v
向 E-v
充 B-v
满 E-v
希 B-n
望 E-n
的 S-u
新 S-a
世 B-n
纪 E-n
— B-w
— E-w
一 B-t
九 M-t
九 M-t
八 M-t
年 E-t
新 B-t
年 E-t
讲 B-n
话 E-n
( S-w
附 S-v
图 B-n
片 E-n
1 S-m
张 S-q
) S-w
迈 B-v
向 E-v
充 B-v
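For readers unfamiliar with the appended corpus: each line pairs one character with a composite tag, a B/M/E/S position marker (begin/middle/end/single-character word) joined by a hyphen to a part-of-speech or entity label (e.g. `nr` person name, `nt` organization, `t` time word). The split that `process_tag` performs later is simply:

tag = "B-nt"
position, label = tag.split("-")  # "B": begins a multi-character word; "nt": organization name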
@@ -0,0 +1,137 @@
import _pickle
import os

import numpy as np
import torch

from fastNLP.core.tester import SeqLabelTester
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.preprocess import POSPreprocess
from fastNLP.models.sequence_modeling import AdvSeqLabel


class MyNERTrainer(SeqLabelTrainer):
    def __init__(self, train_args):
        super(MyNERTrainer, self).__init__(train_args)
        self.scheduler = None
        # assumption: the base trainer does not set this; without it, the first
        # call to best_eval_result would raise AttributeError
        self.best_accuracy = 0.0

    def define_optimizer(self):
        """
        Override: Adam plus a step decay that halves the learning rate every 3000 updates.
        :return:
        """
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=3000, gamma=0.5)

    def update(self):
        """
        Override: advance the scheduler together with the optimizer.
        :return:
        """
        self.optimizer.step()
        self.scheduler.step()

    def _create_validator(self, valid_args):
        return MyNERTester(valid_args)

    def best_eval_result(self, validator):
        accuracy = validator.metrics()
        if accuracy > self.best_accuracy:
            self.best_accuracy = accuracy
            return True
        else:
            return False


class MyNERTester(SeqLabelTester):
    def __init__(self, test_args):
        super(MyNERTester, self).__init__(test_args)

    def _evaluate(self, prediction, batch_y, seq_len):
        """
        Token-level accuracy over the unpadded positions.
        :param prediction: [batch_size, seq_len, num_classes]
        :param batch_y: [batch_size, seq_len]
        :param seq_len: [batch_size]
        :return:
        """
        summ = 0
        correct = 0
        _, indices = torch.max(prediction, 2)
        for p, y, l in zip(indices, batch_y, seq_len):
            summ += l
            correct += np.sum(p[:l].cpu().numpy() == y[:l].cpu().numpy())
        return float(correct / summ)

    def evaluate(self, predict, truth):
        return self._evaluate(predict, truth, self.seq_len)

    def metrics(self):
        return np.mean(self.eval_history)

    def show_matrices(self):
        return "dev accuracy={:.2f}".format(float(self.metrics()))
def embedding_process(emb_file, word_dict, emb_dim, emb_pkl):
    if os.path.exists(emb_pkl):
        # reuse the cached matrix if it was built before
        with open(emb_pkl, "rb") as f:
            embedding_np = _pickle.load(f)
        return embedding_np
    with open(emb_file, "r", encoding="utf-8") as f:
        embedding_np = np.random.uniform(-1, 1, size=(len(word_dict), emb_dim))
        for line in f:
            line = line.strip().split()
            if len(line) != emb_dim + 1:
                continue  # skip headers and malformed lines
            if line[0] in word_dict:
                embedding_np[word_dict[line[0]]] = [float(i) for i in line[1:]]
    with open(emb_pkl, "wb") as f:
        _pickle.dump(embedding_np, f)
    return embedding_np
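# embedding_process expects a plain-text format: one word per line followed by
# exactly emb_dim floats (any line of a different width, e.g. a header, is
# skipped). Words missing from the file keep their uniform(-1, 1) random
# initialization, and the finished matrix is cached to emb_pkl for reuse.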
def data_load(data_file):
    with open(data_file, "r", encoding="utf-8") as f:
        all_data = []
        sent = []
        label = []
        for line in f:
            line = line.strip().split()
            if len(line) > 1:  # a "char tag" pair
                sent.append(line[0])
                label.append(line[1])
            else:  # blank line: end of sentence
                all_data.append([sent, label])
                sent = []
                label = []
    return all_data
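# Worked example for data_load: a file containing
#     迈 B-v
#     向 E-v
#     (blank line)
# yields [[["迈", "向"], ["B-v", "E-v"]]]. Note that a sentence is only flushed
# when a blank line follows it, so the file should end with one.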
data_path = "data_for_tests/people.txt"
pick_path = "data_for_tests/"
emb_path = "data_for_tests/emb50.txt"
save_path = "data_for_tests/"

if __name__ == "__main__":
    data = data_load(data_path)
    p = POSPreprocess(data, pickle_path=pick_path, train_dev_split=0.3)
    # emb = embedding_process(emb_path, p.word2index, 50, os.path.join(pick_path, "embedding.pkl"))
    emb = None
    args = {"epochs": 20,
            "batch_size": 1,
            "pickle_path": pick_path,
            "validate": True,
            "save_best_dev": True,
            "model_saved_path": save_path,
            "use_cuda": True,
            "vocab_size": p.vocab_size,
            "num_classes": p.num_classes,
            "word_emb_dim": 50,
            "rnn_hidden_units": 100
            }
    # emb = torch.Tensor(emb).float().cuda()
    networks = AdvSeqLabel(args, emb)
    trainer = MyNERTrainer(args)
    trainer.train(network=networks)
    print("Training finished!")
@@ -0,0 +1,129 @@
import _pickle
import os

import torch

from fastNLP.core.inference import SeqLabelInfer
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.loader.model_loader import ModelLoader
from fastNLP.models.sequence_modeling import AdvSeqLabel


class Decode(SeqLabelTrainer):
    def __init__(self, args):
        super(Decode, self).__init__(args)

    def decoder(self, network, sents, model_path):
        self.model = network
        self.model.load_state_dict(torch.load(model_path))
        out_put = []
        self.mode(network, test=True)
        for batch_x in sents:
            prediction = self.data_forward(self.model, batch_x)
            seq_tag = self.model.prediction(prediction, batch_x[1])  # batch_x[1] holds the sequence lengths
            out_put.append(list(seq_tag)[0])
        return out_put


def process_sent(sents, word2id):
    sents_num = []
    for s in sents:
        sent_num = []
        for c in s:
            if c in word2id:
                sent_num.append(word2id[c])
            else:
                sent_num.append(word2id["<unk>"])
        sents_num.append(([sent_num], [len(sent_num)]))  # batch_size is 1
    return sents_num
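# Worked example for process_sent: with a hypothetical vocabulary
# word2id = {"中": 5, "共": 8, "<unk>": 0}, process_sent(["中共"], word2id)
# returns [([[5, 8]], [2])] -- each sentence becomes a one-sentence batch of
# character ids plus its length.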
def process_tag(sents, tags, id2class):
    Tags = []
    for ttt in tags:
        Tags.append([id2class[t] for t in ttt])

    Segs = []
    PosNers = []
    for sent, tag in zip(sents, tags):
        word__ = []
        lll__ = []
        word_1 = ""  # buffer for the word being assembled (also guards against UnboundLocalError on malformed tag sequences)
        for c, t in zip(sent, tag):
            t = id2class[t]
            l = t.split("-")
            split_ = l[0]  # position: B/M/E/S
            pn = l[1]      # part-of-speech / entity label
            if split_ == "S":    # single-character word
                word__.append(c)
                lll__.append(pn)
                word_1 = ""
            elif split_ == "E":  # last character: flush the buffered word
                word_1 += c
                word__.append(word_1)
                lll__.append(pn)
                word_1 = ""
            elif split_ == "B":  # first character: restart the buffer
                word_1 = ""
                word_1 += c
            else:                # M: middle character
                word_1 += c
        Segs.append(word__)
        PosNers.append(lll__)
    return Segs, PosNers
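# Worked example for process_tag: if "江泽民" carries tags that decode to
# ["B-nr", "M-nr", "E-nr"], the loop buffers "江", then "江泽", and on the "E"
# tag flushes the full word, giving Segs = [["江泽民"]] and PosNers = [["nr"]].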
pickle_path = "data_for_tests/"
model_path = "data_for_tests/model_best_dev.pkl"

if __name__ == "__main__":
    with open(os.path.join(pickle_path, "id2word.pkl"), "rb") as f:
        id2word = _pickle.load(f)
    with open(os.path.join(pickle_path, "word2id.pkl"), "rb") as f:
        word2id = _pickle.load(f)
    with open(os.path.join(pickle_path, "id2class.pkl"), "rb") as f:
        id2class = _pickle.load(f)

    sent = ["中共中央总书记、国家主席江泽民",
            "逆向处理输入序列并返回逆序后的序列"]  # here is the input

    args = {"epochs": 1,
            "batch_size": 1,
            "pickle_path": "data_for_tests/",
            "validate": True,
            "save_best_dev": True,
            "model_saved_path": "data_for_tests/",
            "use_cuda": False,
            "vocab_size": len(word2id),
            "num_classes": len(id2class),
            "word_emb_dim": 50,
            "rnn_hidden_units": 100,
            }

    """
    network = AdvSeqLabel(args, None)
    decoder_ = Decode(args)
    tags_num = decoder_.decoder(network, process_sent(sent, word2id), model_path=model_path)
    output_seg, output_pn = process_tag(sent, tags_num, id2class)  # here is the output
    print(output_seg)
    print(output_pn)
    """

    # Define the same model
    model = AdvSeqLabel(args, None)

    # Load trained parameters into the model
    ModelLoader.load_pytorch(model, "./data_for_tests/model_best_dev.pkl")
    print("model loaded!")

    # Inference interface
    infer = SeqLabelInfer(pickle_path)
    sent = [[ch for ch in s] for s in sent]
    results = infer.predict(model, sent)

    for res in results:
        print(res)
    print("Inference finished!")