@@ -1,11 +1,7 @@
 import torch
 
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
-from fastNLP.core.predictor import Predictor
 from fastNLP.api.model_zoo import load_url
 
 model_urls = {
     'cws': "",
@@ -48,23 +44,13 @@ class POS_tagger(API):
         for example in query:
             data.append(Instance(words=example))
 
-        data = self.pipeline(data)
-        predictor = Predictor()
-        outputs = predictor.predict(self.model, data)
-
-        answers = []
-        for out in outputs:
-            out = out.numpy()
-            for sent in out:
-                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
-        return answers
+        out = self.pipeline(data)
+        return [x["outputs"] for x in out]
 
     def load(self, name):
         _dict = torch.load(name)
         self.pipeline = _dict['pipeline']
-        self.model = _dict['model']
-        self.tag_vocab = _dict["tag_vocab"]
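For context, a minimal sketch of how the rewritten `predict` is meant to be driven end to end. It assumes a `model_pp.pkl` produced by the training script further down in this diff; the query tokens are illustrative:

```python
import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

_dict = torch.load("model_pp.pkl")
pipeline = _dict["pipeline"]  # the whole indexer -> model -> decoder chain

data = DataSet()
for example in [["我", "爱", "北京"]]:  # one token list per sentence
    data.append(Instance(words=example))

out = pipeline(data)
tags = [x["outputs"] for x in out]  # one list of tag strings per sentence
```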
@@ -38,18 +38,18 @@ class SeqLabelEvaluator(Evaluator):
     def __call__(self, predict, truth):
         """
-        :param predict: list of List, the network outputs from all batches.
+        :param predict: list of dict, the network outputs from all batches.
         :param truth: list of dict, the ground truths from all batch_y.
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
+        predict = [item["predict"] for item in predict]
         total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.tensor(x)
+            # x = torch.tensor(x)
             y = y.to(x)  # make sure they are on the same device
             mask = x.ge(1).long()
-            correct = torch.sum(x * mask == y * mask)
-            correct -= torch.sum(x.le(0))
+            correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
             total_correct += float(correct)
             total_count += float(torch.sum(mask))
         accuracy = total_correct / total_count
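A worked toy example of the masked-accuracy arithmetic above may help; it assumes, as `x.ge(1)` implies, that tag id 0 is padding:

```python
import torch

x = torch.tensor([3, 2, 0, 0])  # predictions; trailing zeros are padding
y = torch.tensor([3, 1, 0, 0])  # gold tags

mask = x.ge(1).long()                      # [1, 1, 0, 0]: real tokens only
correct = torch.sum(x * mask == y * mask)  # 3 = one true match + two masked-out slots
correct -= torch.sum(x.le(0))              # remove the two masked-out slots -> 1
accuracy = float(correct) / float(torch.sum(mask))  # 1 / 2 = 0.5
```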
@@ -9,7 +9,7 @@ from fastNLP.core.batch import Batch
 from fastNLP.core.loss import Loss
 from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
-from fastNLP.core.sampler import RandomSampler
+from fastNLP.core.sampler import BucketSampler
 from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
 from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
@@ -144,7 +144,8 @@ class Trainer(object):
             logger.info("training epoch {}".format(epoch))
 
             # prepare mini-batch iterator
-            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
+            data_iterator = Batch(train_data, batch_size=self.batch_size,
+                                  sampler=BucketSampler(10, self.batch_size, "word_seq_origin_len"),
                                   use_cuda=self.use_cuda)
             logger.info("prepared data iterator")
@@ -170,15 +171,19 @@ class Trainer(object):
             for batch_x, batch_y in data_iterator:
 
                 prediction = self.data_forward(network, batch_x)
-                loss = self.get_loss(prediction, batch_y)
+                # TODO: refactor self.get_loss
+                loss = prediction["loss"] if "loss" in prediction else self.get_loss(prediction, batch_y)
+                # acc = self._evaluator([{"predict": prediction["predict"]}], [{"truth": batch_x["truth"]}])
                 self.grad_backward(loss)
                 self.update()
                 self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
 
                 for name, param in self._model.named_parameters():
                     if param.requires_grad:
-                        self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
-                        self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
-                        self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                        pass
 
                 if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                     end = time.time()
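The new loss line lets two model conventions coexist: models whose `forward` already computes the loss return it under a `"loss"` key (see `SeqLabeling.forward` below), while the trainer still falls back to `get_loss` otherwise. A hypothetical helper just to make the branch explicit (`pick_loss` and the values are illustrative, not fastNLP API):

```python
import torch

def pick_loss(prediction, fallback_loss):
    """Mirror of the trainer's new branch: prefer a loss computed in forward()."""
    return prediction["loss"] if "loss" in prediction else fallback_loss(prediction)

new_style = {"predict": torch.tensor([1, 2]), "loss": torch.tensor(0.7)}
old_style = {"predict": torch.tensor([1, 2])}

print(pick_loss(new_style, lambda p: torch.tensor(1.3)))  # tensor(0.7000): forward()'s own loss
print(pick_loss(old_style, lambda p: torch.tensor(1.3)))  # tensor(1.3000): falls back
```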
@@ -361,10 +361,11 @@ class PeopleDailyCorpusLoader(DataSetLoader):
         pos_tag_examples = []
         ner_examples = []
         for sent in sents:
+            if len(sent) <= 2:
+                continue
             inside_ne = False
             sent_pos_tag = []
             sent_words = []
-            sent_word = []
             sent_ner = []
             words = sent.strip().split()[1:]
             for word in words:
@@ -389,23 +390,10 @@ class PeopleDailyCorpusLoader(DataSetLoader):
                         ner_tag = "O"
                 tmp = word.split("/")
                 token, pos = tmp[0], tmp[1]
-                pos_tag = []
-                for single_token in token:
-                    if len(token) == 1:
-                        single_pos = "S-" + pos
-                    else:
-                        single_pos = "M-" + pos
-                    pos_tag.append(single_pos)
-                    sent_word.append(single_token)
-                if len(token) > 1:
-                    pos_tag[0] = "B-" + pos
-                    pos_tag[-1] = "E-" + pos
-                sent_pos_tag += pos_tag
                 sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
-            pos_tag_examples.append([sent_word, sent_pos_tag])
+            pos_tag_examples.append([sent_words, sent_pos_tag])
             ner_examples.append([sent_words, sent_ner])
         # List[List[List[str], List[str]]]
         return pos_tag_examples, ner_examples
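To make the loader change concrete, here is a made-up line in the People's Daily format (the leading sentence id is dropped by `split()[1:]`). The new code emits one word-level tag per token, where the old code emitted BMES-decorated tags per character:

```python
sent = "19980101-01-001-001/m  迈向/v  [新/a  世纪/n]nt"
# new loader output for this sentence:
#   sent_words   = ["迈向", "新", "世纪"]
#   sent_pos_tag = ["v", "a", "n"]
#   sent_ner     = ["O", "B", "L"]   # "[" opens an entity, "]nt" closes it
# old loader output (character level):
#   sent_word    = ["迈", "向", "新", "世", "纪"]
#   sent_pos_tag = ["B-v", "E-v", "S-a", "B-n", "E-n"]
```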
@@ -14,5 +14,5 @@ class BaseModel(torch.nn.Module):
         trainer = Trainer(**train_args)
         trainer.train(self, train_data, dev_data)
 
-    def predict(self):
-        pass
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError
@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 
 from fastNLP.models.base_model import BaseModel
@@ -55,10 +56,8 @@ class SeqLabeling(BaseModel):
         # [batch_size, max_len, hidden_size * direction]
         x = self.Linear(x)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
 
     def loss(self, x, y):
         """ Since the loss has been computed in forward(), this function simply returns x."""
@@ -116,7 +115,7 @@ class AdvSeqLabel(SeqLabeling):
         num_classes = args["num_classes"]
 
         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
-        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
+        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=1, dropout=0.5, bidirectional=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
         self.relu = torch.nn.ReLU()
@@ -128,32 +127,56 @@ class AdvSeqLabel(SeqLabeling):
     def forward(self, word_seq, word_seq_origin_len, truth=None):
         """
         :param word_seq: LongTensor, [batch_size, max_len]
-        :param word_seq_origin_len: list of int.
+        :param word_seq_origin_len: LongTensor, [batch_size, ]
         :param truth: LongTensor, [batch_size, max_len]
         :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                    If truth is not None, return loss, a scalar. Used in training.
         """
         word_seq = word_seq.long()
-        word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
+        word_seq_origin_len = word_seq_origin_len.cpu().numpy()
+        sent_len, idx_sort = np.sort(word_seq_origin_len)[::-1], np.argsort(-word_seq_origin_len)
+        idx_unsort = np.argsort(idx_sort)
+        idx_sort = torch.from_numpy(idx_sort)
+        idx_unsort = torch.from_numpy(idx_unsort)
+        # word_seq_origin_len = word_seq_origin_len.long()
+        truth = truth.long() if truth is not None else None
 
         batch_size = word_seq.size(0)
         max_len = word_seq.size(1)
         if next(self.parameters()).is_cuda:
             word_seq = word_seq.cuda()
+            idx_sort = idx_sort.cuda()
+            idx_unsort = idx_unsort.cuda()
             self.mask = self.mask.cuda()
             truth = truth.cuda() if truth is not None else None
 
         x = self.Embedding(word_seq)
         # [batch_size, max_len, word_emb_dim]
-        x = self.Rnn(x)
+        sent_variable = x.index_select(0, idx_sort)
+        sent_packed = torch.nn.utils.rnn.pack_padded_sequence(sent_variable, sent_len, batch_first=True)
+        x = self.Rnn(sent_packed)
         # [batch_size, max_len, hidden_size * direction]
+        sent_output = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)[0]
+        x = sent_output.index_select(0, idx_unsort)
+        x = x.contiguous()
         x = x.view(batch_size * max_len, -1)
         x = self.Linear1(x)
-        x = self.batch_norm(x)
+        # x = self.batch_norm(x)
         x = self.relu(x)
         x = self.drop(x)
         x = self.Linear2(x)
         x = x.view(batch_size, max_len, -1)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
+
+    def predict(self, **x):
+        out = self.forward(**x)
+        return {"predict": out["predict"]}
@@ -1,6 +1,6 @@
 [train]
-epochs = 5
-batch_size = 64
+epochs = 300
+batch_size = 32
 pickle_path = "./save/"
 validate = false
 save_best_dev = true
@@ -1,11 +1,14 @@
 import copy
 import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+print(sys.path)
 
 import torch
 
-from fastNLP.api.pipeline import Pipeline
-from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor
 from fastNLP.core.instance import Instance
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
@@ -14,11 +17,12 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
 cfgfile = './pos_tag.cfg'
-# datadir = "/home/zyfeng/data/"
-# data_name = "POS_PD_1998.txt"
-datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
-data_name = "people_daily_raw.txt"
+datadir = "/home/zyfeng/data/"
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+# datadir = "/home/zyfeng/env/fastnlp_v_2/test/data_for_tests"
+# data_name = "people_daily_raw.txt"
 
 pos_tag_data_path = os.path.join(datadir, data_name)
@@ -58,6 +62,7 @@ def train():
     tag_indexer(dataset)
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)
+    #torch.save(dataset, "data_set.pkl")
 
     dev_set = copy.deepcopy(dataset)
     dev_set.set_is_target(truth=True)
@@ -75,14 +80,21 @@ def train():
     trainer = Trainer(epochs=train_param["epochs"],
                       batch_size=train_param["batch_size"],
                       validate=True,
-                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
-                      evaluator=SeqLabelEvaluator()
+                      optimizer=Optimizer("Adam", lr=0.01, weight_decay=0.9),
+                      evaluator=SeqLabelEvaluator(),
+                      use_cuda=True
                       )
 
     trainer.train(model, dataset, dev_set)
 
+    model_proc = ModelProcessor(model, "word_seq_origin_len")
+    dataset.set_is_target(truth=True)
+    res = model_proc.process(dataset)
+
+    decoder = Index2WordProcessor(tag_vocab_proc.get_vocab(), "predict", "outputs")
 
     # save model & pipeline
-    pp = Pipeline([word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
+    pp = Pipeline([word_indexer, seq_len_proc, model_proc, decoder])
+    save_dict = {"pipeline": pp}
     torch.save(save_dict, "model_pp.pkl")
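With `ModelProcessor` and `Index2WordProcessor` folded in, the pickle now holds a single self-contained pipeline, which is why the updated `POS_tagger.load` above only reads `"pipeline"`. Roughly, each stage adds one field to the DataSet (the `word_indexer` wiring is an assumption; the other field names come from this diff):

```python
pp = torch.load("model_pp.pkl")["pipeline"]
# word_indexer : "words"          -> "word_seq" (token ids)
# seq_len_proc : "word_seq"       -> "word_seq_origin_len"
# model_proc   : runs AdvSeqLabel -> "predict" (tag ids)
# decoder      : "predict"        -> "outputs" (tag strings)
processed = pp(dataset)  # dataset: a DataSet of Instance(words=[...]) as in the API above
```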
@@ -1,22 +1,22 @@
 import os
 
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.preprocess import save_pickle
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 from fastNLP.saver.model_saver import ModelSaver
 
 pickle_path = "./seq_label/"
 model_name = "seq_label_model.pkl"
-config_dir = "test/data_for_tests/config"
-data_path = "test/data_for_tests/people.txt"
-data_infer_path = "test/data_for_tests/people_infer.txt"
+config_dir = "../data_for_tests/config"
+data_path = "../data_for_tests/people.txt"
+data_infer_path = "../data_for_tests/people_infer.txt"
 
 
 def test_training():
@@ -84,3 +84,7 @@ def test_training():
     # Start testing with validation data
     data_dev.set_target(truth=True)
     tester.test(model, data_dev)
+
+
+if __name__ == "__main__":
+    test_training()