
Save POS tagging script

tags/v0.2.0
FengZiYjun (yunfan) committed 5 years ago
commit 4be15a5b43
9 changed files with 93 additions and 75 deletions
  1. fastNLP/api/api.py (+2 / -16)
  2. fastNLP/core/metrics.py (+4 / -4)
  3. fastNLP/core/trainer.py (+11 / -6)
  4. fastNLP/loader/dataset_loader.py (+4 / -16)
  5. fastNLP/models/base_model.py (+2 / -2)
  6. fastNLP/models/sequence_modeling.py (+37 / -14)
  7. reproduction/pos_tag_model/pos_tag.cfg (+2 / -2)
  8. reproduction/pos_tag_model/train_pos_tag.py (+22 / -10)
  9. test/model/test_seq_label.py (+9 / -5)

fastNLP/api/api.py (+2 / -16)

@@ -1,11 +1,7 @@
 import torch
 
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
-from fastNLP.core.predictor import Predictor
-
-from fastNLP.api.model_zoo import load_url
-
 
 model_urls = {
     'cws': "",
@@ -48,23 +44,13 @@ class POS_tagger(API):
         for example in query:
             data.append(Instance(words=example))
 
-        data = self.pipeline(data)
-
-        predictor = Predictor()
-        outputs = predictor.predict(self.model, data)
+        out = self.pipeline(data)
 
-        answers = []
-        for out in outputs:
-            out = out.numpy()
-            for sent in out:
-                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
-        return answers
+        return [x["outputs"] for x in out]
 
     def load(self, name):
         _dict = torch.load(name)
         self.pipeline = _dict['pipeline']
-        self.model = _dict['model']
-        self.tag_vocab = _dict["tag_vocab"]
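The refactored `predict` delegates everything to the pipeline: word indexing, length computation, the model forward pass, and index-to-tag decoding all live in the serialized processors. A minimal sketch of driving the pipeline directly, mirroring the new `predict` body (the query sentence is hypothetical; assumes a `model_pp.pkl` produced by `train_pos_tag.py` below):

```python
import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

# load() now only restores the pipeline; no separate model or tag_vocab
pipeline = torch.load("model_pp.pkl")["pipeline"]

data = DataSet()
for example in [["我", "爱", "北京"]]:       # hypothetical pre-tokenized query
    data.append(Instance(words=example))

out = pipeline(data)                         # indexer -> seq len -> model -> decoder
print([x["outputs"] for x in out])           # decoded POS tag strings per sentence
```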








fastNLP/core/metrics.py (+4 / -4)

@@ -38,18 +38,18 @@ class SeqLabelEvaluator(Evaluator):
     def __call__(self, predict, truth):
         """
 
-        :param predict: list of List, the network outputs from all batches.
+        :param predict: list of dict, the network outputs from all batches.
         :param truth: list of dict, the ground truths from all batch_y.
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
+        predict = [item["predict"] for item in predict]
         total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.tensor(x)
+            # x = torch.tensor(x)
             y = y.to(x)  # make sure they are in the same device
             mask = x.ge(1).long()
-            correct = torch.sum(x * mask == y * mask)
-            correct -= torch.sum(x.le(0))
+            correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
             total_correct += float(correct)
             total_count += float(torch.sum(mask))
         accuracy = total_correct / total_count
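The mask logic implies that padding uses index 0 and real tags start at 1: padded positions compare `0 == 0` and match trivially, so `torch.sum(x.le(0))` subtracts those spurious matches back out. A tiny worked example of the accuracy computation above:

```python
import torch

# assumes padding index 0 and real tag indices >= 1, as the mask implies
x = torch.tensor([[3, 5, 0, 0]])    # predictions, zero-padded
y = torch.tensor([[3, 4, 0, 0]])    # ground truth, zero-padded

mask = x.ge(1).long()               # [[1, 1, 0, 0]]: real prediction positions
correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
# equality holds at 3 positions, but 2 of them are padding -> correct == 1
accuracy = float(correct) / float(torch.sum(mask))   # 1 / 2 = 0.5
```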


fastNLP/core/trainer.py (+11 / -6)

@@ -9,7 +9,7 @@ from fastNLP.core.batch import Batch
 from fastNLP.core.loss import Loss
 from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
-from fastNLP.core.sampler import RandomSampler
+from fastNLP.core.sampler import BucketSampler
 from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
 from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
@@ -144,7 +144,8 @@ class Trainer(object):
             logger.info("training epoch {}".format(epoch))
 
             # prepare mini-batch iterator
-            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
+            data_iterator = Batch(train_data, batch_size=self.batch_size,
+                                  sampler=BucketSampler(10, self.batch_size, "word_seq_origin_len"),
                                   use_cuda=self.use_cuda)
             logger.info("prepared data iterator")
@@ -170,15 +171,19 @@ class Trainer(object):
         for batch_x, batch_y in data_iterator:
             prediction = self.data_forward(network, batch_x)
 
-            loss = self.get_loss(prediction, batch_y)
+            # TODO: refactor self.get_loss
+            loss = prediction["loss"] if "loss" in prediction else self.get_loss(prediction, batch_y)
+            # acc = self._evaluator([{"predict": prediction["predict"]}], [{"truth": batch_x["truth"]}])
+
             self.grad_backward(loss)
             self.update()
             self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
             for name, param in self._model.named_parameters():
                 if param.requires_grad:
-                    self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
-                    self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
-                    self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                    # self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
+                    # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
+                    # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                    pass
 
             if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                 end = time.time()
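Bucketed sampling groups sequences of similar length into the same batch, which cuts the padding that the packed-LSTM forward pass below would otherwise waste work on. A self-contained illustration of the idea (this is not fastNLP's `BucketSampler` implementation; the real constructor here takes a bucket count, the batch size, and the name of the length field):

```python
import random

def bucket_batches(lengths, num_buckets, batch_size):
    """Group indices into length-sorted buckets, then batch within each bucket."""
    idx = sorted(range(len(lengths)), key=lambda i: lengths[i])
    per_bucket = (len(idx) + num_buckets - 1) // num_buckets
    buckets = [idx[i:i + per_bucket] for i in range(0, len(idx), per_bucket)]
    batches = []
    for bucket in buckets:
        random.shuffle(bucket)        # keep randomness inside a bucket
        batches += [bucket[i:i + batch_size] for i in range(0, len(bucket), batch_size)]
    random.shuffle(batches)           # random batch order across buckets
    return batches

print(bucket_batches([5, 3, 9, 8, 2, 7], num_buckets=2, batch_size=2))
```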


fastNLP/loader/dataset_loader.py (+4 / -16)

@@ -361,10 +361,11 @@ class PeopleDailyCorpusLoader(DataSetLoader):
         pos_tag_examples = []
         ner_examples = []
         for sent in sents:
+            if len(sent) <= 2:
+                continue
             inside_ne = False
             sent_pos_tag = []
             sent_words = []
-            sent_word = []
             sent_ner = []
             words = sent.strip().split()[1:]
             for word in words:
@@ -389,23 +390,10 @@ class PeopleDailyCorpusLoader(DataSetLoader):
                     ner_tag = "O"
                 tmp = word.split("/")
                 token, pos = tmp[0], tmp[1]
-
-                pos_tag = []
-                for single_token in token:
-                    if len(token) == 1:
-                        single_pos = "S-" + pos
-                    else:
-                        single_pos = "M-" + pos
-                    pos_tag.append(single_pos)
-                    sent_word.append(single_token)
-                if len(token) > 1:
-                    pos_tag[0] = "B-" + pos
-                    pos_tag[-1] = "E-" + pos
-                sent_pos_tag += pos_tag
-
                 sent_ner.append(ner_tag)
+                sent_pos_tag.append(pos)
                 sent_words.append(token)
-            pos_tag_examples.append([sent_word, sent_pos_tag])
+            pos_tag_examples.append([sent_words, sent_pos_tag])
             ner_examples.append([sent_words, sent_ner])
         # List[List[List[str], List[str]]]
         return pos_tag_examples, ner_examples
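The loader previously exploded each word into per-character `B-`/`M-`/`E-`/`S-`-prefixed tags; it now keeps one plain POS tag per word, so both returned example lists are indexed by word. An illustrative before/after on a made-up People's Daily-style line (the real loader additionally handles the NER bracket syntax, omitted here):

```python
line = "19980101-01-001-001/m  迈向/v  充满/v  希望/n"

sent_words, sent_pos_tag = [], []
for word in line.strip().split()[1:]:   # split()[1:] drops the document id field
    tmp = word.split("/")
    token, pos = tmp[0], tmp[1]
    sent_pos_tag.append(pos)            # new: one tag per word
    sent_words.append(token)

print(sent_words)     # ['迈向', '充满', '希望']
print(sent_pos_tag)   # ['v', 'v', 'n']
# old behaviour tagged characters instead: 迈向/v -> ['B-v', 'E-v']
```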


fastNLP/models/base_model.py (+2 / -2)

@@ -14,5 +14,5 @@ class BaseModel(torch.nn.Module):
         trainer = Trainer(**train_args)
         trainer.train(self, train_data, dev_data)
 
-    def predict(self):
-        pass
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError
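Raising instead of silently passing turns a forgotten override into an immediate, attributable failure once `predict` becomes part of the pipeline contract. A two-line illustration with a hypothetical subclass:

```python
from fastNLP.models.base_model import BaseModel

class MyTagger(BaseModel):   # hypothetical subclass that forgets to override predict()
    pass

MyTagger().predict()         # raises NotImplementedError instead of returning None
```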

fastNLP/models/sequence_modeling.py (+37 / -14)

@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 
 from fastNLP.models.base_model import BaseModel
@@ -55,10 +56,8 @@ class SeqLabeling(BaseModel):
         # [batch_size, max_len, hidden_size * direction]
         x = self.Linear(x)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
 
     def loss(self, x, y):
         """ Since the loss has been computed in forward(), this function simply returns x."""
@@ -116,7 +115,7 @@ class AdvSeqLabel(SeqLabeling):
         num_classes = args["num_classes"]
 
         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
-        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
+        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=1, dropout=0.5, bidirectional=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
         self.relu = torch.nn.ReLU()
@@ -128,32 +127,56 @@ class AdvSeqLabel(SeqLabeling):
     def forward(self, word_seq, word_seq_origin_len, truth=None):
         """
         :param word_seq: LongTensor, [batch_size, mex_len]
-        :param word_seq_origin_len: list of int.
+        :param word_seq_origin_len: LongTensor, [batch_size, ]
         :param truth: LongTensor, [batch_size, max_len]
         :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                    If truth is not None, return loss, a scalar. Used in training.
         """
         word_seq = word_seq.long()
-        word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
+        word_seq_origin_len = word_seq_origin_len.cpu().numpy()
+        sent_len, idx_sort = np.sort(word_seq_origin_len)[::-1], np.argsort(-word_seq_origin_len)
+        idx_unsort = np.argsort(idx_sort)
+        idx_sort = torch.from_numpy(idx_sort)
+        idx_unsort = torch.from_numpy(idx_unsort)
+
+        # word_seq_origin_len = word_seq_origin_len.long()
+        truth = truth.long() if truth is not None else None
 
         batch_size = word_seq.size(0)
         max_len = word_seq.size(1)
+        if next(self.parameters()).is_cuda:
+            word_seq = word_seq.cuda()
+            idx_sort = idx_sort.cuda()
+            idx_unsort = idx_unsort.cuda()
+            self.mask = self.mask.cuda()
+            truth = truth.cuda() if truth is not None else None
+
         x = self.Embedding(word_seq)
         # [batch_size, max_len, word_emb_dim]
-        x = self.Rnn(x)
+
+        sent_variable = x.index_select(0, idx_sort)
+        sent_packed = torch.nn.utils.rnn.pack_padded_sequence(sent_variable, sent_len, batch_first=True)
+
+        x = self.Rnn(sent_packed)
         # [batch_size, max_len, hidden_size * direction]
+
+        sent_output = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)[0]
+        x = sent_output.index_select(0, idx_unsort)
+
         x = x.contiguous()
         x = x.view(batch_size * max_len, -1)
         x = self.Linear1(x)
-        x = self.batch_norm(x)
+        # x = self.batch_norm(x)
         x = self.relu(x)
         x = self.drop(x)
         x = self.Linear2(x)
         x = x.view(batch_size, max_len, -1)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
+
+    def predict(self, **x):
+        out = self.forward(**x)
+        return {"predict": out["predict"]}

reproduction/pos_tag_model/pos_tag.cfg (+2 / -2)

@@ -1,6 +1,6 @@
 [train]
-epochs = 5
-batch_size = 64
+epochs = 300
+batch_size = 32
 pickle_path = "./save/"
 validate = false
 save_best_dev = true


reproduction/pos_tag_model/train_pos_tag.py (+22 / -10)

@@ -1,11 +1,14 @@
 import copy
 import os
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+print(sys.path)
 import torch
 
-from fastNLP.api.pipeline import Pipeline
-from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor
 from fastNLP.core.instance import Instance
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
@@ -14,11 +17,12 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
 cfgfile = './pos_tag.cfg'
-# datadir = "/home/zyfeng/data/"
-# data_name = "POS_PD_1998.txt"
-datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
-data_name = "people_daily_raw.txt"
+datadir = "/home/zyfeng/data/"
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+# datadir = "/home/zyfeng/env/fastnlp_v_2/test/data_for_tests"
+# data_name = "people_daily_raw.txt"
 
 pos_tag_data_path = os.path.join(datadir, data_name)
@@ -58,6 +62,7 @@ def train():
     tag_indexer(dataset)
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)
+    #torch.save(dataset, "data_set.pkl")
 
     dev_set = copy.deepcopy(dataset)
     dev_set.set_is_target(truth=True)
@@ -75,14 +80,21 @@ def train():
     trainer = Trainer(epochs=train_param["epochs"],
                       batch_size=train_param["batch_size"],
                       validate=True,
-                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
-                      evaluator=SeqLabelEvaluator()
+                      optimizer=Optimizer("Adam", lr=0.01, weight_decay=0.9),
+                      evaluator=SeqLabelEvaluator(),
+                      use_cuda=True
                       )
     trainer.train(model, dataset, dev_set)
 
+    model_proc = ModelProcessor(model, "word_seq_origin_len")
+    dataset.set_is_target(truth=True)
+    res = model_proc.process(dataset)
+
+    decoder = Index2WordProcessor(tag_vocab_proc.get_vocab(), "predict", "outputs")
+
     # save model & pipeline
-    pp = Pipeline([word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
+    pp = Pipeline([word_indexer, seq_len_proc, model_proc, decoder])
+    save_dict = {"pipeline": pp}
    torch.save(save_dict, "model_pp.pkl")
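The saved artifact is now one self-contained pipeline: the model (via `ModelProcessor`) and the tag decoding (via `Index2WordProcessor`) ride inside it, so inference code needs nothing else. A toy, purely illustrative model of the four stages (fastNLP's `Pipeline` simply applies its processors in order, each stage adding a field to every instance; all names and values here are made up):

```python
from functools import reduce

vocab = {"我": 2, "爱": 3, "北京": 4}          # toy word vocabulary
tags = {0: "<pad>", 1: "r", 2: "v", 3: "ns"}   # toy tag vocabulary

def word_indexer(ins): ins["word_seq"] = [vocab[w] for w in ins["words"]]; return ins
def seq_len_proc(ins): ins["word_seq_origin_len"] = len(ins["word_seq"]); return ins
def model_proc(ins):   ins["predict"] = [1, 2, 3]; return ins   # stub for AdvSeqLabel
def decoder(ins):      ins["outputs"] = [tags[t] for t in ins["predict"]]; return ins

pipeline = [word_indexer, seq_len_proc, model_proc, decoder]
instance = {"words": ["我", "爱", "北京"]}
print(reduce(lambda ins, proc: proc(ins), pipeline, instance)["outputs"])  # ['r', 'v', 'ns']
```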






test/model/test_seq_label.py (+9 / -5)

@@ -1,22 +1,22 @@
 import os
 
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.preprocess import save_pickle
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.core.trainer import SeqLabelTrainer
+from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
+from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 from fastNLP.saver.model_saver import ModelSaver
 
 pickle_path = "./seq_label/"
 model_name = "seq_label_model.pkl"
-config_dir = "test/data_for_tests/config"
-data_path = "test/data_for_tests/people.txt"
-data_infer_path = "test/data_for_tests/people_infer.txt"
+config_dir = "../data_for_tests/config"
+data_path = "../data_for_tests/people.txt"
+data_infer_path = "../data_for_tests/people_infer.txt"
 
 
 def test_training():
@@ -84,3 +84,7 @@ def test_training():
     # Start testing with validation data
     data_dev.set_target(truth=True)
     tester.test(model, data_dev)
+
+
+if __name__ == "__main__":
+    test_training()
