
Save the POS tagging script

Commit 4be15a5b43 by FengZiYjun (yunfan) · tags/v0.2.0
9 changed files with 93 additions and 75 deletions
  1. fastNLP/api/api.py (+2, -16)
  2. fastNLP/core/metrics.py (+4, -4)
  3. fastNLP/core/trainer.py (+11, -6)
  4. fastNLP/loader/dataset_loader.py (+4, -16)
  5. fastNLP/models/base_model.py (+2, -2)
  6. fastNLP/models/sequence_modeling.py (+37, -14)
  7. reproduction/pos_tag_model/pos_tag.cfg (+2, -2)
  8. reproduction/pos_tag_model/train_pos_tag.py (+22, -10)
  9. test/model/test_seq_label.py (+9, -5)

fastNLP/api/api.py (+2, -16)

@@ -1,11 +1,7 @@
 import torch
 
-from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
-from fastNLP.core.predictor import Predictor
-
 from fastNLP.api.model_zoo import load_url
-
 
 model_urls = {
     'cws': "",
@@ -48,23 +44,13 @@ class POS_tagger(API):
         for example in query:
             data.append(Instance(words=example))
 
-        data = self.pipeline(data)
-
-        predictor = Predictor()
-        outputs = predictor.predict(self.model, data)
+        out = self.pipeline(data)
 
-        answers = []
-        for out in outputs:
-            out = out.numpy()
-            for sent in out:
-                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
-        return answers
+        return [x["outputs"] for x in out]
 
     def load(self, name):
         _dict = torch.load(name)
         self.pipeline = _dict['pipeline']
-        self.model = _dict['model']
-        self.tag_vocab = _dict["tag_vocab"]
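The net effect of this change: `predict` no longer drives a separate `Predictor` and maps indices back through `self.tag_vocab` by hand; the saved pipeline now carries the model and the tag decoding itself (see `train_pos_tag.py` below), so inference is a single pipeline call and `load` only has to restore one object. A minimal sketch of that style, with hypothetical stand-in classes rather than fastNLP's real `Pipeline`/processors:

# Hypothetical stand-ins for fastNLP's Pipeline/Processor; the real pipeline
# assembled in train_pos_tag.py is [word_indexer, seq_len_proc, model_proc, decoder].

class ToyTagger:
    """Stand-in for the model + decoder steps: fills an "outputs" field."""
    def process(self, dataset):
        for instance in dataset:
            instance["outputs"] = ["n"] * len(instance["words"])
        return dataset

class ToyPipeline:
    """Runs processors in order; each one reads/writes fields on the instances."""
    def __init__(self, processors):
        self.processors = processors

    def __call__(self, dataset):
        for proc in self.processors:
            dataset = proc.process(dataset)
        return dataset

pipeline = ToyPipeline([ToyTagger()])
out = pipeline([{"words": ["我", "爱", "北京"]}])
print([x["outputs"] for x in out])  # the same shape the new predict returns
# [['n', 'n', 'n']]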





fastNLP/core/metrics.py (+4, -4)

@@ -38,18 +38,18 @@ class SeqLabelEvaluator(Evaluator):
     def __call__(self, predict, truth):
         """
 
-        :param predict: list of List, the network outputs from all batches.
+        :param predict: list of dict, the network outputs from all batches.
         :param truth: list of dict, the ground truths from all batch_y.
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
+        predict = [item["predict"] for item in predict]
         total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.tensor(x)
+            # x = torch.tensor(x)
             y = y.to(x)  # make sure they are in the same device
             mask = x.ge(1).long()
-            correct = torch.sum(x * mask == y * mask)
-            correct -= torch.sum(x.le(0))
+            correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
             total_correct += float(correct)
             total_count += float(torch.sum(mask))
         accuracy = total_correct / total_count
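A note on the arithmetic here: index 0 is padding, so `mask = x.ge(1)` keeps only real positions, and because padded positions satisfy `x * mask == y * mask` trivially (both sides are 0), the `- torch.sum(x.le(0))` term cancels them out of the numerator. A self-contained toy check, not fastNLP code:

import torch

# Toy predictions/labels for one batch; 0 is the padding index.
x = torch.tensor([[3, 1, 2, 2, 0],
                  [1, 1, 4, 4, 0]])  # predictions
y = torch.tensor([[3, 2, 2, 2, 0],
                  [1, 1, 4, 2, 0]])  # ground truth

mask = x.ge(1).long()                # 1 on real tokens, 0 on padding
# Padded positions compare equal as 0 == 0, so subtract them back out.
correct = torch.sum(x * mask == y * mask) - torch.sum(x.le(0))
total = torch.sum(mask)
print(float(correct) / float(total))  # 6 correct of 8 real tokens -> 0.75

One quirk worth knowing: the mask is derived from the predictions, so a position where the model predicts the padding index on a real token silently drops out of both the numerator and the denominator.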


fastNLP/core/trainer.py (+11, -6)

@@ -9,7 +9,7 @@ from fastNLP.core.batch import Batch
 from fastNLP.core.loss import Loss
 from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
-from fastNLP.core.sampler import RandomSampler
+from fastNLP.core.sampler import BucketSampler
 from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
 from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
@@ -144,7 +144,8 @@ class Trainer(object):
             logger.info("training epoch {}".format(epoch))
 
             # prepare mini-batch iterator
-            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
+            data_iterator = Batch(train_data, batch_size=self.batch_size,
+                                  sampler=BucketSampler(10, self.batch_size, "word_seq_origin_len"),
                                   use_cuda=self.use_cuda)
             logger.info("prepared data iterator")

@@ -170,15 +171,19 @@
             for batch_x, batch_y in data_iterator:
                 prediction = self.data_forward(network, batch_x)
 
-                loss = self.get_loss(prediction, batch_y)
+                # TODO: refactor self.get_loss
+                loss = prediction["loss"] if "loss" in prediction else self.get_loss(prediction, batch_y)
+                # acc = self._evaluator([{"predict": prediction["predict"]}], [{"truth": batch_x["truth"]}])
 
                 self.grad_backward(loss)
                 self.update()
                 self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
                 for name, param in self._model.named_parameters():
                     if param.requires_grad:
-                        self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
-                        self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
-                        self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
+                        # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
+                        pass
 
                 if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                     end = time.time()
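`BucketSampler` (replacing `RandomSampler`) groups sequences of similar length into the same batch, so each batch carries less padding; together with the packed-RNN change in `sequence_modeling.py` below, this is the usual trick for faster sequence training. A toy sketch of the bucketing idea, not fastNLP's actual sampler (its constructor arguments above appear to be num_buckets, batch_size, and the length field name):

import random

def bucket_batches(seq_lens, num_buckets, batch_size):
    """Toy bucket sampler: sort indices by length, slice into buckets,
    then draw batches within a bucket so lengths stay similar."""
    idx_by_len = sorted(range(len(seq_lens)), key=lambda i: seq_lens[i])
    bucket_size = (len(idx_by_len) + num_buckets - 1) // num_buckets
    buckets = [idx_by_len[i:i + bucket_size]
               for i in range(0, len(idx_by_len), bucket_size)]
    batches = []
    for bucket in buckets:
        random.shuffle(bucket)  # keep randomness inside each bucket
        batches.extend(bucket[i:i + batch_size]
                       for i in range(0, len(bucket), batch_size))
    random.shuffle(batches)     # and across batches
    return batches

lens = [5, 50, 7, 48, 6, 52, 8, 49]  # e.g. values of "word_seq_origin_len"
for batch in bucket_batches(lens, num_buckets=2, batch_size=2):
    print(batch, [lens[i] for i in batch])  # each batch has similar lengths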


fastNLP/loader/dataset_loader.py (+4, -16)

@@ -361,10 +361,11 @@ class PeopleDailyCorpusLoader(DataSetLoader):
         pos_tag_examples = []
         ner_examples = []
         for sent in sents:
+            if len(sent) <= 2:
+                continue
             inside_ne = False
             sent_pos_tag = []
             sent_words = []
-            sent_word = []
             sent_ner = []
             words = sent.strip().split()[1:]
             for word in words:
@@ -389,23 +390,10 @@ class PeopleDailyCorpusLoader(DataSetLoader):
                     ner_tag = "O"
                 tmp = word.split("/")
                 token, pos = tmp[0], tmp[1]
-
-                pos_tag = []
-                for single_token in token:
-                    if len(token) == 1:
-                        single_pos = "S-" + pos
-                    else:
-                        single_pos = "M-" + pos
-                    pos_tag.append(single_pos)
-                    sent_word.append(single_token)
-                if len(token) > 1:
-                    pos_tag[0] = "B-" + pos
-                    pos_tag[-1] = "E-" + pos
-                sent_pos_tag += pos_tag
-
                 sent_ner.append(ner_tag)
-            pos_tag_examples.append([sent_word, sent_pos_tag])
+                sent_pos_tag.append(pos)
+                sent_words.append(token)
+            pos_tag_examples.append([sent_words, sent_pos_tag])
             ner_examples.append([sent_words, sent_ner])
         # List[List[List[str], List[str]]]
         return pos_tag_examples, ner_examples
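The loader change is easiest to see on a single token. For a People's Daily entry such as `迈向/v`, the deleted block expanded the word into characters tagged with a `B-`/`M-`/`E-`/`S-` scheme; the new code keeps the whole word with its plain POS tag. A side-by-side sketch of the two behaviors, assuming the corpus's `word/pos` token format:

def char_level(words):
    """Mirrors what the deleted block computed: one B/M/E/S-prefixed tag per character."""
    chars, tags = [], []
    for word in words:
        token, pos = word.split("/")
        if len(token) == 1:
            word_tags = ["S-" + pos]
        else:
            word_tags = (["B-" + pos]
                         + ["M-" + pos] * (len(token) - 2)
                         + ["E-" + pos])
        chars.extend(token)
        tags.extend(word_tags)
    return chars, tags

def word_level(words):
    """New behavior: one plain POS tag per word."""
    tokens, tags = zip(*(w.split("/") for w in words))
    return list(tokens), list(tags)

sample = ["迈向/v", "充满/v", "希望/n", "的/u"]
print(char_level(sample))
# (['迈', '向', '充', '满', '希', '望', '的'], ['B-v', 'E-v', 'B-v', 'E-v', 'B-n', 'E-n', 'S-u'])
print(word_level(sample))
# (['迈向', '充满', '希望', '的'], ['v', 'v', 'n', 'u'])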


fastNLP/models/base_model.py (+2, -2)

@@ -14,5 +14,5 @@ class BaseModel(torch.nn.Module):
         trainer = Trainer(**train_args)
         trainer.train(self, train_data, dev_data)
 
-    def predict(self):
-        pass
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError

fastNLP/models/sequence_modeling.py (+37, -14)

@@ -1,3 +1,4 @@
+import numpy as np
 import torch
 
 from fastNLP.models.base_model import BaseModel
@@ -55,10 +56,8 @@ class SeqLabeling(BaseModel):
         # [batch_size, max_len, hidden_size * direction]
         x = self.Linear(x)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
 
     def loss(self, x, y):
         """ Since the loss has been computed in forward(), this function simply returns x."""
@@ -116,7 +115,7 @@ class AdvSeqLabel(SeqLabeling):
         num_classes = args["num_classes"]
 
         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
-        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
+        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=1, dropout=0.5, bidirectional=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
         self.relu = torch.nn.ReLU()
@@ -128,32 +127,56 @@
     def forward(self, word_seq, word_seq_origin_len, truth=None):
         """
         :param word_seq: LongTensor, [batch_size, mex_len]
-        :param word_seq_origin_len: list of int.
+        :param word_seq_origin_len: LongTensor, [batch_size, ]
         :param truth: LongTensor, [batch_size, max_len]
         :return y: If truth is None, return list of [decode path(list)]. Used in testing and predicting.
                    If truth is not None, return loss, a scalar. Used in training.
         """
 
         word_seq = word_seq.long()
-        word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
+        word_seq_origin_len = word_seq_origin_len.cpu().numpy()
+        sent_len, idx_sort = np.sort(word_seq_origin_len)[::-1], np.argsort(-word_seq_origin_len)
+        idx_unsort = np.argsort(idx_sort)
+        idx_sort = torch.from_numpy(idx_sort)
+        idx_unsort = torch.from_numpy(idx_unsort)
+
+        # word_seq_origin_len = word_seq_origin_len.long()
+        truth = truth.long() if truth is not None else None
 
         batch_size = word_seq.size(0)
         max_len = word_seq.size(1)
+        if next(self.parameters()).is_cuda:
+            word_seq = word_seq.cuda()
+            idx_sort = idx_sort.cuda()
+            idx_unsort = idx_unsort.cuda()
+            self.mask = self.mask.cuda()
+            truth = truth.cuda() if truth is not None else None
 
         x = self.Embedding(word_seq)
         # [batch_size, max_len, word_emb_dim]
-        x = self.Rnn(x)
+
+        sent_variable = x.index_select(0, idx_sort)
+        sent_packed = torch.nn.utils.rnn.pack_padded_sequence(sent_variable, sent_len, batch_first=True)
+
+        x = self.Rnn(sent_packed)
         # [batch_size, max_len, hidden_size * direction]
+
+        sent_output = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)[0]
+        x = sent_output.index_select(0, idx_unsort)
+
         x = x.contiguous()
         x = x.view(batch_size * max_len, -1)
         x = self.Linear1(x)
-        x = self.batch_norm(x)
+        # x = self.batch_norm(x)
         x = self.relu(x)
         x = self.drop(x)
         x = self.Linear2(x)
         x = x.view(batch_size, max_len, -1)
         # [batch_size, max_len, num_classes]
-        if truth is not None:
-            return self._internal_loss(x, truth)
-        else:
-            return self.decode(x)
+        return {"loss": self._internal_loss(x, truth) if truth is not None else None,
+                "predict": self.decode(x)}
+
+    def predict(self, **x):
+        out = self.forward(**x)
+        return {"predict": out["predict"]}
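The rewritten `AdvSeqLabel.forward` is the classic sort-pack-unsort pattern: `pack_padded_sequence` in this era of PyTorch required the batch sorted by descending length, so the code sorts with numpy, packs, runs the RNN over real tokens only, pads back, and `index_select`s with the inverse permutation to restore the original order (newer PyTorch can do this internally via `enforce_sorted=False`). A self-contained sketch of just that pattern with a plain `torch.nn.LSTM` (fastNLP's `encoder.lstm.LSTM` wrapper differs in what it returns):

import numpy as np
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

rnn = torch.nn.LSTM(input_size=4, hidden_size=3, batch_first=True)

x = torch.randn(3, 5, 4)            # [batch_size, max_len, emb_dim]
lengths = np.array([2, 5, 3])       # true lengths, in original batch order

# Sort by descending length and remember the inverse permutation.
sent_len, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths)
idx_unsort = np.argsort(idx_sort)
idx_sort = torch.from_numpy(idx_sort)
idx_unsort = torch.from_numpy(idx_unsort)

# .copy() because np.sort(...)[::-1] is a negative-stride view.
packed = pack_padded_sequence(x.index_select(0, idx_sort), sent_len.copy(),
                              batch_first=True)
out, _ = rnn(packed)                   # the RNN never sees padded steps
out = pad_packed_sequence(out, batch_first=True)[0]
out = out.index_select(0, idx_unsort)  # back to original batch order
print(out.shape)                       # torch.Size([3, 5, 3])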

reproduction/pos_tag_model/pos_tag.cfg (+2, -2)

@@ -1,6 +1,6 @@
 [train]
-epochs = 5
-batch_size = 64
+epochs = 300
+batch_size = 32
 pickle_path = "./save/"
 validate = false
 save_best_dev = true


reproduction/pos_tag_model/train_pos_tag.py (+22, -10)

@@ -1,11 +1,14 @@
 import copy
 import os
+import sys
 
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
+print(sys.path)
 import torch
 
-from fastNLP.api.pipeline import Pipeline
-from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet
+from fastNLP.api.pipeline import Pipeline
+from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor
 from fastNLP.core.instance import Instance
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
@@ -14,11 +17,12 @@ from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
 
 cfgfile = './pos_tag.cfg'
-# datadir = "/home/zyfeng/data/"
-# data_name = "POS_PD_1998.txt"
-datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
-data_name = "people_daily_raw.txt"
+datadir = "/home/zyfeng/data/"
+data_name = "CWS_POS_TAG_NER_people_daily.txt"
+# datadir = "/home/zyfeng/env/fastnlp_v_2/test/data_for_tests"
+# data_name = "people_daily_raw.txt"
 
 pos_tag_data_path = os.path.join(datadir, data_name)
@@ -58,6 +62,7 @@ def train():
     tag_indexer(dataset)
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)
+    #torch.save(dataset, "data_set.pkl")
 
     dev_set = copy.deepcopy(dataset)
     dev_set.set_is_target(truth=True)
@@ -75,14 +80,21 @@ def train():
     trainer = Trainer(epochs=train_param["epochs"],
                       batch_size=train_param["batch_size"],
                       validate=True,
-                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
-                      evaluator=SeqLabelEvaluator()
+                      optimizer=Optimizer("Adam", lr=0.01, weight_decay=0.9),
+                      evaluator=SeqLabelEvaluator(),
+                      use_cuda=True
                       )
     trainer.train(model, dataset, dev_set)
 
+    model_proc = ModelProcessor(model, "word_seq_origin_len")
+    dataset.set_is_target(truth=True)
+    res = model_proc.process(dataset)
+
+    decoder = Index2WordProcessor(tag_vocab_proc.get_vocab(), "predict", "outputs")
+
     # save model & pipeline
-    pp = Pipeline([word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
+    pp = Pipeline([word_indexer, seq_len_proc, model_proc, decoder])
+    save_dict = {"pipeline": pp}
     torch.save(save_dict, "model_pp.pkl")
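Correspondingly, the saved artifact shrinks from `{"pipeline", "model", "tag_vocab"}` to just the pipeline: the `ModelProcessor` and `Index2WordProcessor` appended above carry the model and the tag vocabulary inside the pickled `Pipeline`, which is why `POS_tagger.load` in `api.py` now restores only `_dict['pipeline']`. A minimal sketch of the underlying save/load mechanism (toy payload, not fastNLP objects):

import torch

# torch.save pickles arbitrary Python objects, so one file can carry model
# weights, vocabularies, and processing steps together as a single pipeline.
artifact = {"pipeline": {"model": torch.nn.Linear(4, 2),
                         "tag_vocab": ["<pad>", "n", "v", "u"]}}
torch.save(artifact, "model_pp.pkl")

# weights_only=False is needed on newer PyTorch to unpickle arbitrary objects
# (the flag did not exist in the 2018-era PyTorch this diff targets).
_dict = torch.load("model_pp.pkl", weights_only=False)
print(_dict["pipeline"]["tag_vocab"])  # ['<pad>', 'n', 'v', 'u']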




test/model/test_seq_label.py (+9, -5)

@@ -1,22 +1,22 @@
 import os
 
+from fastNLP.core.vocabulary import Vocabulary
+from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.core.metrics import SeqLabelEvaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.preprocess import save_pickle
 from fastNLP.core.tester import SeqLabelTester
 from fastNLP.core.trainer import SeqLabelTrainer
-from fastNLP.core.vocabulary import Vocabulary
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
-from fastNLP.loader.dataset_loader import TokenizeDataSetLoader
 from fastNLP.loader.model_loader import ModelLoader
 from fastNLP.models.sequence_modeling import SeqLabeling
 from fastNLP.saver.model_saver import ModelSaver
 
 pickle_path = "./seq_label/"
 model_name = "seq_label_model.pkl"
-config_dir = "test/data_for_tests/config"
-data_path = "test/data_for_tests/people.txt"
-data_infer_path = "test/data_for_tests/people_infer.txt"
+config_dir = "../data_for_tests/config"
+data_path = "../data_for_tests/people.txt"
+data_infer_path = "../data_for_tests/people_infer.txt"


@@ -84,3 +84,7 @@ def test_training():
     # Start testing with validation data
     data_dev.set_target(truth=True)
     tester.test(model, data_dev)
+
+
+if __name__ == "__main__":
+    test_training()
