
- Add the pos_tagger API; the full pipeline now runs end to end
- Fix a bug in processor
- Update several components under core/, removing the redundant batch parameters
- Fix a typo in CRF
- Update the POS tagging training script
tags/v0.2.0
FengZiYjun 5 years ago
commit 5dd0f74d6d
11 changed files with 80 additions and 26 deletions
  1. fastNLP/api/api.py (+50, -2)
  2. fastNLP/api/pipeline.py (+2, -2)
  3. fastNLP/core/batch.py (+1, -3)
  4. fastNLP/core/dataset.py (+1, -1)
  5. fastNLP/core/metrics.py (+3, -4)
  6. fastNLP/core/tester.py (+1, -1)
  7. fastNLP/core/trainer.py (+3, -3)
  8. fastNLP/models/sequence_modeling.py (+3, -3)
  9. fastNLP/modules/decoder/CRF.py (+1, -1)
  10. reproduction/pos_tag_model/pos_tag.cfg (+2, -2)
  11. reproduction/pos_tag_model/train_pos_tag.py (+13, -4)

fastNLP/api/api.py (+50, -2)

@@ -1,14 +1,18 @@

import torch

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.predictor import Predictor


class API:
def __init__(self):
self.pipeline = None
self.model = None

def predict(self):
pass
def predict(self, *args, **kwargs):
raise NotImplementedError

def load(self, name):
_dict = torch.load(name)
@@ -19,3 +23,47 @@ class API:
_dict = {'pipeline': self.pipeline,
'model': self.model}
torch.save(_dict, path)


class POS_tagger(API):
"""FastNLP API for Part-Of-Speech tagging.

"""

def __init__(self):
super(POS_tagger, self).__init__()

def predict(self, query):
"""

:param query: list of list of str. Each string is a token(word).
:return answer: list of list of str. Each string is a tag.
"""
self.load("/home/zyfeng/fastnlp_0.2.0/reproduction/pos_tag_model/model_pp.pkl")

data = DataSet()
for example in query:
data.append(Instance(words=example))

data = self.pipeline(data)

predictor = Predictor()
outputs = predictor.predict(self.model, data)

answers = []
for out in outputs:
out = out.numpy()
for sent in out:
answers.append([self.tag_vocab.to_word(tag) for tag in sent])
return answers

def load(self, name):
_dict = torch.load(name)
self.pipeline = _dict['pipeline']
self.model = _dict['model']
self.tag_vocab = _dict["tag_vocab"]


if __name__ == "__main__":
tagger = POS_tagger()
print(tagger.predict([["我", "是", "学生", "。"], ["我", "是", "学生", "。"]]))
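As context for the load call above: a minimal sketch of the checkpoint contract, assuming a `model_pp.pkl` produced by the updated training script at the end of this commit (the absolute path hard-coded in `predict` is machine-specific and would need to match):

```python
import torch

from fastNLP.api.api import POS_tagger

# The pickled dict must carry exactly the keys POS_tagger.load() reads;
# train_pos_tag.py below saves this structure as model_pp.pkl.
checkpoint = torch.load("model_pp.pkl")
assert {"pipeline", "model", "tag_vocab"} <= set(checkpoint)

tagger = POS_tagger()  # predict() re-loads its hard-coded checkpoint path internally
print(tagger.predict([["我", "是", "学生", "。"]]))  # one tag string per token
```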

fastNLP/api/pipeline.py (+2, -2)

@@ -11,7 +11,7 @@ class Pipeline:
self.pipeline = []
if isinstance(processors, list):
for proc in processors:
assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(processor))
assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc))
self.pipeline = processors

def add_processor(self, processor):
@@ -21,7 +21,7 @@ class Pipeline:
def process(self, dataset):
assert len(self.pipeline) != 0, "You need to add some processor first."

for proc_name, proc in self.pipeline:
for proc in self.pipeline:
dataset = proc(dataset)

return dataset
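Both hunks fix latent crashes: the assert referenced an undefined `processor` (a NameError the moment it fired), and `process` tried to unpack `(proc_name, proc)` pairs from what is a plain list of processors (a TypeError on the first iteration). With the fixes, a Pipeline is just an ordered list of callables applied to a DataSet, and is itself callable, as `api.py` above relies on. A minimal usage sketch, assuming processors built as in train_pos_tag.py below:

```python
from fastNLP.api.pipeline import Pipeline

# word_indexer and seq_len_proc are assumed to be Processor instances,
# e.g. the IndexerProcessor and SeqLenProcessor from train_pos_tag.py.
pp = Pipeline([word_indexer, seq_len_proc])
dataset = pp(dataset)  # same as pp.process(dataset): runs each processor in order
```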


fastNLP/core/batch.py (+1, -3)

@@ -9,7 +9,7 @@ class Batch(object):

"""

def __init__(self, dataset, batch_size, sampler, use_cuda, sort_in_batch=False, sort_key=None):
def __init__(self, dataset, batch_size, sampler, use_cuda):
"""

:param dataset: a DataSet object
@@ -22,8 +22,6 @@ class Batch(object):
self.batch_size = batch_size
self.sampler = sampler
self.use_cuda = use_cuda
self.sort_in_batch = sort_in_batch
self.sort_key = sort_key if sort_key is not None else 'word_seq'
self.idx_list = None
self.curidx = 0
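With `sort_in_batch` and `sort_key` gone, constructing a Batch takes only four arguments; the Tester and Trainer hunks below update their call sites to match. A sketch of the new call, assuming a dataset already indexed and padded by the usual processors:

```python
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

data_iterator = Batch(dataset, batch_size=64, sampler=RandomSampler(), use_cuda=False)
for batch_x, batch_y in data_iterator:
    pass  # batch_x holds the input tensors (e.g. word_seq), batch_y the targets
```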



fastNLP/core/dataset.py (+1, -1)

@@ -119,7 +119,7 @@ class DataSet(object):
assert isinstance(val, bool)
self.field_arrays[name].is_target = val
else:
raise KeyError
raise KeyError("{} is not a valid field name.".format(name))
return self

def set_need_tensor(self, **kwargs):
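A small quality-of-life change: the bare `KeyError` now names the offending field, so a typo in a field name points straight at the culprit. For instance:

```python
# Hypothetical misuse: 'tag_ids' was never added as a field on this DataSet.
dataset.set_is_target(tag_ids=True)
# Before: bare KeyError. After: KeyError: 'tag_ids is not a valid field name.'
```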


fastNLP/core/metrics.py (+3, -4)

@@ -43,12 +43,11 @@ class SeqLabelEvaluator(Evaluator):
:return accuracy:
"""
truth = [item["truth"] for item in truth]
total_correct, total_count= 0., 0.
total_correct, total_count = 0., 0.
for x, y in zip(predict, truth):
x = torch.Tensor(x)
x = torch.tensor(x)
y = y.to(x) # make sure they are in the same device
mask = x.ge(1).float()
# correct = torch.sum(x * mask.float() == (y * mask.long()).float())
mask = x.ge(1).long()
correct = torch.sum(x * mask == y * mask)
correct -= torch.sum(x.le(0))
total_correct += float(correct)
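The masking logic here is easy to misread: padded positions (where the predicted tag id is 0) are zeroed in both tensors, so they compare equal and inflate the match count, and `torch.sum(x.le(0))` subtracts them back out. Switching `torch.Tensor` to `torch.tensor` and the mask to `long` keeps everything in integer arithmetic, so the equality test is exact. A small worked example under that reading:

```python
import torch

x = torch.tensor([3, 5, 0, 0])  # predicted tag ids; 0 marks padding
y = torch.tensor([3, 2, 0, 0])  # gold tag ids
mask = x.ge(1).long()           # [1, 1, 0, 0]
correct = torch.sum(x * mask == y * mask)  # 3: one real hit + two padded positions
correct -= torch.sum(x.le(0))              # remove the 2 padded pseudo-hits -> 1
print(int(correct))                        # 1 correct tag out of 2 real tokens
```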


fastNLP/core/tester.py (+1, -1)

@@ -74,7 +74,7 @@ class Tester(object):
output_list = []
truth_list = []

data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)

with torch.no_grad():
for batch_x, batch_y in data_iterator:


fastNLP/core/trainer.py (+3, -3)

@@ -11,6 +11,7 @@ from fastNLP.core.metrics import Evaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.sampler import RandomSampler
from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
from fastNLP.core.tester import Tester
from fastNLP.saver.logger import create_logger
from fastNLP.saver.model_saver import ModelSaver

@@ -144,7 +145,7 @@ class Trainer(object):

# prepare mini-batch iterator
data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
use_cuda=self.use_cuda)
logger.info("prepared data iterator")

# one forward and backward pass
@@ -230,7 +231,6 @@ class Trainer(object):
def update(self):
"""Perform weight update on a model.

For PyTorch, just call optimizer to update.
"""
self._optimizer.step()

@@ -319,7 +319,7 @@ class Trainer(object):
ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)

def _create_validator(self, valid_args):
raise NotImplementedError
return Tester(**valid_args)

def set_validator(self, validor):
self.validator = validor
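The `_create_validator` change is what lets the POS script flip `validate=True`: the base Trainer now builds a generic Tester instead of raising NotImplementedError, so validation works without a task-specific subclass. The call pattern, as the training script below uses it:

```python
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import Trainer

trainer = Trainer(epochs=5, batch_size=64, validate=True,
                  optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
                  evaluator=SeqLabelEvaluator())
trainer.train(model, dataset, dev_set)  # dev_set is evaluated by the built-in Tester
```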


fastNLP/models/sequence_modeling.py (+3, -3)

@@ -116,11 +116,11 @@ class AdvSeqLabel(SeqLabeling):
num_classes = args["num_classes"]

self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.3, bidirectional=True)
self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
self.relu = torch.nn.ReLU()
self.drop = torch.nn.Dropout(0.3)
self.drop = torch.nn.Dropout(0.5)
self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)

self.Crf = decoder.CRF.ConditionalRandomField(num_classes)
@@ -135,7 +135,7 @@ class AdvSeqLabel(SeqLabeling):
"""
word_seq = word_seq.long()
word_seq_origin_len = word_seq_origin_len.long()
truth = truth.long()
truth = truth.long() if truth is not None else None
self.mask = self.make_mask(word_seq, word_seq_origin_len)

batch_size = word_seq.size(0)
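The `truth` guard is the inference-enabling change: the Predictor calls `forward` without gold labels, and the old unconditional `truth.long()` crashed on `None`. A sketch of the two call patterns, with keyword names taken from the variables shown above (model and input tensors assumed already built):

```python
# model: an AdvSeqLabel instance; word_seq / seq_len: LongTensors from a Batch.
# Training step: gold tags available.
output = model(word_seq=word_seq, word_seq_origin_len=seq_len, truth=truth)

# Inference (what Predictor does): no labels. The new guard keeps truth as
# None instead of crashing on None.long().
output = model(word_seq=word_seq, word_seq_origin_len=seq_len, truth=None)
```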


fastNLP/modules/decoder/CRF.py (+1, -1)

@@ -128,7 +128,7 @@ class ConditionalRandomField(nn.Module):
vpath = data.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)
vscore = data[0]
if self.include_start_end_trans:
vscore += self.start_scores.view(1. -1)
vscore += self.start_scores.view(1, -1)
for i in range(1, seq_len):
prev_score = vscore.view(batch_size, n_tags, 1)
cur_score = data[i].view(batch_size, 1, n_tags)
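The one-character fix matters more than it looks: `view(1. -1)` parses as the single float argument `1.0 - 1`, so the decoding path shown here crashed whenever `include_start_end_trans` was set, while `view(1, -1)` reshapes the start scores into a 1 x n_tags row that broadcasts over the batch. A quick demonstration:

```python
import torch

start_scores = torch.randn(6)      # one transition score per tag
row = start_scores.view(1, -1)     # shape (1, 6): broadcasts across the batch
try:
    start_scores.view(1. - 1)      # parses as view(0.0) -> TypeError at runtime
except TypeError as err:
    print("the typo would raise:", err)
```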


reproduction/pos_tag_model/pos_tag.cfg (+2, -2)

@@ -1,6 +1,6 @@
[train]
epochs = 20
batch_size = 32
epochs = 5
batch_size = 64
pickle_path = "./save/"
validate = false
save_best_dev = true


reproduction/pos_tag_model/train_pos_tag.py (+13, -4)

@@ -1,3 +1,4 @@
import copy
import os

import torch
@@ -6,6 +7,7 @@ from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.metrics import SeqLabelEvaluator
from fastNLP.core.optimizer import Optimizer
from fastNLP.core.trainer import Trainer
from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
@@ -13,9 +15,12 @@ from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
from fastNLP.models.sequence_modeling import AdvSeqLabel

cfgfile = './pos_tag.cfg'
# datadir = "/home/zyfeng/data/"
# data_name = "POS_PD_1998.txt"
datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
data_name = "people_daily_raw.txt"


pos_tag_data_path = os.path.join(datadir, data_name)
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")
@@ -54,6 +59,9 @@ def train():
seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
seq_len_proc(dataset)

dev_set = copy.deepcopy(dataset)
dev_set.set_is_target(truth=True)

print("processors defined")
# dataset.set_is_target(tag_ids=True)
model_param["vocab_size"] = len(word_vocab_proc.get_vocab())
@@ -66,14 +74,15 @@ def train():
# call trainer to train
trainer = Trainer(epochs=train_param["epochs"],
batch_size=train_param["batch_size"],
validate=False,
validate=True,
optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
evaluator=SeqLabelEvaluator()
)
trainer.train(model, dataset)
trainer.train(model, dataset, dev_set)

# save model & pipeline
pp = Pipeline([word_vocab_proc, word_indexer, seq_len_proc])
save_dict = {"pipeline": pp, "model": model}
pp = Pipeline([word_indexer, seq_len_proc])
save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
torch.save(save_dict, "model_pp.pkl")
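Two deliberate choices in this hunk are worth spelling out. The saved pipeline now starts at `word_indexer` rather than `word_vocab_proc`: the vocabulary is already frozen inside the indexer, so rebuilding it at inference time would be wrong as well as wasteful. And `tag_vocab` rides along in the checkpoint so `POS_tagger.predict` can map tag ids back to strings, roughly:

```python
# Sketch of what the saved tag_vocab enables downstream (predicted_ids is a
# hypothetical list of tag ids from the model; see POS_tagger.predict above).
tag_vocab = save_dict["tag_vocab"]
tags = [tag_vocab.to_word(i) for i in predicted_ids]  # tag ids -> tag strings
```

Note also that `dev_set` is a deep copy of the training data, so the validation score measures fit rather than generalization; a held-out split belongs there eventually.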



