
Merge branch 'dataset' of github.com:yhcc/fastNLP into dataset

tags/v0.2.0
yh_cc 5 years ago
parent commit: 7df33b23ea
12 changed files with 101 additions and 40 deletions
  1. fastNLP/api/api.py                          (+50  -2)
  2. fastNLP/api/pipeline.py                     (+1   -1)
  3. fastNLP/core/batch.py                       (+1   -3)
  4. fastNLP/core/dataset.py                     (+1   -1)
  5. fastNLP/core/metrics.py                     (+3   -4)
  6. fastNLP/core/tester.py                      (+1   -1)
  7. fastNLP/core/trainer.py                     (+18  -16)
  8. fastNLP/models/sequence_modeling.py         (+3   -3)
  9. fastNLP/modules/decoder/CRF.py              (+2   -2)
 10. reproduction/chinese_word_segment/cws.cfg   (+2   -2)
 11. reproduction/pos_tag_model/pos_tag.cfg      (+1   -1)
 12. reproduction/pos_tag_model/train_pos_tag.py (+18  -4)

fastNLP/api/api.py  (+50 -2)

@@ -1,14 +1,18 @@
 import torch
 
+from fastNLP.core.dataset import DataSet
+from fastNLP.core.instance import Instance
+from fastNLP.core.predictor import Predictor
+
 
 class API:
     def __init__(self):
         self.pipeline = None
         self.model = None
 
-    def predict(self):
-        pass
+    def predict(self, *args, **kwargs):
+        raise NotImplementedError
 
     def load(self, name):
         _dict = torch.load(name)
@@ -19,3 +23,47 @@ class API:
         _dict = {'pipeline': self.pipeline,
                  'model': self.model}
         torch.save(_dict, path)
+
+
+class POS_tagger(API):
+    """FastNLP API for Part-Of-Speech tagging.
+
+    """
+
+    def __init__(self):
+        super(POS_tagger, self).__init__()
+
+    def predict(self, query):
+        """
+
+        :param query: list of list of str. Each string is a token(word).
+        :return answer: list of list of str. Each string is a tag.
+        """
+        self.load("/home/zyfeng/fastnlp_0.2.0/reproduction/pos_tag_model/model_pp.pkl")
+
+        data = DataSet()
+        for example in query:
+            data.append(Instance(words=example))
+
+        data = self.pipeline(data)
+
+        predictor = Predictor()
+        outputs = predictor.predict(self.model, data)
+
+        answers = []
+        for out in outputs:
+            out = out.numpy()
+            for sent in out:
+                answers.append([self.tag_vocab.to_word(tag) for tag in sent])
+        return answers
+
+    def load(self, name):
+        _dict = torch.load(name)
+        self.pipeline = _dict['pipeline']
+        self.model = _dict['model']
+        self.tag_vocab = _dict["tag_vocab"]
+
+
+if __name__ == "__main__":
+    tagger = POS_tagger()
+    print(tagger.predict([["我", "是", "学生", "。"], ["我", "是", "学生", "。"]]))
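POS_tagger.load() only works if the checkpoint on disk was written with matching keys; the training script changed later in this commit saves exactly that layout. A minimal sketch of the contract, with hypothetical placeholders standing in for the fitted Pipeline, the trained AdvSeqLabel model, and the tag Vocabulary built in reproduction/pos_tag_model/train_pos_tag.py:

import torch

# Hypothetical placeholders; in the real script these are the fitted Pipeline,
# the trained AdvSeqLabel model, and the tag Vocabulary.
pipeline, model, tag_vocab = None, None, None

# POS_tagger.load(name) reads exactly these three keys back out of the file.
torch.save({"pipeline": pipeline, "model": model, "tag_vocab": tag_vocab}, "model_pp.pkl")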

fastNLP/api/pipeline.py  (+1 -1)

@@ -11,7 +11,7 @@ class Pipeline:
         self.pipeline = []
         if isinstance(processors, list):
             for proc in processors:
-                assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(processor))
+                assert isinstance(proc, Processor), "Must be a Processor, not {}.".format(type(proc))
             self.pipeline = processors
 
     def add_processor(self, processor):


fastNLP/core/batch.py  (+1 -3)

@@ -9,7 +9,7 @@ class Batch(object):
 
     """
 
-    def __init__(self, dataset, batch_size, sampler, use_cuda, sort_in_batch=False, sort_key=None):
+    def __init__(self, dataset, batch_size, sampler, use_cuda):
         """
 
         :param dataset: a DataSet object
@@ -22,8 +22,6 @@ class Batch(object):
         self.batch_size = batch_size
         self.sampler = sampler
         self.use_cuda = use_cuda
-        self.sort_in_batch = sort_in_batch
-        self.sort_key = sort_key if sort_key is not None else 'word_seq'
         self.idx_list = None
         self.curidx = 0
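With the sorting flags gone, call sites construct the iterator with just these four arguments (the tester.py and trainer.py hunks below make the same change). A sketch under the assumption that the DataSet already has its fields added and indexed:

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.sampler import RandomSampler

data = DataSet()  # assume fields such as 'word_seq' have already been added and indexed
data_iterator = Batch(data, batch_size=16, sampler=RandomSampler(), use_cuda=False)
for batch_x, batch_y in data_iterator:
    pass  # batch_x / batch_y hold the tensorized fields for one mini-batch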




fastNLP/core/dataset.py  (+1 -1)

@@ -119,7 +119,7 @@ class DataSet(object):
             assert isinstance(val, bool)
             self.field_arrays[name].is_target = val
         else:
-            raise KeyError
+            raise KeyError("{} is not a valid field name.".format(name))
         return self
 
     def set_need_tensor(self, **kwargs):


fastNLP/core/metrics.py  (+3 -4)

@@ -43,12 +43,11 @@ class SeqLabelEvaluator(Evaluator):
         :return accuracy:
         """
         truth = [item["truth"] for item in truth]
-        total_correct, total_count= 0., 0.
+        total_correct, total_count = 0., 0.
         for x, y in zip(predict, truth):
-            x = torch.Tensor(x)
+            x = torch.tensor(x)
             y = y.to(x)  # make sure they are in the same device
-            mask = x.ge(1).float()
-            # correct = torch.sum(x * mask.float() == (y * mask.long()).float())
+            mask = x.ge(1).long()
             correct = torch.sum(x * mask == y * mask)
             correct -= torch.sum(x.le(0))
             total_correct += float(correct)
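The mask arithmetic is easier to see with concrete numbers: tag index 0 is treated as padding (hence x.ge(1) for the mask and x.le(0) for the correction), so padded positions first compare equal and are then subtracted back out. A small self-contained check under that assumption:

import torch

x = torch.tensor([[2, 3, 1, 0, 0]])   # predicted tag ids, 0 = padding
y = torch.tensor([[2, 4, 1, 0, 0]])   # gold tag ids,      0 = padding
mask = x.ge(1).long()                 # 1 on real positions, 0 on padding

correct = torch.sum(x * mask == y * mask)   # 4 = 2 real matches + 2 padded positions
correct -= torch.sum(x.le(0))               # remove the 2 padded "matches"
print(int(correct))                         # 2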


fastNLP/core/tester.py  (+1 -1)

@@ -74,7 +74,7 @@ class Tester(object):
         output_list = []
         truth_list = []
 
-        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
+        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)
 
         with torch.no_grad():
             for batch_x, batch_y in data_iterator:


fastNLP/core/trainer.py  (+18 -16)

@@ -11,12 +11,14 @@ from fastNLP.core.metrics import Evaluator
 from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.sampler import RandomSampler
 from fastNLP.core.tester import SeqLabelTester, ClassificationTester, SNLITester
+from fastNLP.core.tester import Tester
 from fastNLP.saver.logger import create_logger
 from fastNLP.saver.model_saver import ModelSaver
 
 logger = create_logger(__name__, "./train_test.log")
 logger.disabled = True
 
+
 class Trainer(object):
     """Operations of training a model, including data loading, gradient descent, and validation.
 
@@ -138,23 +140,22 @@ class Trainer(object):
         print("training epochs started " + self.start_time)
         logger.info("training epochs started " + self.start_time)
         epoch, iters = 1, 0
-        while(1):
-            if self.n_epochs != -1 and epoch > self.n_epochs:
-                break
+        while epoch <= self.n_epochs:
             logger.info("training epoch {}".format(epoch))
 
             # prepare mini-batch iterator
             data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
-                                  use_cuda=self.use_cuda, sort_in_batch=True, sort_key='word_seq')
+                                  use_cuda=self.use_cuda)
             logger.info("prepared data iterator")
 
             # one forward and backward pass
-            iters = self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch, step=iters, dev_data=dev_data)
+            iters = self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch,
+                                     step=iters, dev_data=dev_data)
 
             # validation
             if self.validate:
                 self.valid_model()
-            self.save_model(self._model, 'training_model_'+self.start_time)
+            self.save_model(self._model, 'training_model_' + self.start_time)
             epoch += 1
 
@@ -171,13 +172,13 @@ class Trainer(object):
 
             loss = self.get_loss(prediction, batch_y)
             self.grad_backward(loss)
-            # if torch.rand(1).item() < 0.001:
-            #     print('[grads at epoch: {:>3} step: {:>4}]'.format(kwargs['epoch'], step))
-            #     for name, p in self._model.named_parameters():
-            #         if p.requires_grad:
-            #             print('\t{} {} {}'.format(name, tuple(p.size()), torch.sum(p.grad).item()))
             self.update()
             self._summary_writer.add_scalar("loss", loss.item(), global_step=step)
+            for name, param in self._model.named_parameters():
+                if param.requires_grad:
+                    self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=step)
+                    self._summary_writer.add_scalar(name + "_std", param.std(), global_step=step)
+                    self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=step)
 
             if kwargs["n_print"] > 0 and step % kwargs["n_print"] == 0:
                 end = time.time()
@@ -193,14 +194,14 @@ class Trainer(object):
 
     def valid_model(self):
         if self.dev_data is None:
             raise RuntimeError(
                 "self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
         logger.info("validation started")
         res = self.validator.test(self._model, self.dev_data)
         if self.save_best_dev and self.best_eval_result(res):
             logger.info('save best result! {}'.format(res))
             print('save best result! {}'.format(res))
-            self.save_model(self._model, 'best_model_'+self.start_time)
+            self.save_model(self._model, 'best_model_' + self.start_time)
         return res
 
     def mode(self, model, is_test=False):
@@ -230,7 +231,6 @@ class Trainer(object):
     def update(self):
         """Perform weight update on a model.
 
-        For PyTorch, just call optimizer to update.
         """
         self._optimizer.step()
 
@@ -319,15 +319,17 @@ class Trainer(object):
         ModelSaver(os.path.join(self.pickle_path, model_name)).save_pytorch(network)
 
     def _create_validator(self, valid_args):
-        raise NotImplementedError
+        return Tester(**valid_args)
 
     def set_validator(self, validor):
         self.validator = validor
 
+
 class SeqLabelTrainer(Trainer):
     """Trainer for Sequence Labeling
 
     """
+
     def __init__(self, **kwargs):
         print(
             "[FastNLP Warning] SeqLabelTrainer will be deprecated. Please use Trainer directly.")


fastNLP/models/sequence_modeling.py  (+3 -3)

@@ -116,11 +116,11 @@ class AdvSeqLabel(SeqLabeling):
         num_classes = args["num_classes"]
 
         self.Embedding = encoder.embedding.Embedding(vocab_size, word_emb_dim, init_emb=emb)
-        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.3, bidirectional=True)
+        self.Rnn = encoder.lstm.LSTM(word_emb_dim, hidden_dim, num_layers=3, dropout=0.5, bidirectional=True)
         self.Linear1 = encoder.Linear(hidden_dim * 2, hidden_dim * 2 // 3)
         self.batch_norm = torch.nn.BatchNorm1d(hidden_dim * 2 // 3)
         self.relu = torch.nn.ReLU()
-        self.drop = torch.nn.Dropout(0.3)
+        self.drop = torch.nn.Dropout(0.5)
         self.Linear2 = encoder.Linear(hidden_dim * 2 // 3, num_classes)
 
         self.Crf = decoder.CRF.ConditionalRandomField(num_classes)
@@ -135,7 +135,7 @@ class AdvSeqLabel(SeqLabeling):
         """
         word_seq = word_seq.long()
         word_seq_origin_len = word_seq_origin_len.long()
-        truth = truth.long()
+        truth = truth.long() if truth is not None else None
         self.mask = self.make_mask(word_seq, word_seq_origin_len)
 
         batch_size = word_seq.size(0)


fastNLP/modules/decoder/CRF.py  (+2 -2)

@@ -3,6 +3,7 @@ from torch import nn
 
 from fastNLP.modules.utils import initial_parameter
 
+
 def log_sum_exp(x, dim=-1):
     max_value, _ = x.max(dim=dim, keepdim=True)
     res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value
@@ -91,7 +92,6 @@ class ConditionalRandomField(nn.Module):
             st_scores = self.start_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[0]]
             last_idx = mask.long().sum(0) - 1
             ed_scores = self.end_scores.view(1, -1).repeat(batch_size, 1)[batch_idx, tags[last_idx, batch_idx]]
-            print(score.size(), st_scores.size(), ed_scores.size())
             score += st_scores + ed_scores
         # return [B,]
         return score
@@ -128,7 +128,7 @@ class ConditionalRandomField(nn.Module):
         vpath = data.new_zeros((seq_len, batch_size, n_tags), dtype=torch.long)
         vscore = data[0]
         if self.include_start_end_trans:
-            vscore += self.start_scores.view(1. -1)
+            vscore += self.start_scores.view(1, -1)
         for i in range(1, seq_len):
             prev_score = vscore.view(batch_size, n_tags, 1)
             cur_score = data[i].view(batch_size, 1, n_tags)
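The log_sum_exp helper visible in the first hunk uses the usual max-shift for numerical stability; a quick sanity check against torch.logsumexp (the final squeeze is added here only to align shapes, since the hunk shows just the first two lines of the function):

import torch

def log_sum_exp(x, dim=-1):
    # same max-shift trick as in fastNLP/modules/decoder/CRF.py
    max_value, _ = x.max(dim=dim, keepdim=True)
    res = torch.log(torch.sum(torch.exp(x - max_value), dim=dim, keepdim=True)) + max_value
    return res.squeeze(dim)

x = torch.randn(4, 5) * 50   # large magnitudes would overflow a naive exp-then-sum
assert torch.allclose(log_sum_exp(x), torch.logsumexp(x, dim=-1))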


reproduction/chinese_word_segment/cws.cfg  (+2 -2)

@@ -1,6 +1,6 @@
 [train]
-epochs = 30
-batch_size = 64
+epochs = 40
+batch_size = 8
 pickle_path = "./save/"
 validate = true
 save_best_dev = true


reproduction/pos_tag_model/pos_tag.cfg  (+1 -1)

@@ -1,6 +1,6 @@
 [train]
 epochs = 5
-batch_size = 2
+batch_size = 64
 pickle_path = "./save/"
 validate = false
 save_best_dev = true


reproduction/pos_tag_model/train_pos_tag.py  (+18 -4)

@@ -1,3 +1,4 @@
+import copy
 import os
 
 import torch
@@ -6,15 +7,20 @@ from fastNLP.api.pipeline import Pipeline
 from fastNLP.api.processor import VocabProcessor, IndexerProcessor, SeqLenProcessor
 from fastNLP.core.dataset import DataSet
 from fastNLP.core.instance import Instance
+from fastNLP.core.metrics import SeqLabelEvaluator
+from fastNLP.core.optimizer import Optimizer
 from fastNLP.core.trainer import Trainer
 from fastNLP.loader.config_loader import ConfigLoader, ConfigSection
 from fastNLP.loader.dataset_loader import PeopleDailyCorpusLoader
 from fastNLP.models.sequence_modeling import AdvSeqLabel
 
 cfgfile = './pos_tag.cfg'
+# datadir = "/home/zyfeng/data/"
+# data_name = "POS_PD_1998.txt"
 datadir = "/home/zyfeng/fastnlp_0.2.0/test/data_for_tests/"
 data_name = "people_daily_raw.txt"
 
+
 pos_tag_data_path = os.path.join(datadir, data_name)
 pickle_path = "save"
 data_infer_path = os.path.join(datadir, "infer.utf8")
@@ -53,6 +59,9 @@ def train():
     seq_len_proc = SeqLenProcessor("word_seq", "word_seq_origin_len")
     seq_len_proc(dataset)
 
+    dev_set = copy.deepcopy(dataset)
+    dev_set.set_is_target(truth=True)
+
     print("processors defined")
     # dataset.set_is_target(tag_ids=True)
     model_param["vocab_size"] = len(word_vocab_proc.get_vocab())
@@ -63,12 +72,17 @@ def train():
     model = AdvSeqLabel(model_param)
 
     # call trainer to train
-    trainer = Trainer(**train_param.data)
-    trainer.train(model, dataset)
+    trainer = Trainer(epochs=train_param["epochs"],
+                      batch_size=train_param["batch_size"],
+                      validate=True,
+                      optimizer=Optimizer("SGD", lr=0.01, momentum=0.9),
+                      evaluator=SeqLabelEvaluator()
+                      )
+    trainer.train(model, dataset, dev_set)
 
     # save model & pipeline
-    pp = Pipeline([word_vocab_proc, word_indexer, seq_len_proc])
-    save_dict = {"pipeline": pp, "model": model}
+    pp = Pipeline([word_indexer, seq_len_proc])
+    save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_vocab_proc.get_vocab()}
     torch.save(save_dict, "model_pp.pkl")





