From 8dae71ff08476c573e3df26c00e188a7745ace78 Mon Sep 17 00:00:00 2001
From: xxliu
Date: Tue, 3 Sep 2019 14:19:29 +0800
Subject: [PATCH] pipeline

---
 fastNLP/io/loader/coreference.py              | 19 +++++++-
 fastNLP/io/pipe/coreference.py                | 45 ++++++++++++-------
 .../coreference_resolution/model/model_re.py  | 11 ++++-
 .../model/softmax_loss.py                     |  8 ++--
 .../coreference_resolution/test/__init__.py   |  0
 .../test/test_dataloader.py                   | 14 ------
 reproduction/coreference_resolution/train.py  |  2 +-
 7 files changed, 63 insertions(+), 36 deletions(-)
 delete mode 100644 reproduction/coreference_resolution/test/__init__.py
 delete mode 100644 reproduction/coreference_resolution/test/test_dataloader.py

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index c8d9bbf5..2e4d72de 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -1,17 +1,34 @@
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
+from ...core.const import Const
 from .json import JsonLoader
 
 
 class CRLoader(JsonLoader):
+    """
+    Reads preprocessed CoNLL-2012 data. Each line of the raw file is one JSON
+    object: ``doc_key`` carries the document's genre information, ``speakers``
+    lists the speaker of every token in each sentence, ``clusters`` groups the
+    mention spans that refer to the same real-world entity, and ``sentences``
+    holds the tokenized text.
+
+    Example::
+
+        {"doc_key": "bc/cctv/00/cctv_001",
+         "speakers": [["Speaker1", "Speaker1", "Speaker1"], ["Speaker1", "Speaker1", "Speaker1"]],
+         "clusters": [[[2, 3], [4, 5]], [[7, 8], [18, 20]]],
+         "sentences": [["I", "have", "an", "apple"], ["It", "is", "good"]]}
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
+        self.fields = {"doc_key": Const.INPUTS(0), "speakers": Const.INPUTS(1), "clusters": Const.TARGET, "sentences": Const.INPUTS(2)}
 
     def _load(self, path):
         """
         Load the data.
-        :param path:
+        :param path: path to the data file, in JSON format
+        :return:
         """
         dataset = DataSet()
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index bdf6a132..711e5919 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -6,12 +6,16 @@ __all__ = [
 from .pipe import Pipe
 from ..data_bundle import DataBundle
 from ..loader.coreference import CRLoader
+from ...core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
 import numpy as np
 import collections
 
 
 class CoreferencePipe(Pipe):
+    """
+    Processes coreference-resolution data into genre, speaker,
+    character-level, and sequence-length fields.
+    """
 
     def __init__(self,config):
         super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):
 
     def process(self, data_bundle: DataBundle):
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
         vocab.build_vocab()
         word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab": vocab}
         char_dict = get_char_dict(self.config.char_path)
+
         for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.CHAR_INPUT)
+            # seq len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+                     new_field_name=Const.INPUT_LEN)
+
+
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)
+
+        return data_bundle
 
     def process_from_file(self, paths):
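The field renaming above is the heart of this patch. For orientation, here is a
minimal sketch (not part of the diff) of how the old pipe field names line up
with the new Const-based ones; the forward-argument comments are taken from the
forward() remapping in model_re.py further down.

Example::

    from fastNLP.core.const import Const

    # old pipe field -> new Const-based field (argument seen by Model.forward)
    field_map = {
        "doc_key":    Const.INPUTS(0),   # genre id, arrives as words1
        "speakers":   Const.INPUTS(1),   # speaker_ids_np, arrives as words2
        "sentences":  Const.INPUTS(2),   # raw sentences, arrive as words3
        "doc_np":     Const.INPUTS(3),   # padded word-id matrix, arrives as words4
        "char_index": Const.CHAR_INPUT,  # char indices, arrive as chars
        "seq_len":    Const.INPUT_LEN,   # sentence lengths, arrive as seq_len
        "clusters":   Const.TARGET,      # gold clusters, consumed by the loss
    }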
diff --git a/reproduction/coreference_resolution/model/model_re.py b/reproduction/coreference_resolution/model/model_re.py
index 9dd90ec4..eaa2941b 100644
--- a/reproduction/coreference_resolution/model/model_re.py
+++ b/reproduction/coreference_resolution/model/model_re.py
@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from reproduction.coreference_resolution.model import preprocess
 from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
 import random
 
 # set the seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
 
         return predicted_clusters
 
-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1, words2, words3, words4, chars, seq_len):
         """
         All actual inputs are tensors prepared by fastNLP.
         :param sentences: the sentences, converted to numpy by fastNLP
         :param doc_np: converted to a Tensor by fastNLP
         :param speaker_ids_np: converted to a Tensor by fastNLP
         :param genre: converted to a Tensor by fastNLP
         :param char_index: converted to a Tensor by fastNLP
         :param seq_len: converted to a Tensor by fastNLP
         :return:
         """
+
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
         # change for fastNLP
         sentences = sentences[0].tolist()
         doc_tensor = doc_np[0]
diff --git a/reproduction/coreference_resolution/model/softmax_loss.py b/reproduction/coreference_resolution/model/softmax_loss.py
index c75a31d6..1c1fcc69 100644
--- a/reproduction/coreference_resolution/model/softmax_loss.py
+++ b/reproduction/coreference_resolution/model/softmax_loss.py
@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
     Allows multi-label classification.
     """
 
-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
         """
 
         :param pred:
         :param target:
         """
         super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                              mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)
 
-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                        Config().max_antecedents)
 
         antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
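With clusters renamed to target, SoftmaxLoss's parameter map now matches the
field set by ds.set_target(Const.TARGET), so the bare SoftmaxLoss() constructed
in train.py resolves its inputs by name. A minimal sketch of the equivalent
explicit mapping, assuming fastNLP's usual keyword-to-field convention for
_init_param_map:

Example::

    from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss

    # Bare construction: each parameter name doubles as the field/output name.
    loss = SoftmaxLoss()

    # Equivalent explicit mapping, spelling the names out.
    loss = SoftmaxLoss(antecedent_scores="antecedent_scores",
                       target="target",
                       mention_start_tensor="mention_start_tensor",
                       mention_end_tensor="mention_end_tensor")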
diff --git a/reproduction/coreference_resolution/test/__init__.py b/reproduction/coreference_resolution/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/reproduction/coreference_resolution/test/test_dataloader.py b/reproduction/coreference_resolution/test/test_dataloader.py
deleted file mode 100644
index 6a3be520..00000000
--- a/reproduction/coreference_resolution/test/test_dataloader.py
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index 6c26cf4c..790c7659 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     print("Dataset split:\ntrain:", str(len(data_info.datasets["train"])), "\ndev:" +
           str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
     # print(data_info)
-    model = Model(data_info.vocabs, config)
+    model = Model(data_info.vocabs['vocab'], config)
     print(model)
 
     loss = SoftmaxLoss()
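Since test/test_dataloader.py is deleted above, the same smoke check can still
be run by hand when debugging the pipe. A sketch equivalent to the removed
test, assuming a Config whose train/dev/test paths point at local preprocessed
data:

Example::

    from fastNLP.io.pipe.coreference import CoreferencePipe
    from reproduction.coreference_resolution.model.config import Config

    config = Config()
    bundle = CoreferencePipe(config).process_from_file(
        {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})
    # One processed instance per split, now carrying the Const field names.
    print(bundle.datasets['train'][0])
    print(bundle.datasets['dev'][0])
    print(bundle.datasets['test'][0])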