@@ -1,17 +1,34 @@
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
+from ...core.const import Const
 from .json import JsonLoader


 class CRLoader(JsonLoader):
+    """
+    Each line of the raw data should be a single JSON object, where doc_key carries the genre of
+    the document, speakers gives the speaker of every token in each sentence, clusters groups the
+    mentions that refer to the same real-world entity, and sentences holds the tokenized text.
+
+    Example::
+
+        {"doc_key": "bc/cctv/00/cctv_001",
+         "speakers": [["Speaker1", "Speaker1", "Speaker1"], ["Speaker1", "Speaker1", "Speaker1"]],
+         "clusters": [[[2, 3], [4, 5]], [[7, 8], [18, 20]]],
+         "sentences": [["I", "have", "an", "apple"], ["It", "is", "good"]]
+        }
+
+    Reads preprocessed CoNLL-2012 data.
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
+        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}

     def _load(self, path):
         """
         Load the data.

-        :param path:
+        :param path: path to the data file, one JSON object per line
         :return:
         """
         dataset = DataSet()
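For context, a minimal sketch of how one raw record maps onto the new `Const`-based field names declared in `CRLoader.__init__`. The sample record comes from the docstring above; the dict comprehension is an illustration of the renaming, not the actual `_load` body, and it assumes fastNLP's default `Const` values (`Const.INPUTS(i)` yields `"words{i+1}"`, `Const.TARGET` is `"target"`):

```python
# Illustration only: raw JSON keys -> Const-based Instance field names.
raw = {
    "doc_key": "bc/cctv/00/cctv_001",
    "speakers": [["Speaker1", "Speaker1", "Speaker1"], ["Speaker1", "Speaker1", "Speaker1"]],
    "clusters": [[[2, 3], [4, 5]], [[7, 8], [18, 20]]],
    "sentences": [["I", "have", "an", "apple"], ["It", "is", "good"]],
}

fields = {"doc_key": "words1", "speakers": "words2", "clusters": "target", "sentences": "words3"}
instance = {new_name: raw[old_name] for old_name, new_name in fields.items()}
# -> {"words1": "bc/cctv/00/cctv_001", "words2": [...], "target": [...], "words3": [...]}
```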
@@ -6,12 +6,16 @@ __all__ = [
 from .pipe import Pipe
 from ..data_bundle import DataBundle
 from ..loader.coreference import CRLoader
+from ...core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
 import numpy as np
 import collections


 class CoreferencePipe(Pipe):
+    """
+    Processes data for the coreference resolution task, producing the document genre, speaker
+    information, character-level indices and sequence lengths.
+    """
     def __init__(self,config):
         super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):

     def process(self, data_bundle: DataBundle):
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
         vocab.build_vocab()
         word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab":vocab}
         char_dict = get_char_dict(self.config.char_path)

         for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.CHAR_INPUT)
+            # seq len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+                     new_field_name=Const.INPUT_LEN)
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)
         return data_bundle

     def process_from_file(self, paths):
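A brief usage sketch of the pipe after this change, mirroring the test that is removed further down (`Config` and its path attributes are taken from that test, not newly invented):

```python
from fastNLP.io.pipe.coreference import CoreferencePipe
from reproduction.coreference_resolution.model.config import Config

config = Config()  # assumed to carry char_path, filter, max_sentences and the data paths
bundle = CoreferencePipe(config).process_from_file(
    {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})

# Each DataSet now exposes the Const-based inputs (words1..words4, chars, seq_len)
# and keeps the gold clusters under the 'target' field.
print(bundle.datasets['train'][0])
```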
@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from reproduction.coreference_resolution.model import preprocess
 from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
 import random

 # set the random seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
         return predicted_clusters

-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1, words2, words3, words4, chars, seq_len):
         """
         The actual inputs are all tensors.
         :param sentences: the sentences, converted to numpy by fastNLP,
@@ -426,6 +427,14 @@ class Model(BaseModel):
         :param seq_len: converted to a Tensor by fastNLP
         :return:
         """
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
         # change for fastNLP
         sentences = sentences[0].tolist()
         doc_tensor = doc_np[0]
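fastNLP binds DataSet input fields to `forward` arguments by name, which is why the semantic parameter names give way to the `Const`-based ones and are mapped back at the top of the method. The correspondence implied by the pipe above, collected in a plain dict purely for reference (not part of the patched code):

```python
# Const-based field name (as set by CoreferencePipe) -> semantic name used inside forward()
FIELD_NAME_MAP = {
    "words1": "genre",            # Const.INPUTS(0): genre id derived from doc_key
    "words2": "speaker_ids_np",   # Const.INPUTS(1): per-sentence speaker ids
    "words3": "sentences",        # Const.INPUTS(2): raw token lists
    "words4": "doc_np",           # Const.INPUTS(3): padded word-id matrix
    "chars": "char_index",        # Const.CHAR_INPUT: character indices
    "seq_len": "seq_len",         # Const.INPUT_LEN: sentence lengths
    "target": "clusters",         # Const.TARGET: gold clusters, consumed by the loss only
}
```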
@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
     Allows multi-label classification.
     """

-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
         """
         :param pred:
         :param target:
         """
         super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                              mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)

-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                        Config().max_antecedents)

         antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
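For reference, `_init_param_map` is what lets fastNLP route batch fields to `get_loss` by name; with the rename, the default construction expects the dataset's `target` field set by the pipe. A hedged sketch, assuming the module path of the file above and fastNLP's usual keyword-to-field mapping behaviour:

```python
# Assumed module path for the loss defined above; adjust to the actual file name.
from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss

# Default construction: 'antecedent_scores', 'mention_start_tensor' and 'mention_end_tensor'
# come from the model's forward output, and 'target' from the field set by CoreferencePipe.
loss = SoftmaxLoss()

# If the gold clusters lived under a different field name, the mapping could be overridden,
# e.g. SoftmaxLoss(target='clusters')  # hypothetical; the keyword value names the actual field
```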
@@ -1,14 +0,0 @@
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     print("Dataset split:\ntrain:", str(len(data_info.datasets["train"])),
           "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
     # print(data_info)
-    model = Model(data_info.vocabs, config)
+    model = Model(data_info.vocabs['vocab'], config)
     print(model)

     loss = SoftmaxLoss()