
pipeline

tags/v0.4.10
xxliu 5 years ago
parent commit 8dae71ff08
7 changed files with 63 additions and 36 deletions
  1. fastNLP/io/loader/coreference.py (+18 -1)
  2. fastNLP/io/pipe/coreference.py (+30 -15)
  3. reproduction/coreference_resolution/model/model_re.py (+10 -1)
  4. reproduction/coreference_resolution/model/softmax_loss.py (+4 -4)
  5. reproduction/coreference_resolution/test/__init__.py (+0 -0)
  6. reproduction/coreference_resolution/test/test_dataloader.py (+0 -14)
  7. reproduction/coreference_resolution/train.py (+1 -1)

fastNLP/io/loader/coreference.py (+18 -1)

@@ -1,17 +1,34 @@
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
+from ...core.const import Const
 from .json import JsonLoader


 class CRLoader(JsonLoader):
"""
原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息,speakers包含每句话的说话者信息,cluster是指向现实中同一个事物的聚集,sentences是文本信息内容。

Example::

{"doc_key":"bc/cctv/00/cctv_001",
"speakers":"[["Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]]",
"clusters":"[[[2,3],[4,5]],[7,8],[18,20]]]",
"sentences":[["I","have","an","apple"],["It","is","good"]]
}

读取预处理好的Conll2012数据。

"""
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
+        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}

     def _load(self, path):
         """
         Load the data.
-        :param path:
+        :param path: path to the data file, which is JSON
         :return:
         """
         dataset = DataSet()

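For orientation, a minimal sketch of how the loader might be used after this change; the path is hypothetical, and the field names follow the Const mapping discussed below:

    from fastNLP.io.loader.coreference import CRLoader

    # Hypothetical path to a preprocessed CoNLL-2012 JSON-lines file.
    dataset = CRLoader()._load("data/train.english.jsonlines")
    # Fields arrive under the renamed columns set up in __init__, e.g. the
    # raw "clusters" column is now stored under Const.TARGET.
    print(dataset[0])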

fastNLP/io/pipe/coreference.py (+30 -15)

@@ -6,12 +6,16 @@ __all__ = [
 from .pipe import Pipe
 from ..data_bundle import DataBundle
 from ..loader.coreference import CRLoader
+from ...core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
 import numpy as np
 import collections


 class CoreferencePipe(Pipe):
+    """
+    Processes the coreference-resolution data into document genre, speakers,
+    character-level information, and sequence lengths.
+    """

     def __init__(self,config):
         super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):


     def process(self, data_bundle: DataBundle):
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
         vocab.build_vocab()
         word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab":vocab}
         char_dict = get_char_dict(self.config.char_path)

         for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[0],
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[1],
+                     new_field_name=Const.CHAR_INPUT)
+            # seq len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[2],
+                     new_field_name=Const.INPUT_LEN)
+
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)

         return data_bundle

     def process_from_file(self, paths):

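For reference, the Const aliases used above are plain string constants from fastNLP.core.const. Judging by the new forward signature in model_re.py below, they should resolve as in this sketch; treat the concrete strings as an assumption rather than a verified fact about the fastNLP source:

    from fastNLP.core.const import Const

    # Presumed resolution of the aliases used in this pipe:
    print(Const.INPUTS(0))   # "words1"  <- genre (derived from doc_key)
    print(Const.INPUTS(1))   # "words2"  <- speaker_ids_np
    print(Const.INPUTS(2))   # "words3"  <- sentences
    print(Const.INPUTS(3))   # "words4"  <- doc_np
    print(Const.CHAR_INPUT)  # "chars"   <- char_index
    print(Const.INPUT_LEN)   # "seq_len" <- sequence lengths
    print(Const.TARGET)      # "target"  <- clusters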

reproduction/coreference_resolution/model/model_re.py (+10 -1)

@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from reproduction.coreference_resolution.model import preprocess
 from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
 import random


 # set the seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
         return predicted_clusters


-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1, words2, words3, words4, chars, seq_len):
         """
         All of the actual inputs are tensors.
         :param sentences: the sentences, converted to numpy by fastNLP,
@@ -426,6 +427,14 @@ class Model(BaseModel):
         :param seq_len: converted to a Tensor by fastNLP
         :return:
         """
+
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
         # change for fastNLP
         sentences = sentences[0].tolist()
         doc_tensor = doc_np[0]

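A note on why the signature changed: fastNLP's Trainer feeds each batch to forward by matching input field names against forward's parameter names, so the model must accept the renamed fields and map them back to their meaningful names internally. Roughly, each training step amounts to something like this sketch (an assumption about Trainer internals, consistent with the renames in this commit):

    # What the Trainer effectively does per batch (a sketch, not fastNLP internals):
    def feed_batch(model, batch):
        # batch is a dict keyed by input field names, e.g.
        # {"words1": ..., "words2": ..., "words3": ..., "words4": ..., "chars": ..., "seq_len": ...}
        return model(**batch)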

reproduction/coreference_resolution/model/softmax_loss.py (+4 -4)

@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
     Allows multi-label classification.
     """

-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
         """

         :param pred:
         :param target:
         """
         super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                              mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)

-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                        Config().max_antecedents)

         antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))

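The renames here mirror the pipe: fastNLP's LossBase matches loss arguments to fields by name via _init_param_map, so get_loss must ask for target (the field set with ds.set_target(Const.TARGET)) rather than the old clusters name. A sketch of the wiring under that assumption:

    from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss

    # With the pipe above, the target field is literally named "target",
    # so the default name-based mapping needs no arguments:
    loss = SoftmaxLoss()
    # Equivalent explicit form; keyword values name the fields to read:
    loss = SoftmaxLoss(antecedent_scores="antecedent_scores", target="target")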

reproduction/coreference_resolution/test/__init__.py (+0 -0, new empty file)


reproduction/coreference_resolution/test/test_dataloader.py (+0 -14, deleted)

@@ -1,14 +0,0 @@
-
-
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])

reproduction/coreference_resolution/train.py (+1 -1)

@@ -45,7 +45,7 @@ if __name__ == "__main__":
print("数据集划分:\ntrain:", str(len(data_info.datasets["train"])), print("数据集划分:\ntrain:", str(len(data_info.datasets["train"])),
"\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"]))) "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
# print(data_info) # print(data_info)
model = Model(data_info.vocabs, config)
model = Model(data_info.vocabs['vocab'], config)
print(model) print(model)


loss = SoftmaxLoss() loss = SoftmaxLoss()

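This one-line change follows from the pipe now storing its vocabulary in a dict, data_bundle.vocabs = {"vocab":vocab}, so the caller must index into it. A sketch of the relationship, mirroring the deleted test above (config paths are whatever Config defines):

    from fastNLP.io.pipe.coreference import CoreferencePipe
    from reproduction.coreference_resolution.model.config import Config
    from reproduction.coreference_resolution.model.model_re import Model

    config = Config()
    bundle = CoreferencePipe(config).process_from_file(
        {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})
    # The pipe keeps its Vocabulary under the "vocab" key, hence the indexing:
    model = Model(bundle.vocabs['vocab'], config)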
