
pipeline

tags/v0.4.10
xxliu, 5 years ago
parent commit 8dae71ff08
7 changed files with 63 additions and 36 deletions
  1. fastNLP/io/loader/coreference.py (+18 / -1)
  2. fastNLP/io/pipe/coreference.py (+30 / -15)
  3. reproduction/coreference_resolution/model/model_re.py (+10 / -1)
  4. reproduction/coreference_resolution/model/softmax_loss.py (+4 / -4)
  5. reproduction/coreference_resolution/test/__init__.py (+0 / -0)
  6. reproduction/coreference_resolution/test/test_dataloader.py (+0 / -14)
  7. reproduction/coreference_resolution/train.py (+1 / -1)

fastNLP/io/loader/coreference.py (+18 / -1)

@@ -1,17 +1,34 @@
from ...core.dataset import DataSet
from ..file_reader import _read_json
from ...core.instance import Instance
from ...core.const import Const
from .json import JsonLoader


class CRLoader(JsonLoader):
+    """
+    In the raw data, each line should be one JSON object: doc_key carries the document's genre information,
+    speakers gives the speaker of every sentence, clusters groups the mentions that refer to the same
+    real-world entity, and sentences holds the text itself.
+
+    Example::
+
+        {"doc_key": "bc/cctv/00/cctv_001",
+         "speakers": [["Speaker1","Speaker1","Speaker1"], ["Speaker1","Speaker1","Speaker1"]],
+         "clusters": [[[2,3],[4,5]], [[7,8],[18,20]]],
+         "sentences": [["I","have","an","apple"], ["It","is","good"]]
+        }
+
+    Reads preprocessed CoNLL-2012 data.
+    """
    def __init__(self, fields=None, dropna=False):
        super().__init__(fields, dropna)
        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}

    def _load(self, path):
        """
+        Load the data.
-        :param path:
+        :param path: path to the data file; the file contains one JSON object per line
+
        :return:
        """
        dataset = DataSet()
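
Note: the _load body is truncated by this hunk. Purely as a hedged sketch (a hypothetical helper, not the method's actual code, which presumably goes through the _read_json helper imported above), reading such a JSON-lines file into a DataSet while renaming the raw keys via self.fields could look like:

    # Hypothetical sketch only -- not the truncated _load body above.
    # One JSON object per line; raw keys are renamed according to a fields mapping.
    import json
    from fastNLP.core.dataset import DataSet
    from fastNLP.core.instance import Instance

    def load_jsonlines(path, fields):
        dataset = DataSet()
        with open(path, encoding="utf-8") as f:
            for line in f:
                doc = json.loads(line)
                dataset.append(Instance(**{new: doc[raw] for raw, new in fields.items()}))
        return dataset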


fastNLP/io/pipe/coreference.py (+30 / -15)

@@ -6,12 +6,16 @@ __all__ = [
from .pipe import Pipe
from ..data_bundle import DataBundle
from ..loader.coreference import CRLoader
+from ...core.const import Const
from fastNLP.core.vocabulary import Vocabulary
import numpy as np
import collections


class CoreferencePipe(Pipe):
+    """
+    Processes the coreference resolution data and produces the genre, speaker, character-level, and sequence-length fields.
+    """

    def __init__(self,config):
        super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):

    def process(self, data_bundle: DataBundle):
        genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
        vocab.build_vocab()
        word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab":vocab}
        char_dict = get_char_dict(self.config.char_path)

        for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
-                                         self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[0],
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[1],
+                     new_field_name=Const.CHAR_INPUT)
+            # seq_len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
+                                         self.config.max_sentences, is_train=name == 'train')[2],
+                     new_field_name=Const.INPUT_LEN)
+
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)

        return data_bundle

    def process_from_file(self, paths):
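
For reference, a quick sketch of the plain string names these Const constants are assumed to resolve to (based on fastNLP's core.const module around this version; worth verifying against fastNLP/core/const.py). This is also why the model's forward signature below switches to words1 ... words4:

    from fastNLP.core.const import Const

    # Assumed mapping (verify against fastNLP/core/const.py):
    print(Const.INPUTS(0))   # 'words1'  <- genre
    print(Const.INPUTS(1))   # 'words2'  <- speaker ids
    print(Const.INPUTS(2))   # 'words3'  <- sentences
    print(Const.INPUTS(3))   # 'words4'  <- doc_np
    print(Const.CHAR_INPUT)  # 'chars'   <- char_index
    print(Const.INPUT_LEN)   # 'seq_len'
    print(Const.TARGET)      # 'target'  <- clusters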


reproduction/coreference_resolution/model/model_re.py (+10 / -1)

@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from reproduction.coreference_resolution.model import preprocess
from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
import random

# set the random seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
        return predicted_clusters


-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1, words2, words3, words4, chars, seq_len):
        """
        All of the actual inputs are tensors.
        :param sentences: the sentences, converted to numpy by fastNLP,
@@ -426,6 +427,14 @@
        :param seq_len: converted to a Tensor by fastNLP
        :return:
        """
+
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
+
        # change for fastNLP
        sentences = sentences[0].tolist()
        doc_tensor = doc_np[0]
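
The aliasing block above exists because fastNLP hands the set_input fields to forward() as keyword arguments matched on parameter names. A tiny plain-Python illustration of that convention (not code from this repository):

    # Field names (from the pipe) must equal the forward() parameter names.
    batch = {"words1": 0, "words2": 1, "words3": 2, "words4": 3, "chars": 4, "seq_len": 5}

    def forward(words1, words2, words3, words4, chars, seq_len):
        sentences, doc_np = words3, words4  # same aliasing idea as in Model.forward
        return sentences, doc_np

    print(forward(**batch))  # (2, 3)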


reproduction/coreference_resolution/model/softmax_loss.py (+4 / -4)

@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
    Allows multi-label classification.
    """

-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
        """

        :param pred:
        :param target:
        """
        super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                             mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)

-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                       Config().max_antecedents)

        antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
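
Because the pipe now marks the clusters field as Const.TARGET, the loss can be built with its defaults and fastNLP's _init_param_map will feed the dataset's 'target' field into get_loss. A usage sketch under that assumption (mirroring what train.py already does):

    from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss

    loss = SoftmaxLoss()                      # target=None: picks up the field named 'target'
    # loss = SoftmaxLoss(target="clusters")   # explicit form, if the field kept its old name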


reproduction/coreference_resolution/test/__init__.py (+0 / -0)


reproduction/coreference_resolution/test/test_dataloader.py (+0 / -14)

@@ -1,14 +0,0 @@
-
-
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])

reproduction/coreference_resolution/train.py (+1 / -1)

@@ -45,7 +45,7 @@ if __name__ == "__main__":
print("数据集划分:\ntrain:", str(len(data_info.datasets["train"])),
"\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
# print(data_info)
model = Model(data_info.vocabs, config)
model = Model(data_info.vocabs['vocab'], config)
print(model)

loss = SoftmaxLoss()
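
The rest of train.py is not part of this diff; as a hedged sketch only (argument names assumed rather than quoted from the file), the pieces above are typically wired into fastNLP's Trainer like this:

    from fastNLP import Trainer

    # Minimal wiring; the real script passes more options (optimizer, metrics, device, ...).
    trainer = Trainer(train_data=data_info.datasets["train"], model=model, loss=loss)
    trainer.train()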

