From 8dae71ff08476c573e3df26c00e188a7745ace78 Mon Sep 17 00:00:00 2001
From: xxliu
Date: Tue, 3 Sep 2019 14:19:29 +0800
Subject: [PATCH] pipeline

---
 fastNLP/io/loader/coreference.py              | 19 +++++++-
 fastNLP/io/pipe/coreference.py                | 45 ++++++++++++-------
 .../coreference_resolution/model/model_re.py  | 11 ++++-
 .../model/softmax_loss.py                     |  8 ++--
 .../coreference_resolution/test/__init__.py   |  0
 .../test/test_dataloader.py                   | 14 ------
 reproduction/coreference_resolution/train.py  |  2 +-
 7 files changed, 63 insertions(+), 36 deletions(-)
 delete mode 100644 reproduction/coreference_resolution/test/__init__.py
 delete mode 100644 reproduction/coreference_resolution/test/test_dataloader.py

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index c8d9bbf5..2e4d72de 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -1,17 +1,34 @@
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
+from ...core.const import Const
 from .json import JsonLoader
 
 
 class CRLoader(JsonLoader):
+    """
+    Reads preprocessed CoNLL-2012 data. Each line of the raw file is one JSON
+    object: ``doc_key`` carries the document's genre information, ``speakers``
+    lists the speaker of every token in each sentence, ``clusters`` groups the
+    mention spans that refer to the same real-world entity, and ``sentences``
+    holds the tokenized text.
+
+    Example::
+
+        {"doc_key": "bc/cctv/00/cctv_001",
+         "speakers": [["Speaker1", "Speaker1", "Speaker1"], ["Speaker1", "Speaker1", "Speaker1"]],
+         "clusters": [[[2, 3], [4, 5]], [[7, 8], [18, 20]]],
+         "sentences": [["I", "have", "an", "apple"], ["It", "is", "good"]]}
+    """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
+        self.fields = {"doc_key": Const.INPUTS(0), "speakers": Const.INPUTS(1), "clusters": Const.TARGET, "sentences": Const.INPUTS(2)}
 
     def _load(self, path):
         """
         Load the data.
-        :param path:
+        :param path: path to the data file, in JSON format
+        :return:
         """
         dataset = DataSet()
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index bdf6a132..711e5919 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -6,12 +6,16 @@ __all__ = [
 from .pipe import Pipe
 from ..data_bundle import DataBundle
 from ..loader.coreference import CRLoader
+from ...core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
 import numpy as np
 import collections
 
 
 class CoreferencePipe(Pipe):
+    """
+    Processes coreference-resolution data into genre, speaker,
+    character-level, and sequence-length fields.
+    """
 
     def __init__(self,config):
         super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):
 
     def process(self, data_bundle: DataBundle):
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
         vocab.build_vocab()
         word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab": vocab}
         char_dict = get_char_dict(self.config.char_path)
+
         for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.CHAR_INPUT)
+            # seq len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                          self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+                     new_field_name=Const.INPUT_LEN)
+
+
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)
+
+        return data_bundle
 
     def process_from_file(self, paths):
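The field renaming above is the heart of this patch. For orientation, here is a
minimal sketch (not part of the diff) of how the old pipe field names line up
with the new Const-based ones; the forward-argument comments are taken from the
forward() remapping in model_re.py further down.

Example::

    from fastNLP.core.const import Const

    # old pipe field -> new Const-based field (argument seen by Model.forward)
    field_map = {
        "doc_key":    Const.INPUTS(0),   # genre id, arrives as words1
        "speakers":   Const.INPUTS(1),   # speaker_ids_np, arrives as words2
        "sentences":  Const.INPUTS(2),   # raw sentences, arrive as words3
        "doc_np":     Const.INPUTS(3),   # padded word-id matrix, arrives as words4
        "char_index": Const.CHAR_INPUT,  # char indices, arrive as chars
        "seq_len":    Const.INPUT_LEN,   # sentence lengths, arrive as seq_len
        "clusters":   Const.TARGET,      # gold clusters, consumed by the loss
    }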
diff --git a/reproduction/coreference_resolution/model/model_re.py b/reproduction/coreference_resolution/model/model_re.py
index 9dd90ec4..eaa2941b 100644
--- a/reproduction/coreference_resolution/model/model_re.py
+++ b/reproduction/coreference_resolution/model/model_re.py
@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from reproduction.coreference_resolution.model import preprocess
 from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
 import random
 
 # set the seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
 
         return predicted_clusters
 
-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1, words2, words3, words4, chars, seq_len):
         """
         All actual inputs are tensors prepared by fastNLP.
         :param sentences: the sentences, converted to numpy by fastNLP
         :param doc_np: converted to a Tensor by fastNLP
         :param speaker_ids_np: converted to a Tensor by fastNLP
         :param genre: converted to a Tensor by fastNLP
         :param char_index: converted to a Tensor by fastNLP
         :param seq_len: converted to a Tensor by fastNLP
         :return:
         """
+
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
         # change for fastNLP
         sentences = sentences[0].tolist()
         doc_tensor = doc_np[0]
diff --git a/reproduction/coreference_resolution/model/softmax_loss.py b/reproduction/coreference_resolution/model/softmax_loss.py
index c75a31d6..1c1fcc69 100644
--- a/reproduction/coreference_resolution/model/softmax_loss.py
+++ b/reproduction/coreference_resolution/model/softmax_loss.py
@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
     Allows multi-label classification.
     """
 
-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
         """
 
         :param pred:
         :param target:
         """
         super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                              mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)
 
-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                        Config().max_antecedents)
 
         antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
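With clusters renamed to target, SoftmaxLoss's parameter map now matches the
field set by ds.set_target(Const.TARGET), so the bare SoftmaxLoss() constructed
in train.py resolves its inputs by name. A minimal sketch of the equivalent
explicit mapping, assuming fastNLP's usual keyword-to-field convention for
_init_param_map:

Example::

    from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss

    # Bare construction: each parameter name doubles as the field/output name.
    loss = SoftmaxLoss()

    # Equivalent explicit mapping, spelling the names out.
    loss = SoftmaxLoss(antecedent_scores="antecedent_scores",
                       target="target",
                       mention_start_tensor="mention_start_tensor",
                       mention_end_tensor="mention_end_tensor")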
diff --git a/reproduction/coreference_resolution/test/__init__.py b/reproduction/coreference_resolution/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/reproduction/coreference_resolution/test/test_dataloader.py b/reproduction/coreference_resolution/test/test_dataloader.py
deleted file mode 100644
index 6a3be520..00000000
--- a/reproduction/coreference_resolution/test/test_dataloader.py
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index 6c26cf4c..790c7659 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     print("Dataset split:\ntrain:", str(len(data_info.datasets["train"])), "\ndev:" +
           str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
     # print(data_info)
-    model = Model(data_info.vocabs, config)
+    model = Model(data_info.vocabs['vocab'], config)
     print(model)
 
     loss = SoftmaxLoss()
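Since test/test_dataloader.py is deleted above, the same smoke check can still
be run by hand when debugging the pipe. A sketch equivalent to the removed
test, assuming a Config whose train/dev/test paths point at local preprocessed
data:

Example::

    from fastNLP.io.pipe.coreference import CoreferencePipe
    from reproduction.coreference_resolution.model.config import Config

    config = Config()
    bundle = CoreferencePipe(config).process_from_file(
        {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})
    # One processed instance per split, now carrying the Const field names.
    print(bundle.datasets['train'][0])
    print(bundle.datasets['dev'][0])
    print(bundle.datasets['test'][0])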