From b4e542095d34e3831a7f98b3d4e9e0a41e6e3f77 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Mon, 26 Aug 2019 19:21:35 +0800
Subject: [PATCH 1/7] pipe

---
 fastNLP/io/loader/__init__.py                 |   5 +-
 fastNLP/io/loader/coreference.py              |  24 ++++
 fastNLP/io/pipe/__init__.py                   |   3 +
 fastNLP/io/pipe/coreference.py                | 115 ++++++++++++++++++
 reproduction/coreference_resolution/README.md |   2 +-
 .../data_load/__init__.py                     |   0
 .../data_load/cr_loader.py                    |  68 -----------
 .../test/test_dataloader.py                   |  20 +--
 reproduction/coreference_resolution/train.py  |  10 +-
 reproduction/coreference_resolution/valid.py  |  10 +-
 10 files changed, 166 insertions(+), 91 deletions(-)
 create mode 100644 fastNLP/io/loader/coreference.py
 create mode 100644 fastNLP/io/pipe/coreference.py
 delete mode 100644 reproduction/coreference_resolution/data_load/__init__.py
 delete mode 100644 reproduction/coreference_resolution/data_load/cr_loader.py

diff --git a/fastNLP/io/loader/__init__.py b/fastNLP/io/loader/__init__.py
index 6c23f213..aae3171a 100644
--- a/fastNLP/io/loader/__init__.py
+++ b/fastNLP/io/loader/__init__.py
@@ -71,7 +71,9 @@ __all__ = [
     "QuoraLoader",
     "SNLILoader",
     "QNLILoader",
-    "RTELoader"
+    "RTELoader",
+
+    "CRLoader"
 ]
 from .classification import YelpLoader, YelpFullLoader, YelpPolarityLoader, IMDBLoader, SSTLoader, SST2Loader
 from .conll import ConllLoader, Conll2003Loader, Conll2003NERLoader, OntoNotesNERLoader, CTBLoader
@@ -81,3 +83,4 @@ from .json import JsonLoader
 from .loader import Loader
 from .matching import MNLILoader, QuoraLoader, SNLILoader, QNLILoader, RTELoader
 from .conll import MsraNERLoader, PeopleDailyNERLoader, WeiboNERLoader
+from .coreference import CRLoader
\ No newline at end of file
diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
new file mode 100644
index 00000000..c8d9bbf5
--- /dev/null
+++ b/fastNLP/io/loader/coreference.py
@@ -0,0 +1,24 @@
+from ...core.dataset import DataSet
+from ..file_reader import _read_json
+from ...core.instance import Instance
+from .json import JsonLoader
+
+
+class CRLoader(JsonLoader):
+    def __init__(self, fields=None, dropna=False):
+        super().__init__(fields, dropna)
+
+    def _load(self, path):
+        """
+        加载数据
+        :param path:
+        :return:
+        """
+        dataset = DataSet()
+        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
+            if self.fields:
+                ins = {self.fields[k]: v for k, v in d.items()}
+            else:
+                ins = d
+            dataset.append(Instance(**ins))
+        return dataset
\ No newline at end of file
diff --git a/fastNLP/io/pipe/__init__.py b/fastNLP/io/pipe/__init__.py
index 048e4cfe..d99b68c4 100644
--- a/fastNLP/io/pipe/__init__.py
+++ b/fastNLP/io/pipe/__init__.py
@@ -37,6 +37,8 @@ __all__ = [
     "QuoraPipe",
     "QNLIPipe",
     "MNLIPipe",
+
+    "CoreferencePipe"
 ]
 
 from .classification import YelpFullPipe, YelpPolarityPipe, SSTPipe, SST2Pipe, IMDBPipe
@@ -46,3 +48,4 @@ from .matching import MatchingBertPipe, RTEBertPipe, SNLIBertPipe, QuoraBertPipe
 from .pipe import Pipe
 from .conll import Conll2003Pipe
 from .cws import CWSPipe
+from .coreference import CoreferencePipe
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
new file mode 100644
index 00000000..bdf6a132
--- /dev/null
+++ b/fastNLP/io/pipe/coreference.py
@@ -0,0 +1,115 @@
+__all__ = [
+    "CoreferencePipe"
+
+]
+
+from .pipe import Pipe
+from ..data_bundle import DataBundle
+from ..loader.coreference import CRLoader
+from fastNLP.core.vocabulary import Vocabulary
+import numpy as np
+import collections
+
+
+class CoreferencePipe(Pipe):
+
+    def __init__(self,config):
+        super().__init__()
+        self.config = config
+
+    def process(self, data_bundle: DataBundle):
+        genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab.build_vocab()
+        word2id = vocab.word2idx
+        char_dict = get_char_dict(self.config.char_path)
+        for name, ds in data_bundle.datasets.items():
+            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                                                    self.config.max_sentences, is_train=name == 'train')[0],
+                     new_field_name='doc_np')
+            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                                                    self.config.max_sentences, is_train=name == 'train')[1],
+                     new_field_name='char_index')
+            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                                                    self.config.max_sentences, is_train=name == 'train')[2],
+                     new_field_name='seq_len')
+            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name='speaker_ids_np')
+            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
+
+            ds.set_ignore_type('clusters')
+            ds.set_padder('clusters', None)
+            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
+            ds.set_target("clusters")
+        return data_bundle
+
+    def process_from_file(self, paths):
+        bundle = CRLoader().load(paths)
+        return self.process(bundle)
+
+
+# helper
+
+def doc2numpy(doc, word2id, chardict, max_filter, max_sentences, is_train):
+    docvec, char_index, length, max_len = _doc2vec(doc, word2id, chardict, max_filter, max_sentences, is_train)
+    assert max(length) == max_len
+    assert char_index.shape[0] == len(length)
+    assert char_index.shape[1] == max_len
+    doc_np = np.zeros((len(docvec), max_len), int)
+    for i in range(len(docvec)):
+        for j in range(len(docvec[i])):
+            doc_np[i][j] = docvec[i][j]
+    return doc_np, char_index, length
+
+def _doc2vec(doc,word2id,char_dict,max_filter,max_sentences,is_train):
+    max_len = 0
+    max_word_length = 0
+    docvex = []
+    length = []
+    if is_train:
+        sent_num = min(max_sentences,len(doc))
+    else:
+        sent_num = len(doc)
+
+    for i in range(sent_num):
+        sent = doc[i]
+        length.append(len(sent))
+        if (len(sent) > max_len):
+            max_len = len(sent)
+        sent_vec =[]
+        for j,word in enumerate(sent):
+            if len(word)>max_word_length:
+                max_word_length = len(word)
+            if word in word2id:
+                sent_vec.append(word2id[word])
+            else:
+                sent_vec.append(word2id["UNK"])
+        docvex.append(sent_vec)
+
+    char_index = np.zeros((sent_num, max_len, max_word_length),dtype=int)
+    for i in range(sent_num):
+        sent = doc[i]
+        for j,word in enumerate(sent):
+            char_index[i, j, :len(word)] = [char_dict[c] for c in word]
+
+    return docvex,char_index,length,max_len
+
+def speaker2numpy(speakers_raw,max_sentences,is_train):
+    if is_train and len(speakers_raw)> max_sentences:
+        speakers_raw = speakers_raw[0:max_sentences]
+    speakers = flatten(speakers_raw)
+    speaker_dict = {s: i for i, s in enumerate(set(speakers))}
+    speaker_ids = np.array([speaker_dict[s] for s in speakers])
+    return speaker_ids
+
+# 展平
+def flatten(l):
+    return [item for sublist in l for item in sublist]
+
+def get_char_dict(path):
+    vocab = ["<UNK>"]
+    with open(path) as f:
+        vocab.extend(c.strip() for c in f.readlines())
+    char_dict = collections.defaultdict(int)
+    char_dict.update({c: i for i, c in enumerate(vocab)})
+    return char_dict
\ No newline at end of file
diff --git a/reproduction/coreference_resolution/README.md b/reproduction/coreference_resolution/README.md
index 7cbcd052..c1a286e5 100644
--- a/reproduction/coreference_resolution/README.md
+++ b/reproduction/coreference_resolution/README.md
@@ -1,4 +1,4 @@
-# 共指消解复现
+# 指代消解复现
 ## 介绍
 Coreference resolution是查找文本中指向同一现实实体的所有表达式的任务。
 对于涉及自然语言理解的许多更高级别的NLP任务来说，
diff --git a/reproduction/coreference_resolution/data_load/__init__.py b/reproduction/coreference_resolution/data_load/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/reproduction/coreference_resolution/data_load/cr_loader.py b/reproduction/coreference_resolution/data_load/cr_loader.py
deleted file mode 100644
index 5ed73473..00000000
--- a/reproduction/coreference_resolution/data_load/cr_loader.py
+++ /dev/null
@@ -1,68 +0,0 @@
-from fastNLP.io.dataset_loader import JsonLoader,DataSet,Instance
-from fastNLP.io.file_reader import _read_json
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.io.data_bundle import DataBundle
-from reproduction.coreference_resolution.model.config import Config
-import reproduction.coreference_resolution.model.preprocess as preprocess
-
-
-class CRLoader(JsonLoader):
-    def __init__(self, fields=None, dropna=False):
-        super().__init__(fields, dropna)
-
-    def _load(self, path):
-        """
-        加载数据
-        :param path:
-        :return:
-        """
-        dataset = DataSet()
-        for idx, d in _read_json(path, fields=self.fields_list, dropna=self.dropna):
-            if self.fields:
-                ins = {self.fields[k]: v for k, v in d.items()}
-            else:
-                ins = d
-            dataset.append(Instance(**ins))
-        return dataset
-
-    def process(self, paths, **kwargs):
-        data_info = DataBundle()
-        for name in ['train', 'test', 'dev']:
-            data_info.datasets[name] = self.load(paths[name])
-
-        config = Config()
-        vocab = Vocabulary().from_dataset(*data_info.datasets.values(), field_name='sentences')
-        vocab.build_vocab()
-        word2id = vocab.word2idx
-
-        char_dict = preprocess.get_char_dict(config.char_path)
-        data_info.vocabs = vocab
-
-        genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-
-        for name, ds in data_info.datasets.items():
-            ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
-                                                        config.max_sentences, is_train=name=='train')[0],
-                         new_field_name='doc_np')
-            ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
-                                                        config.max_sentences, is_train=name=='train')[1],
-                         new_field_name='char_index')
-            ds.apply(lambda x: preprocess.doc2numpy(x['sentences'], word2id, char_dict, max(config.filter),
-                                                        config.max_sentences, is_train=name=='train')[2],
-                         new_field_name='seq_len')
-            ds.apply(lambda x: preprocess.speaker2numpy(x["speakers"], config.max_sentences, is_train=name=='train'),
-                         new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
-
-        # train_dev, test = self.ds.split(348 / (2802 + 343 + 348), shuffle=False)
-        # train, dev = train_dev.split(343 / (2802 + 343), shuffle=False)
-
-        return data_info
-
-
-
diff --git a/reproduction/coreference_resolution/test/test_dataloader.py b/reproduction/coreference_resolution/test/test_dataloader.py
index 0d9dae52..6a3be520 100644
--- a/reproduction/coreference_resolution/test/test_dataloader.py
+++ b/reproduction/coreference_resolution/test/test_dataloader.py
@@ -1,14 +1,14 @@
+
+
 import unittest
-from ..data_load.cr_loader import CRLoader
+from fastNLP.io.pipe.coreference import CoreferencePipe
+from reproduction.coreference_resolution.model.config import Config
 
 class Test_CRLoader(unittest.TestCase):
     def test_cr_loader(self):
-        train_path = 'data/train.english.jsonlines.mini'
-        dev_path = 'data/dev.english.jsonlines.minid'
-        test_path = 'data/test.english.jsonlines'
-        cr = CRLoader()
-        data_info = cr.process({'train':train_path,'dev':dev_path,'test':test_path})
-
-        print(data_info.datasets['train'][0])
-        print(data_info.datasets['dev'][0])
-        print(data_info.datasets['test'][0])
+        config = Config()
+        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
+
+        print(bundle.datasets['train'][0])
+        print(bundle.datasets['dev'][0])
+        print(bundle.datasets['test'][0])
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index a231a575..6c26cf4c 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -7,7 +7,8 @@ from torch.optim import Adam
 from fastNLP.core.callback import Callback, GradientClipCallback
 from fastNLP.core.trainer import Trainer
 
-from reproduction.coreference_resolution.data_load.cr_loader import CRLoader
+from fastNLP.io.pipe.coreference import CoreferencePipe
+
 from reproduction.coreference_resolution.model.config import Config
 from reproduction.coreference_resolution.model.model_re import Model
 from reproduction.coreference_resolution.model.softmax_loss import SoftmaxLoss
@@ -38,11 +39,8 @@ if __name__ == "__main__":
 
     @cache_results('cache.pkl')
     def cache():
-        cr_train_dev_test = CRLoader()
-
-        data_info = cr_train_dev_test.process({'train': config.train_path, 'dev': config.dev_path,
-                                               'test': config.test_path})
-        return data_info
+        bundle = CoreferencePipe(Config()).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
+        return bundle
     data_info = cache()
     print("数据集划分：\ntrain:", str(len(data_info.datasets["train"])),
           "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
diff --git a/reproduction/coreference_resolution/valid.py b/reproduction/coreference_resolution/valid.py
index 826332c6..454629e1 100644
--- a/reproduction/coreference_resolution/valid.py
+++ b/reproduction/coreference_resolution/valid.py
@@ -1,7 +1,8 @@
 import torch
 from reproduction.coreference_resolution.model.config import Config
 from reproduction.coreference_resolution.model.metric import CRMetric
-from reproduction.coreference_resolution.data_load.cr_loader import CRLoader
+from fastNLP.io.pipe.coreference import CoreferencePipe
+
 from fastNLP import Tester
 import argparse
 
@@ -11,13 +12,12 @@ if __name__=='__main__':
     parser.add_argument('--path')
     args = parser.parse_args()
     
-    cr_loader = CRLoader()
     config = Config()
-    data_info = cr_loader.process({'train': config.train_path, 'dev': config.dev_path,
-                                               'test': config.test_path})
+    bundle = CoreferencePipe(Config()).process_from_file(
+        {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})
     metirc = CRMetric()
     model = torch.load(args.path)
-    tester = Tester(data_info.datasets['test'],model,metirc,batch_size=1,device="cuda:0")
+    tester = Tester(bundle.datasets['test'],model,metirc,batch_size=1,device="cuda:0")
     tester.test()
     print('test over')
 

From 8dae71ff08476c573e3df26c00e188a7745ace78 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Tue, 3 Sep 2019 14:19:29 +0800
Subject: [PATCH 2/7] pipeline

---
 fastNLP/io/loader/coreference.py              | 19 +++++++-
 fastNLP/io/pipe/coreference.py                | 45 ++++++++++++-------
 .../coreference_resolution/model/model_re.py  | 11 ++++-
 .../model/softmax_loss.py                     |  8 ++--
 .../coreference_resolution/test/__init__.py   |  0
 .../test/test_dataloader.py                   | 14 ------
 reproduction/coreference_resolution/train.py  |  2 +-
 7 files changed, 63 insertions(+), 36 deletions(-)
 delete mode 100644 reproduction/coreference_resolution/test/__init__.py
 delete mode 100644 reproduction/coreference_resolution/test/test_dataloader.py

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index c8d9bbf5..2e4d72de 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -1,17 +1,34 @@
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
+from ...core.const import Const
 from .json import JsonLoader
 
 
 class CRLoader(JsonLoader):
+    """
+        原始数据中内容应该为, 每一行为一个json对象,其中doc_key包含文章的种类信息，speakers包含每句话的说话者信息，clusters是指向现实中同一个事物的聚集，sentences是文本信息内容。
+
+        Example::
+
+           {"doc_key":"bc/cctv/00/cctv_001",
+           "speakers":[["Speaker1","Speaker1","Speaker1","Speaker1"],["Speaker1","Speaker1","Speaker1"]],
+           "clusters":[[[2,3],[4,5]],[[7,8],[18,20]]],
+           "sentences":[["I","have","an","apple"],["It","is","good"]]
+           }
+
+        读取预处理好的Conll2012数据。
+
+        """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
+        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
 
     def _load(self, path):
         """
         加载数据
-        :param path:
+        :param path: 数据文件路径，文件为json
+
         :return:
         """
         dataset = DataSet()
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index bdf6a132..711e5919 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -6,12 +6,16 @@ __all__ = [
 from .pipe import Pipe
 from ..data_bundle import DataBundle
 from ..loader.coreference import CRLoader
+from ...core.const import Const
 from fastNLP.core.vocabulary import Vocabulary
 import numpy as np
 import collections
 
 
 class CoreferencePipe(Pipe):
+    """
+    对Coreference resolution问题进行处理，得到文章种类/说话者/字符级信息/序列长度。
+    """
 
     def __init__(self,config):
         super().__init__()
@@ -19,28 +23,39 @@ class CoreferencePipe(Pipe):
 
     def process(self, data_bundle: DataBundle):
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name='sentences')
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
         vocab.build_vocab()
         word2id = vocab.word2idx
+        data_bundle.vocabs = {"vocab":vocab}
         char_dict = get_char_dict(self.config.char_path)
+
         for name, ds in data_bundle.datasets.items():
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+            # genre
+            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+
+            # speaker_ids_np
+            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+                     new_field_name=Const.INPUTS(1))
+
+            # doc_np
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                     self.config.max_sentences, is_train=name == 'train')[0],
-                     new_field_name='doc_np')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.INPUTS(3))
+            # char_index
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                     self.config.max_sentences, is_train=name == 'train')[1],
-                     new_field_name='char_index')
-            ds.apply(lambda x: doc2numpy(x['sentences'], word2id, char_dict, max(self.config.filter),
+                     new_field_name=Const.CHAR_INPUT)
+            # seq len
+            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                     self.config.max_sentences, is_train=name == 'train')[2],
-                     new_field_name='seq_len')
-            ds.apply(lambda x: speaker2numpy(x["speakers"], self.config.max_sentences, is_train=name == 'train'),
-                     new_field_name='speaker_ids_np')
-            ds.apply(lambda x: genres[x["doc_key"][:2]], new_field_name='genre')
-
-            ds.set_ignore_type('clusters')
-            ds.set_padder('clusters', None)
-            ds.set_input("sentences", "doc_np", "speaker_ids_np", "genre", "char_index", "seq_len")
-            ds.set_target("clusters")
+                     new_field_name=Const.INPUT_LEN)
+
+
+            ds.set_ignore_type(Const.TARGET)
+            ds.set_padder(Const.TARGET, None)
+            ds.set_input(Const.INPUTS(0), Const.INPUTS(1), Const.INPUTS(2), Const.INPUTS(3), Const.CHAR_INPUT, Const.INPUT_LEN)
+            ds.set_target(Const.TARGET)
+
         return data_bundle
 
     def process_from_file(self, paths):
diff --git a/reproduction/coreference_resolution/model/model_re.py b/reproduction/coreference_resolution/model/model_re.py
index 9dd90ec4..eaa2941b 100644
--- a/reproduction/coreference_resolution/model/model_re.py
+++ b/reproduction/coreference_resolution/model/model_re.py
@@ -8,6 +8,7 @@ from fastNLP.models.base_model import BaseModel
 from fastNLP.modules.encoder.variational_rnn import VarLSTM
 from reproduction.coreference_resolution.model import preprocess
 from fastNLP.io.embed_loader import EmbedLoader
+from fastNLP.core.const import Const
 import random
 
 # 设置seed
@@ -415,7 +416,7 @@ class Model(BaseModel):
         return predicted_clusters
 
 
-    def forward(self, sentences, doc_np, speaker_ids_np, genre, char_index, seq_len):
+    def forward(self, words1 , words2, words3, words4, chars, seq_len):
         """
         实际输入都是tensor
         :param sentences: 句子，被fastNLP转化成了numpy，
@@ -426,6 +427,14 @@ class Model(BaseModel):
         :param seq_len: 被fastNLP转化成了Tensor
         :return:
         """
+
+        sentences = words3
+        doc_np = words4
+        speaker_ids_np = words2
+        genre = words1
+        char_index = chars
+
+
         # change for fastNLP
         sentences = sentences[0].tolist()
         doc_tensor = doc_np[0]
diff --git a/reproduction/coreference_resolution/model/softmax_loss.py b/reproduction/coreference_resolution/model/softmax_loss.py
index c75a31d6..1c1fcc69 100644
--- a/reproduction/coreference_resolution/model/softmax_loss.py
+++ b/reproduction/coreference_resolution/model/softmax_loss.py
@@ -11,18 +11,18 @@ class SoftmaxLoss(LossBase):
     允许多标签分类
     """
 
-    def __init__(self, antecedent_scores=None, clusters=None, mention_start_tensor=None, mention_end_tensor=None):
+    def __init__(self, antecedent_scores=None, target=None, mention_start_tensor=None, mention_end_tensor=None):
         """
 
         :param pred:
         :param target:
         """
         super().__init__()
-        self._init_param_map(antecedent_scores=antecedent_scores, clusters=clusters,
+        self._init_param_map(antecedent_scores=antecedent_scores, target=target,
                              mention_start_tensor=mention_start_tensor, mention_end_tensor=mention_end_tensor)
 
-    def get_loss(self, antecedent_scores, clusters, mention_start_tensor, mention_end_tensor):
-        antecedent_labels = get_labels(clusters[0], mention_start_tensor, mention_end_tensor,
+    def get_loss(self, antecedent_scores, target, mention_start_tensor, mention_end_tensor):
+        antecedent_labels = get_labels(target[0], mention_start_tensor, mention_end_tensor,
                                        Config().max_antecedents)
 
         antecedent_labels = torch.from_numpy(antecedent_labels*1).to(torch.device("cuda:" + Config().cuda))
diff --git a/reproduction/coreference_resolution/test/__init__.py b/reproduction/coreference_resolution/test/__init__.py
deleted file mode 100644
index e69de29b..00000000
diff --git a/reproduction/coreference_resolution/test/test_dataloader.py b/reproduction/coreference_resolution/test/test_dataloader.py
deleted file mode 100644
index 6a3be520..00000000
--- a/reproduction/coreference_resolution/test/test_dataloader.py
+++ /dev/null
@@ -1,14 +0,0 @@
-
-
-import unittest
-from fastNLP.io.pipe.coreference import CoreferencePipe
-from reproduction.coreference_resolution.model.config import Config
-
-class Test_CRLoader(unittest.TestCase):
-    def test_cr_loader(self):
-        config = Config()
-        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
-
-        print(bundle.datasets['train'][0])
-        print(bundle.datasets['dev'][0])
-        print(bundle.datasets['test'][0])
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index 6c26cf4c..790c7659 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     print("数据集划分：\ntrain:", str(len(data_info.datasets["train"])),
           "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
     # print(data_info)
-    model = Model(data_info.vocabs, config)
+    model = Model(data_info.vocabs['vocab'], config)
     print(model)
 
     loss = SoftmaxLoss()

From ea5fbc8881dc763a1ac13c0422da07fb199d6fc1 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Thu, 5 Sep 2019 05:07:52 +0800
Subject: [PATCH 3/7] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B3=A8=E9=87=8A=20?=
 =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E6=B5=8B=E8=AF=95=E6=96=87=E4=BB=B6=E5=8F=8A?=
 =?UTF-8?q?=E6=B5=8B=E8=AF=95=E6=A0=B7=E4=BE=8B=20=E4=BF=AE=E6=94=B9?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E5=8F=98=E9=87=8F=E5=91=BD=E5=90=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/io/loader/coreference.py              |  5 +-
 fastNLP/io/pipe/coreference.py                | 48 +++++++++++++++++--
 reproduction/coreference_resolution/train.py  | 16 +++----
 .../coreference/coreference_dev.json          |  2 +
 .../coreference/coreference_test.json         |  2 +
 .../coreference/coreference_train.json        |  2 +
 test/io/loader/test_coreference_loader.py     | 16 +++++++
 test/io/pipe/test_coreference.py              | 24 ++++++++++
 8 files changed, 101 insertions(+), 14 deletions(-)
 create mode 100644 test/data_for_tests/coreference/coreference_dev.json
 create mode 100644 test/data_for_tests/coreference/coreference_test.json
 create mode 100644 test/data_for_tests/coreference/coreference_train.json
 create mode 100644 test/io/loader/test_coreference_loader.py
 create mode 100644 test/io/pipe/test_coreference.py

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index 2e4d72de..b4493571 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -22,7 +22,10 @@ class CRLoader(JsonLoader):
         """
     def __init__(self, fields=None, dropna=False):
         super().__init__(fields, dropna)
-        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
+        # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
+        # TODO check 1
+        self.fields = {"doc_key": "raw_key", "speakers": "raw_speakers", "clusters": "raw_clusters",
+                       "sentences": "raw_words"}
 
     def _load(self, path):
         """
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index 711e5919..baa616f1 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -22,21 +22,56 @@ class CoreferencePipe(Pipe):
         self.config = config
 
     def process(self, data_bundle: DataBundle):
+        """
+        对load进来的数据进一步处理
+        原始数据包含：raw_key,raw_speakers,raw_words,raw_clusters
+        .. csv-table::
+           :header: "raw_key", "raw_speakers","raw_words","raw_clusters"
+
+           "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1'],[]]","[['I','am'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "[...]", "[...]","[...]","[...]"
+
+        处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target：
+        .. csv-table::
+           :header: "words1", "words2","words3","words4","chars","seq_len","target"
+
+           "0", "[0,0]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,0]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "[...]", "[...]","[...]","[...]","[...]","[...]","[...]"
+
+
+        :param data_bundle:
+        :return:
+        """
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name="raw_words")
         vocab.build_vocab()
         word2id = vocab.word2idx
-        data_bundle.vocabs = {"vocab":vocab}
-        char_dict = get_char_dict(self.config.char_path)
+        data_bundle.set_vocab(vocab,"vocab")
+        if self.config.char_path:
+            char_dict = get_char_dict(self.config.char_path)
+        else:
+            char_set = set()
+            for i,w in enumerate(word2id):
+                if i < 2:
+                    continue
+                for c in w:
+                    char_set.add(c)
+
+            char_dict = collections.defaultdict(int)
+            char_dict.update({c: i for i, c in enumerate(char_set)})
 
         for name, ds in data_bundle.datasets.items():
             # genre
-            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
+            ds.apply(lambda x: genres[x["raw_key"][:2]], new_field_name=Const.INPUTS(0))
 
             # speaker_ids_np
-            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
+            ds.apply(lambda x: speaker2numpy(x["raw_speakers"], self.config.max_sentences, is_train=name == 'train'),
                      new_field_name=Const.INPUTS(1))
 
+            # sentences
+            ds.rename_field("raw_words",Const.INPUTS(2))
+
             # doc_np
             ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                     self.config.max_sentences, is_train=name == 'train')[0],
@@ -50,6 +85,9 @@ class CoreferencePipe(Pipe):
                                                     self.config.max_sentences, is_train=name == 'train')[2],
                      new_field_name=Const.INPUT_LEN)
 
+            # clusters
+            ds.rename_field("raw_clusters", Const.TARGET)
+
 
             ds.set_ignore_type(Const.TARGET)
             ds.set_padder(Const.TARGET, None)
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index 790c7659..c91f7109 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -37,15 +37,15 @@ if __name__ == "__main__":
 
     print(config)
 
-    @cache_results('cache.pkl')
+    # @cache_results('cache.pkl')
     def cache():
-        bundle = CoreferencePipe(Config()).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
+        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
         return bundle
-    data_info = cache()
-    print("数据集划分：\ntrain:", str(len(data_info.datasets["train"])),
-          "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
+    data_bundle = cache()
+    print("数据集划分：\ntrain:", str(len(data_bundle.get_dataset("train"))),
+          "\ndev:" + str(len(data_bundle.get_dataset("dev"))) + "\ntest:" + str(len(data_bundle.get_dataset('test'))))
     # print(data_info)
-    model = Model(data_info.vocabs['vocab'], config)
+    model = Model(data_bundle.vocabs['vocab'], config)
     print(model)
 
     loss = SoftmaxLoss()
@@ -56,8 +56,8 @@ if __name__ == "__main__":
 
     lr_decay_callback = LRCallback(optim.param_groups, config.lr_decay)
 
-    trainer = Trainer(model=model, train_data=data_info.datasets["train"], dev_data=data_info.datasets["dev"],
-                      loss=loss, metrics=metric, check_code_level=-1,sampler=None,
+    trainer = Trainer(model=model, train_data=data_bundle.datasets["train"], dev_data=data_bundle.datasets["dev"],
+                      loss=loss, metrics=metric, check_code_level=-1, sampler=None,
                       batch_size=1, device=torch.device("cuda:" + config.cuda), metric_key='f', n_epochs=config.epoch,
                       optimizer=optim,
                       save_path='/remote-home/xxliu/pycharm/fastNLP/fastNLP/reproduction/coreference_resolution/save',
diff --git a/test/data_for_tests/coreference/coreference_dev.json b/test/data_for_tests/coreference/coreference_dev.json
new file mode 100644
index 00000000..9322ed30
--- /dev/null
+++ b/test/data_for_tests/coreference/coreference_dev.json
@@ -0,0 +1,2 @@
+{"doc_key": "bc/cctv/00/cctv_0000_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", 
"Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", 
"Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]], [[307, 312], [255, 256]], [[198, 199], [163, 164]], [[289, 290], [318, 318], [494, 497], [129, 131], [261, 261], [86, 86], [387, 387], [278, 278], [122, 124], [51, 56], [221, 225], [353, 355], [292, 292], [299, 299], [322, 322], [348, 348], [311, 312], [251, 253]], [[143, 144], [138, 138]], [[155, 176], [213, 214], [183, 184], [195, 195]], [[398, 398], [403, 403], [335, 335], [390, 390]], [[28, 28], [32, 37]], [[337, 338], [372, 373]], [[129, 130], [488, 489], [122, 123], [108, 109], [147, 148], [191, 192], [41, 42], [23, 24], [251, 252]], [[208, 208], [201, 204]], [[377, 379], [411, 413]]], "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", 
"to", "started", "emerging", "with", "frequency", "in", "various", "major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."], ["The", "world", "'s", "fifth", "Disney", "park", "will", "soon", "open", "to", "the", "public", "here", "."], ["The", "most", "important", "thing", "about", "Disney", "is", "that", "it", "is", "a", "global", "brand", "."], ["Well", ",", "for", "several", "years", ",", "although", "it", "was", "still", "under", "construction", "and", ",", "er", ",", "not", "yet", "open", ",", "it", "can", "be", "said", "that", "many", "people", "have", "viewed", "Hong", "Kong", "with", "new", "respect", "."], ["Then", "welcome", "to", "the", "official", "writing", "ceremony", "of", "Hong", "Kong", "Disneyland", "."], ["The", "construction", "of", "Hong", "Kong", "Disneyland", "began", "two", "years", "ago", ",", "in", "2003", "."], ["In", "January", "of", "that", "year", ",", "the", "Hong", "Kong", "government", "turned", "over", "to", "Disney", "Corporation", "200", "hectares", "of", "land", "at", "the", "foot", "of", "Lantau", "Island", "that", "was", "obtained", "following", "the", "largest", "land", "reclamation", "project", "in", "recent", "years", "."], ["One", "."], ["Since", "then", ",", "this", "area", "has", "become", "a", "prohibited", "zone", "in", "Hong", "Kong", "."], ["As", "its", "neighbor", "on", "Lantau", "Island", ",", "Hong", "Kong", "International", "Airport", "had", "to", "change", "its", "flight", "routes", "to", "make", "this", "area", "a", "no", "-", "fly", "zone", "."], ["Mickey", "Mouse", "'s", "new", "home", ",", "settling", "on", "Chinese", "land", "for", "the", "first", "time", ",", "has", "captured", "worldwide", "attention", "."], ["There", "'s", "only", "one", "month", "left", "before", "the", "opening", "of", "Hong", "Kong", "Disneyland", "on", 
"September", "12", "."], ["The", "subway", "to", "Disney", "has", "already", "been", "constructed", "."], ["At", "subway", "stations", ",", "passengers", "will", "frequently", "press", "the", "station", "for", "Disney", "on", "ticket", "machines", ",", "trying", "to", "purchase", "tickets", "to", "enjoy", "the", "park", "when", "it", "first", "opens", "."], ["Meanwhile", ",", "the", "Disney", "subway", "station", "is", "scheduled", "to", "open", "on", "the", "same", "day", "as", "the", "park", "."], ["For", "two", "years", ",", "Disney", "has", "constantly", "maintained", "its", "mystery", "."], ["No", "media", "have", "been", "allowed", "to", "enter", "for", "photos", "."], ["We", "took", "a", "taxi", "along", "the", "path", "of", "the", "highway", "that", "heads", "toward", "Disney", ",", "trying", "to", "experience", "this", "mysterious", "park", "from", "close", "by", "."], ["However", ",", "before", "any", "of", "the", "Disney", "symbols", "were", "in", "sight", ",", "the", "car", "was", "stopped", "by", "a", "security", "guard", "at", "the", "intersection", "of", "the", "road", "towards", "Disney", "."], ["On", "our", "way", "back", ",", "the", "taxi", "driver", "gave", "us", "an", "explanation", "after", "understanding", "our", "intentions", "."], ["Er", ",", "according", "to", "what", "the", "security", "guard", "said", ",", "for", "the", "time", "before", "everything", "is", "officially", ",", "opened", ",", ",", "no", "cars", "can", "enter", "unless", "they", "have", "special", "permission", "."], ["No", "one", "can", "enter", "otherwise", "."], ["Video", "recording", "is", "especially", "forbidden", "."], ["Ah", ",", "everything", "is", "top", "secret", "."], ["If", "pictures", "are", "taken", "without", "permission", ",", "%pw", "that", "is", "to", "say", ",", "it", "will", "at", "all", "times", "be", "pursued", "by", "legal", "action", ",", "a", "big", "hassle", "."], ["Although", "Disney", "Corporation", "chose", "Hong", "Kong", "as", "the", "venue", 
"for", "the", "Chinese", "Disney", "park", ",", "what", "they", "are", "actually", "most", "excited", "about", "is", "the", "mainland", "China", "tourist", "market", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0000_1", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[24, 25], [121, 122], [44, 45], [83, 84], [9, 10], [233, 235], [199, 200]]], "sentences": [["Since", "the", "implementation", "of", "the", "Individual", "Visit", "Scheme", "between", "Hong", "Kong", "and", "the", "mainland", ",", "more", "and", "more", "mainland", "tourists", "are", "coming", "to", "visit", "Hong", "Kong", "."], ["From", "the", 
"beginning", "up", "till", "now", ",", "more", "than", "seven", "million", "individual", "tourists", ",", "have", "come", "to", "Hong", "Kong", "."], ["Well", ",", "we", "now", ",", "er", ",", "believe", "more", "will", "be", "coming", "."], ["At", "this", "point", ",", "it", "has", "been", "about", "two", "years", "."], ["Also", ",", "the", "current", "number", "of", "34", "cities", "will", "be", "increased", "."], ["Hong", "Kong", "was", "developed", "from", "a", "fishing", "harbor", "one", "hundred", "years", "ago", "to", "become", "today", "'s", "international", "metropolis", "."], ["Here", ",", "eastern", "and", "western", "cultures", "have", "gathered", ",", "and", "the", "new", "and", "the", "old", "coexist", "."], ["When", "in", "Hong", "Kong", ",", "you", "can", "wander", "among", "skyscrapers", ",", "heartily", "enjoy", "shopping", "sprees", "in", "well", "-", "known", "stores", "and", "malls", "for", "goods", "from", "various", "countries", ",", "and", "taste", "delicious", "snacks", "from", "all", "over", "the", "world", "at", "tea", "shops", "or", "at", "street", "stands", "in", "Mong", "Kok", "."], ["You", "can", "go", "to", "burn", "incense", "and", "make", "a", "vow", "at", "the", "Repulse", "Bay", ",", "where", "all", "deities", "gather", "."], ["You", "can", "enjoy", "the", "most", "charming", "sun", "-", "filled", "sandy", "beaches", "in", "Hong", "Kong", "."], ["You", "can", "ascend", "Victoria", "Peak", "to", "get", "a", "panoramic", "view", "of", "Victoria", "Harbor", "'s", "beautiful", "scenery", "."], ["Or", "hop", "onto", "a", "trolley", "with", "over", "a", "century", "of", "history", ",", "and", "feel", "the", "city", "'s", "blend", "of", "the", "old", "and", "the", "modern", "in", "slow", "motion", "."]]}
diff --git a/test/data_for_tests/coreference/coreference_test.json b/test/data_for_tests/coreference/coreference_test.json
new file mode 100644
index 00000000..399b8cc5
--- /dev/null
+++ b/test/data_for_tests/coreference/coreference_test.json
@@ -0,0 +1,2 @@
+{"doc_key": "bc/cctv/00/cctv_0005_0", "speakers": [["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"]], "clusters": [[[57, 59], [25, 27], [42, 44]], [[19, 23], [16, 16]], [[83, 83], [82, 82]]], "sentences": [["--", "basically", ",", "it", "was", "unanimously", "agreed", "upon", "by", "the", "various", "relevant", "parties", "."], ["To", "express", "its", "determination", ",", "the", "Chinese", "securities", "regulatory", "department", "compares", "this", "stock", "reform", "to", "a", "die", "that", "has", "been", "cast", "."], ["It", "takes", "time", "to", "prove", "whether", "the", "stock", "reform", "can", "really", "meet", "expectations", ",", "and", "whether", "any", "deviations", "that", "arise", "during", "the", "stock", "reform", "can", "be", "promptly", "corrected", "."], ["Dear", "viewers", ",", "the", "China", 
"News", "program", "will", "end", "here", "."], ["This", "is", "Xu", "Li", "."], ["Thank", "you", "everyone", "for", "watching", "."], ["Coming", "up", "is", "the", "Focus", "Today", "program", "hosted", "by", "Wang", "Shilin", "."], ["Good-bye", ",", "dear", "viewers", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0005_1", "speakers": [["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Wang_shilin", "Wang_shilin"], ["Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", 
"Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"]], "clusters": [[[233, 234], [7, 8]], [[253, 254], [438, 439]], [[411, 412], [64, 67], [18, 30], [259, 260], [516, 516]], [[432, 433], [190, 204], [272, 272], [325, 325], [314, 314], [292, 292], [281, 281], [334, 334]], [[310, 311], [299, 300], [321, 321]], [[172, 172], [10, 10]], [[372, 373], [392, 393], [216, 219], [418, 419]], [[29, 30], [108, 109], [112, 113]], [[72, 73], [59, 60], [27, 27]], [[305, 305], [377, 377]], [[502, 503], [444, 447], [459, 460]], [[352, 353], [387, 387], [362, 362], [408, 408], [210, 219], [375, 375], [360, 360], [350, 350]], [[182, 185], [166, 168], [247, 250], [224, 226]], [[383, 384], [51, 60]], [[367, 368], [268, 268], [35, 36], [256, 260]], [[523, 523], [500, 500], [493, 493], [435, 435], [238, 238]], [[228, 229], [187, 188], [170, 171]]], "sentences": [["Hello", ",", "dear", "viewers", "."], ["Welcome", "to", "Focus", "Today", "."], ["Today", ",", "let", "'s", "turn", "our", "attention", "to", "a", "road", "cave", "-", "in", "accident", "that", "happened", "in", "Beijing", "over", "the", "holiday", "."], ["Before", "dawn", "on", "January", "3", ",", "a", "sewage", "pipe", "leakage", "accident", "occurred", "at", "the", "main", "and", "side", "roads", "of", "Jingguang", "Bridge", ",", "East", "Third", "Ring", "Road", ",", "Beijing", "Municipality", ",", "resulting", "in", "the", "road", "caving", "in", "."], ["Relevant", "departments", "from", "Beijing", "Municipality", "promptly", "activated", "emergency", "contingency", "plans", "."], ["The", "traffic", "administration", "department", "carried", "out", "traffic", "supervision", "near", "the", "accident", "scene", "."], ["Well", ",", "how", "did", "the", "emergency", "response", "mechanisms", "activated", 
"by", "governmental", "departments", "operate", "effectively", "during", "the", "holiday", "?"], ["After", "the", "holiday", ",", "what", "will", "be", "done", "to", "handle", "citizens", "'", "peak", "commute", "?"], ["In", "addition", ",", "what", "measures", "did", "relevant", "departments", "take", "to", "resolve", "issues", "such", "as", "waste", "discharge", ",", "heating", ",", "and", "communication", ",", "in", "order", "to", "ensure", "that", "the", "lives", "of", "citizens", "were", "not", "affected", "?"], ["Well", ",", "we", "have", "invited", "two", "honorable", "guests", "to", "the", "studio", "today", "to", "follow", "this", "topic", "with", "us", "."], ["One", "of", "the", "two", "honorable", "guests", "in", "the", "studio", "is", "Professor", "Zhou", "Hanhua", "from", "the", "Institute", "of", "Law", "of", "the", "Chinese", "Academy", "of", "Social", "Sciences", "."], ["Hello", "."], ["Next", "is", "Yang", "Yang", ",", "a", "host", "of", "Beijing", "Traffic", "Radio", "Station", "."], ["Hello", "."], ["Welcome", "both", "of", "you", "to", "the", "studio", "to", "participate", "in", "our", "program", "."], ["Well", ",", "I", "especially", "want", "to", "know", ",", "ha", ",", "how", "the", "two", "of", "you", "found", "out", "the", "news", "on", "the", "day", "of", "the", "accident", "?"], ["Ah", ",", ",", "about", "11:00", "m.", "yesterday", ",", "ah", ",", "I", "happened", "to", "find", "out", "through", "an", "SMS", "when", "I", "was", "outside", "."], ["Uh-huh", "."], ["Uh-huh", "."], ["It", "happened", "that", "I", "was", "going", "to", "have", "lunch", "with", "a", "friend", ",", "um", ",", "at", "noon", "."], ["And", "then", ",", "the", "friend", "first", "sent", "me", "an", "SMS", ",", "Uh-huh", ".", "saying", "he", "would", "come", "pick", "me", "up", "to", "go", "together", "."], ["After", "that", ",", "I", "received", "an", "SMS", "from", "1860", "."], ["Uh-huh", ",", "it", "was", "through", "an", "SMS", "."], ["And", "you", ",", "Yang", 
"Yang", "?"], ["A", "friend", "happened", "to", "call", "me", "."], ["You", "were", "not", "at", "work", "that", "day", "?"], ["No", "."], ["The", "station", "called", "me", "at", "noon", "and", "said", "something", "happened", "at", "Jingguang", "Bridge", "and", "that", "I", "had", "to", "go", "to", "the", "station", "immediately", "to", "research", "the", "upcoming", "program", "."], ["Uh-huh", ",", "that", "means", ",", "er", ",", "you", "found", "out", "the", "accident", "through", "an", "information", "source", "at", "the", "station", "."], ["Right", ",", "right", ",", "right", "."], ["Uh-huh", "."], ["Well", ",", "like", "Professor", "Zhou", ",", "I", "also", "received", "this", "news", ",", "ha", ",", "through", "a", "mobile", "phone", "SMS", "."], ["At", "that", "time", ",", ",", "it", "can", "be", "said", "that", "this", "SMS", "was", "among", "the", "many", ",", "ha", ",", "SMS", "containing", "New", "Year", "wishes", ",", "like", "Happy", "New", "Year", ",", "received", "after", "the", "start", "of", "the", "New", "Year", "."], ["Uh-huh", "."], ["Ah", ",", "actually", "I", "felt", "a", "lot", "of", "warmth", "when", "I", "received", "that", "SMS", "."], ["Although", "we", "live", "in", "the", "west", "instead", "of", "the", "east", "and", "it", "did", "not", "affect", "us", "much", ",", "I", "think", "it", "is", "very", "useful", ",", "ah", ",", "to", "inform", "people", "of", "this", "kind", "of", "news", "."], ["Yes", ",", "exceptionally", "."], ["Yes", ",", "exceptionally", "."]]}
diff --git a/test/data_for_tests/coreference/coreference_train.json b/test/data_for_tests/coreference/coreference_train.json
new file mode 100644
index 00000000..6932bbb7
--- /dev/null
+++ b/test/data_for_tests/coreference/coreference_train.json
@@ -0,0 +1,2 @@
+{"doc_key": "bc/cctv/00/cctv_0001_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"]], "clusters": [[[113, 114], [42, 45], [88, 91]], [[288, 288], [293, 293]], [[185, 189], [162, 165], [101, 104]], [[232, 233], [209, 209], [253, 253]], [[36, 37], [31, 32]], [[55, 56], [79, 81]], [[283, 283], [269, 275]], [[39, 45], [47, 47]], [[285, 285], [298, 298], [235, 237], [258, 260], [117, 120], [267, 267]], [[75, 77], [51, 53]], [[310, 310], [289, 289], [295, 295]], [[135, 136], [273, 273], [26, 26]], [[200, 201], [182, 183], [179, 180]]], "sentences": [["What", "kind", "of", "memory", "?"], ["We", "respectfully", "invite", "you", "to", "watch", "a", "special", "edition", "of", "Across", "China", "."], ["WW", "II", "Landmarks", "on", "the", "Great", "Earth", "of", "China", ":", "Eternal", "Memories", "of", "Taihang", "Mountain"], 
["Standing", "tall", "on", "Taihang", "Mountain", "is", "the", "Monument", "to", "the", "Hundred", "Regiments", "Offensive", "."], ["It", "is", "composed", "of", "a", "primary", "stele", ",", "secondary", "steles", ",", "a", "huge", "round", "sculpture", "and", "beacon", "tower", ",", "and", "the", "Great", "Wall", ",", "among", "other", "things", "."], ["A", "primary", "stele", ",", "three", "secondary", "steles", ",", "and", "two", "inscribed", "steles", "."], ["The", "Hundred", "Regiments", "Offensive", "was", "the", "campaign", "of", "the", "largest", "scale", "launched", "by", "the", "Eighth", "Route", "Army", "during", "the", "War", "of", "Resistance", "against", "Japan", "."], ["This", "campaign", "broke", "through", "the", "Japanese", "army", "'s", "blockade", "to", "reach", "base", "areas", "behind", "enemy", "lines", ",", "stirring", "up", "anti-Japanese", "spirit", "throughout", "the", "nation", "and", "influencing", "the", "situation", "of", "the", "anti-fascist", "war", "of", "the", "people", "worldwide", "."], ["This", "is", "Zhuanbi", "Village", ",", "Wuxiang", "County", "of", "Shanxi", "Province", ",", "where", "the", "Eighth", "Route", "Army", "was", "headquartered", "back", "then", "."], ["On", "a", "wall", "outside", "the", "headquarters", "we", "found", "a", "map", "."], ["This", "map", "was", "the", "Eighth", "Route", "Army", "'s", "depiction", "of", "the", "Mediterranean", "Sea", "situation", "at", "that", "time", "."], ["This", "map", "reflected", "the", "European", "battlefield", "situation", "."], ["In", "1940", ",", "the", "German", "army", "invaded", "and", "occupied", "Czechoslovakia", ",", "Poland", ",", "the", "Netherlands", ",", "Belgium", ",", "and", "France", "."], ["It", "was", "during", "this", "year", "that", "the", "Japanese", "army", "developed", "a", "strategy", "to", "rapidly", "force", "the", "Chinese", "people", "into", "submission", "by", "the", "end", "of", "1940", "."], ["In", "May", ",", "the", "Japanese", "army", 
"launched", "--"], ["From", "one", "side", ",", "it", "seized", "an", "important", "city", "in", "China", "called", "Yichang", "."], ["Um", ",", ",", "uh", ",", "through", "Yichang", ",", "it", "could", "directly", "reach", "Chongqing", "."], ["Ah", ",", "that", "threatened", "Chongqing", "."], ["Then", "they", "would", ",", "ah", ",", "bomb", "these", "large", "rear", "areas", "such", "as", "Chongqing", "."], ["So", ",", "along", "with", "the", "coordinated", ",", "er", ",", "economic", "blockade", ",", "military", "offensives", ",", "and", "strategic", "bombings", ",", "er", ",", "a", "simultaneous", "attack", "was", "launched", "in", "Hong", "Kong", "to", "lure", "the", "KMT", "government", "into", "surrender", "."], ["The", "progress", "of", "this", "coordinated", "offensive", "was", "already", "very", "entrenched", "by", "then", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0001_1", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1"]], "clusters": [[[129, 131], [167, 169]], [[495, 496], [446, 447], [183, 186]], [[433, 438], [314, 316], [318, 318]], [[154, 157], [531, 534], [436, 438], [139, 142], [43, 45]], [[560, 561], [547, 554], [279, 288]], [[309, 309], [374, 374], [21, 23], [9, 9], [312, 312], [385, 385]], [[212, 213], [193, 197]], [[577, 578], [581, 582]], [[262, 267], [591, 592], [523, 524], [565, 568], [424, 431]], [[255, 256], [28, 32]], [[492, 493], [175, 181], [443, 444]], [[124, 127], [449, 451], [250, 253], [29, 31], [188, 191], [407, 416], [71, 74], [510, 513], [129, 129]], [[63, 67], [139, 146], [76, 78]], [[443, 452], [175, 191]], [[485, 487], [596, 598]], [[517, 524], [556, 556], [526, 526]], [[81, 98], [133, 134]], [[47, 48], [109, 112]], [[348, 353], [365, 365], [388, 390]], [[1, 1], [477, 
477], [267, 267]], [[550, 551], [288, 288], [3, 4], [18, 18]]], "sentences": [["By", "1940", ",", "China", "'s", "War", "of", "Resistance", "against", "Japan", "had", "entered", "a", "stalemate", "."], ["The", "situation", "on", "our", "side", "and", "the", "enemy", "'s", "side", "was", "intertwined", "."], ["The", "Eighth", "Route", "Army", "guerrillas", "were", "extraordinarily", "active", ",", "creating", "more", "and", "more", "trouble", "for", "the", "Japanese", "army", "in", "North", "China", "."], ["Hayao", "Tada", ",", "commander", "of", "the", "Japanese", "North", "China", "Area", "Army", ",", "adopted", "a", "strategy", "of", "siege", "warfare", "to", "deal", "with", "the", "Eighth", "Route", "Army", "."], ["The", "specific", "method", "was", "building", "a", "closely", "connected", "transport", "network", ",", "with", "a", "road", "for", "every", "village", "and", "defensive", "towers", "on", "every", "road", "."], ["Roads", "and", "railways", "were", "used", "as", "links", "to", "connect", "all", "of", "North", "China", "into", "a", "solid", ",", "widespread", "siege", ",", "in", "order", "to", "strangle", "the", "Eighth", "Route", "Army", "and", "its", "base", "areas", "in", "this", "net", "."], ["As", "part", "of", "the", "Japanese", "army", "'s", "strategy", "of", "siege", "warfare", ",", "railways", "and", "roads", "had", "actually", "become", "the", "Japanese", "army", "'s", "weapons", "of", "war", ",", "becoming", "a", "great", "threat", "to", "the", "base", "areas", "."], ["In", "December", "1939", ",", "Commander", "-", "in", "-", "chief", "Zhu", "De", "and", "Vice", "Commander", "Peng", "Dehuai", "of", "the", "Eighth", "Route", "Army", "received", "a", "top", "-", "secret", "telegram", "from", "Commander", "Lu", "Zhengcao", "of", "the", "Jizhong", "Military", "District", ",", "among", "other", "people", "."], ["The", "telegram", "said", "that", "the", "Japanese", "troops", "were", "building", "blockade", "trenches", "and", "chessboard", "-", 
"like", "roads", "to", "divide", "the", "Jizhong", "base", "area", "into", "small", "isolated", "blocks", "without", "the", "ability", "to", "mutually", "communicate", "and", "support", "each", "other", ",", "causing", "the", "Eighth", "Route", "Army", "and", "the", "guerrillas", "to", "lose", "maneuverability", "."], ["Before", "the", "Hundred", "Regiments", "Offensive", "in", "1940", ",", "an", "inclination", "to", "compromise", ",", "ah", ",", "surrender", ",", "was", "an", "extremely", "serious", "crisis", "in", "the", "frontline", "situation", "in", "China", "."], ["Well", ",", "on", "the", "battlefield", "behind", "enemy", "lines", ",", "in", "order", "to", "take", "over", ",", "consolidate", "the", "area", "under", "its", "occupation", ",", "Japan", "began", "a", "new", "strategy", "."], ["That", "was", "to", "use", "railways", "as", "a", "pillar", ",", "roads", "as", "a", "chain", ",", "and", "strongholds", "as", "a", "lock", ",", "to", "carry", "out", "siege", "warfare", "in", "an", "attempt", "to", "divide", "the", "base", "areas", "behind", "enemy", "lines", ",", "ah", ",", "so", "as", ",", "er", ",", "to", "cut", "off", "their", "communication", "with", "one", "another", "."], ["In", "addition", ",", "it", "relied", "on", "this", "cage", ",", "ah", ",", "to", "further", "strengthen", "its", "assaults", "against", "the", "base", "areas", "."], ["Er", "."], ["So", ",", "it", "was", "amidst", "such", "a", "grave", "international", "and", "domestic", "situation", "that", "the", "Eighth", "Route", "Army", "led", "by", "the", "Chinese", "Communist", "Party", ",", "ah", ",", "launched", ",", "ah", ",", "a", "strategic", "offensive", "called", "the", "Hundred", "Regiments", "Offensive", "."], ["This", "plot", "of", "the", "Japanese", "army", "drew", "great", "attention", "from", "Zhu", "De", "and", "Peng", "Dehuai", "of", "Eighth", "Route", "Army", "headquarters", "."], ["After", "meticulous", "studies", "and", "painstaking", "preparations", "by", "many", 
"parties", ",", "a", "battle", "plan", "based", "on", "surprise", "was", "formulated", "."], ["On", "July", "22", ",", "1940", ",", "a", "campaign", "preparation", "order", "to", "attack", "the", "Zhengtai", "Railway", ",", "jointly", "signed", "by", "Zhu", "De", ",", "Peng", "Dehuai", ",", "and", "Zuo", "Quan", ",", "was", "sent", "to", "Yan'an", "and", "all", "units", "of", "the", "Eighth", "Route", "Army", "."], ["What", "was", "the", ",", "purpose", "and", "goal", "of", "this", "campaign", "?"], ["It", "was", "to", "break", "through", "the", "Japanese", "army", "'s", "siege", "policy", "against", "base", "areas", "behind", "enemy", "lines", ",", "and", "to", "avert", "the", "crisis", "of", "China", "'s", "compromise", "and", "surrender", "."], ["It", "was", "to", "overcome", "this", "crisis", "."], ["Well", ",", "the", "Hundred", "Regiments", "Offensive", "was", "divided", "into", "three", "phases", "."], ["Beginning", "from", "August", "20", ",", "from", "August", "20", "to", "September", "10", ",", "the", "main", "purpose", "of", "the", "campaign", "was", "to", "sabotage", "the", "Zhengtai", "Railway", "."]]}
diff --git a/test/io/loader/test_coreference_loader.py b/test/io/loader/test_coreference_loader.py
new file mode 100644
index 00000000..48551f3e
--- /dev/null
+++ b/test/io/loader/test_coreference_loader.py
@@ -0,0 +1,16 @@
+from fastNLP.io.loader.coreference import CRLoader
+import unittest
+
+class TestCR(unittest.TestCase):
+    def test_load(self):
+
+        test_root = "../../data_for_tests/coreference/"
+        train_path = test_root+"coreference_train.json"
+        dev_path = test_root+"coreference_dev.json"
+        test_path = test_root+"coreference_test.json"
+        paths = {"train": train_path,"dev":dev_path,"test":test_path}
+
+        bundle1 = CRLoader().load(paths)
+        bundle2 = CRLoader().load(test_root)
+        print(bundle1)
+        print(bundle2)
\ No newline at end of file
diff --git a/test/io/pipe/test_coreference.py b/test/io/pipe/test_coreference.py
new file mode 100644
index 00000000..1c53f2b0
--- /dev/null
+++ b/test/io/pipe/test_coreference.py
@@ -0,0 +1,24 @@
+import unittest
+from fastNLP.io.pipe.coreference import CoreferencePipe
+
+
+class TestCR(unittest.TestCase):
+
+    def test_load(self):
+        class Config():
+            max_sentences = 50
+            filter = [3, 4, 5]
+            char_path = None
+        config = Config()
+
+        file_root_path = "../../data_for_tests/coreference/"
+        train_path = file_root_path + "coreference_train.json"
+        dev_path = file_root_path + "coreference_dev.json"
+        test_path = file_root_path + "coreference_test.json"
+
+        paths = {"train": train_path, "dev": dev_path, "test": test_path}
+
+        bundle1 = CoreferencePipe(config).process_from_file(paths)
+        bundle2 = CoreferencePipe(config).process_from_file(file_root_path)
+        print(bundle1)
+        print(bundle2)
\ No newline at end of file

From 5bbfb92a300d8d9aeba7f45ae4e2bf8dad19fcb4 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Fri, 6 Sep 2019 13:08:57 +0800
Subject: [PATCH 4/7] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E8=A7=84=E8=8C=83?=
 =?UTF-8?q?=E4=BB=A5=E5=8F=8A=E4=BF=AE=E6=94=B9=E6=B5=8B=E8=AF=95=E6=96=87?=
 =?UTF-8?q?=E4=BB=B6=E8=B7=AF=E5=BE=84=E4=BB=A5=E5=8C=B9=E9=85=8Dgithub?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6=E8=B7=AF=E5=BE=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 reproduction/coreference_resolution/train.py | 4 ++--
 test/io/loader/test_coreference_loader.py    | 2 +-
 test/io/pipe/test_coreference.py             | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index c91f7109..cd4b65a5 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -45,7 +45,7 @@ if __name__ == "__main__":
     print("数据集划分：\ntrain:", str(len(data_bundle.get_dataset("train"))),
           "\ndev:" + str(len(data_bundle.get_dataset("dev"))) + "\ntest:" + str(len(data_bundle.get_dataset('test'))))
     # print(data_info)
-    model = Model(data_bundle.vocabs['vocab'], config)
+    model = Model(data_bundle.get_vocab("vocab"), config)
     print(model)
 
     loss = SoftmaxLoss()
@@ -60,7 +60,7 @@ if __name__ == "__main__":
                       loss=loss, metrics=metric, check_code_level=-1, sampler=None,
                       batch_size=1, device=torch.device("cuda:" + config.cuda), metric_key='f', n_epochs=config.epoch,
                       optimizer=optim,
-                      save_path='/remote-home/xxliu/pycharm/fastNLP/fastNLP/reproduction/coreference_resolution/save',
+                      save_path= None,
                       callbacks=[lr_decay_callback, GradientClipCallback(clip_value=5)])
     print()
 
diff --git a/test/io/loader/test_coreference_loader.py b/test/io/loader/test_coreference_loader.py
index 48551f3e..d827e947 100644
--- a/test/io/loader/test_coreference_loader.py
+++ b/test/io/loader/test_coreference_loader.py
@@ -4,7 +4,7 @@ import unittest
 class TestCR(unittest.TestCase):
     def test_load(self):
 
-        test_root = "../../data_for_tests/coreference/"
+        test_root = "test/data_for_tests/coreference/"
         train_path = test_root+"coreference_train.json"
         dev_path = test_root+"coreference_dev.json"
         test_path = test_root+"coreference_test.json"
diff --git a/test/io/pipe/test_coreference.py b/test/io/pipe/test_coreference.py
index 1c53f2b0..517be993 100644
--- a/test/io/pipe/test_coreference.py
+++ b/test/io/pipe/test_coreference.py
@@ -11,7 +11,7 @@ class TestCR(unittest.TestCase):
             char_path = None
         config = Config()
 
-        file_root_path = "../../data_for_tests/coreference/"
+        file_root_path = "test/data_for_tests/coreference/"
         train_path = file_root_path + "coreference_train.json"
         dev_path = file_root_path + "coreference_dev.json"
         test_path = file_root_path + "coreference_test.json"

From c38d815c49916c5011f74474723ff40ee3cb7422 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Tue, 10 Sep 2019 13:17:45 +0800
Subject: [PATCH 5/7] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E6=B3=A8=E9=87=8A?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/io/pipe/coreference.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index baa616f1..836b251d 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -28,15 +28,15 @@ class CoreferencePipe(Pipe):
         .. csv-table::
            :header: "raw_key", "raw_speaker","raw_words","raw_clusters"
 
-           "bc/cctv/00/cctv_0000_0", "[["Speaker#1", "Speaker#1"],[]]","[["I","am"],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
-           "bc/cctv/00/cctv_0000_1"", "[["Speaker#1", "Speaker#1"],[]]","[["He","is"],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "bc/cctv/00/cctv_0000_0", "[['Speaker#1', 'Speaker#1'],[]]","[['I','am'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "bc/cctv/00/cctv_0000_1", "[['Speaker#1', 'Speaker#1'],[]]","[['He','is'],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
            "[...]", "[...]","[...]","[...]"
 
         处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target：
         .. csv-table::
            :header: "words1", "words2","words3","words4","chars","seq_len","target"
 
-           "bc", "[[0,0],[1,1]]","[["I","am"],[]]",[[1,2],[]],[[[1],[2,3]],[]],[2,3],"[[[2,3],[6,7]],[[10,12],[20,22]]]"
+           "bc", "[[0,0],[1,1]]","[['I','am'],[]]","[[1,2],[]]","[[[1],[2,3]],[]]","[2,3]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
            "[...]", "[...]","[...]","[...]","[...]","[...]","[...]"
 
 

From c9883bcb30a5d0c5c01180d5ca4848ad006a6544 Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Tue, 10 Sep 2019 15:06:23 +0800
Subject: [PATCH 6/7] undocumented

---
 fastNLP/io/loader/coreference.py | 2 ++
 fastNLP/io/pipe/coreference.py   | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index b4493571..6e2344d2 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -1,3 +1,5 @@
+"""undocumented"""
+
 from ...core.dataset import DataSet
 from ..file_reader import _read_json
 from ...core.instance import Instance
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index 836b251d..bb40ca55 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -1,3 +1,5 @@
+"""undocumented"""
+
 __all__ = [
     "CoreferencePipe"
 

From b015cc149cb5c0688a882118410099d6b84bf78d Mon Sep 17 00:00:00 2001
From: xxliu <yexu_i@qq.com>
Date: Tue, 10 Sep 2019 16:14:18 +0800
Subject: [PATCH 7/7] undocumented

---
 fastNLP/io/loader/coreference.py                     |  4 ++--
 fastNLP/io/pipe/coreference.py                       | 12 ++++++------
 reproduction/coreference_resolution/train.py         |  3 ++-
 reproduction/coreference_resolution/valid.py         |  2 +-
 test/data_for_tests/coreference/coreference_dev.json |  3 +--
 .../data_for_tests/coreference/coreference_test.json |  3 +--
 .../coreference/coreference_train.json               |  3 +--
 7 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/fastNLP/io/loader/coreference.py b/fastNLP/io/loader/coreference.py
index 6e2344d2..714b11e5 100644
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -26,8 +26,8 @@ class CRLoader(JsonLoader):
         super().__init__(fields, dropna)
         # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
         # TODO check 1
-        self.fields = {"doc_key": "raw_key", "speakers": "raw_speakers", "clusters": "raw_clusters",
-                       "sentences": "raw_words"}
+        self.fields = {"doc_key": Const.RAW_WORDS(0), "speakers": Const.RAW_WORDS(1), "clusters": Const.RAW_WORDS(2),
+                       "sentences": Const.RAW_WORDS(3)}
 
     def _load(self, path):
         """
diff --git a/fastNLP/io/pipe/coreference.py b/fastNLP/io/pipe/coreference.py
index bb40ca55..b6d88998 100644
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -46,10 +46,10 @@ class CoreferencePipe(Pipe):
         :return:
         """
         genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
-        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name="raw_words")
+        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name= Const.RAW_WORDS(3))
         vocab.build_vocab()
         word2id = vocab.word2idx
-        data_bundle.set_vocab(vocab,"vocab")
+        data_bundle.set_vocab(vocab,Const.INPUT)
         if self.config.char_path:
             char_dict = get_char_dict(self.config.char_path)
         else:
@@ -65,14 +65,14 @@ class CoreferencePipe(Pipe):
 
         for name, ds in data_bundle.datasets.items():
             # genre
-            ds.apply(lambda x: genres[x["raw_key"][:2]], new_field_name=Const.INPUTS(0))
+            ds.apply(lambda x: genres[x[Const.RAW_WORDS(0)][:2]], new_field_name=Const.INPUTS(0))
 
             # speaker_ids_np
-            ds.apply(lambda x: speaker2numpy(x["raw_speakers"], self.config.max_sentences, is_train=name == 'train'),
+            ds.apply(lambda x: speaker2numpy(x[Const.RAW_WORDS(1)], self.config.max_sentences, is_train=name == 'train'),
                      new_field_name=Const.INPUTS(1))
 
             # sentences
-            ds.rename_field("raw_words",Const.INPUTS(2))
+            ds.rename_field(Const.RAW_WORDS(3),Const.INPUTS(2))
 
             # doc_np
             ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
@@ -88,7 +88,7 @@ class CoreferencePipe(Pipe):
                      new_field_name=Const.INPUT_LEN)
 
             # clusters
-            ds.rename_field("raw_clusters", Const.TARGET)
+            ds.rename_field(Const.RAW_WORDS(2), Const.TARGET)
 
 
             ds.set_ignore_type(Const.TARGET)
diff --git a/reproduction/coreference_resolution/train.py b/reproduction/coreference_resolution/train.py
index cd4b65a5..23ba5d5b 100644
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -8,6 +8,7 @@ from fastNLP.core.callback import Callback, GradientClipCallback
 from fastNLP.core.trainer import Trainer
 
 from fastNLP.io.pipe.coreference import CoreferencePipe
+from fastNLP.core.const import Const
 
 from reproduction.coreference_resolution.model.config import Config
 from reproduction.coreference_resolution.model.model_re import Model
@@ -45,7 +46,7 @@ if __name__ == "__main__":
     print("数据集划分：\ntrain:", str(len(data_bundle.get_dataset("train"))),
           "\ndev:" + str(len(data_bundle.get_dataset("dev"))) + "\ntest:" + str(len(data_bundle.get_dataset('test'))))
     # print(data_info)
-    model = Model(data_bundle.get_vocab("vocab"), config)
+    model = Model(data_bundle.get_vocab(Const.INPUT), config)
     print(model)
 
     loss = SoftmaxLoss()
diff --git a/reproduction/coreference_resolution/valid.py b/reproduction/coreference_resolution/valid.py
index 454629e1..a528ea06 100644
--- a/reproduction/coreference_resolution/valid.py
+++ b/reproduction/coreference_resolution/valid.py
@@ -17,7 +17,7 @@ if __name__=='__main__':
         {'train': config.train_path, 'dev': config.dev_path, 'test': config.test_path})
     metirc = CRMetric()
     model = torch.load(args.path)
-    tester = Tester(bundle.datasets['test'],model,metirc,batch_size=1,device="cuda:0")
+    tester = Tester(bundle.get_dataset("test"),model,metirc,batch_size=1,device="cuda:0")
     tester.test()
     print('test over')
 
diff --git a/test/data_for_tests/coreference/coreference_dev.json b/test/data_for_tests/coreference/coreference_dev.json
index 9322ed30..bb6592d3 100644
--- a/test/data_for_tests/coreference/coreference_dev.json
+++ b/test/data_for_tests/coreference/coreference_dev.json
@@ -1,2 +1 @@
-{"doc_key": "bc/cctv/00/cctv_0000_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", 
"Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", 
"Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2", "Speaker#2"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]], [[307, 312], [255, 256]], [[198, 199], [163, 164]], [[289, 290], [318, 318], [494, 497], [129, 131], [261, 261], [86, 86], [387, 387], [278, 278], [122, 124], [51, 56], [221, 225], [353, 355], [292, 292], [299, 299], [322, 322], [348, 348], [311, 312], [251, 253]], [[143, 144], [138, 138]], [[155, 176], [213, 214], [183, 184], [195, 195]], [[398, 398], [403, 403], [335, 335], [390, 390]], [[28, 28], [32, 37]], [[337, 338], [372, 373]], [[129, 130], [488, 489], [122, 123], [108, 109], [147, 148], [191, 192], [41, 42], [23, 24], [251, 252]], [[208, 208], [201, 204]], [[377, 379], [411, 413]]], "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", 
"to", "started", "emerging", "with", "frequency", "in", "various", "major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."], ["The", "world", "'s", "fifth", "Disney", "park", "will", "soon", "open", "to", "the", "public", "here", "."], ["The", "most", "important", "thing", "about", "Disney", "is", "that", "it", "is", "a", "global", "brand", "."], ["Well", ",", "for", "several", "years", ",", "although", "it", "was", "still", "under", "construction", "and", ",", "er", ",", "not", "yet", "open", ",", "it", "can", "be", "said", "that", "many", "people", "have", "viewed", "Hong", "Kong", "with", "new", "respect", "."], ["Then", "welcome", "to", "the", "official", "writing", "ceremony", "of", "Hong", "Kong", "Disneyland", "."], ["The", "construction", "of", "Hong", "Kong", "Disneyland", "began", "two", "years", "ago", ",", "in", "2003", "."], ["In", "January", "of", "that", "year", ",", "the", "Hong", "Kong", "government", "turned", "over", "to", "Disney", "Corporation", "200", "hectares", "of", "land", "at", "the", "foot", "of", "Lantau", "Island", "that", "was", "obtained", "following", "the", "largest", "land", "reclamation", "project", "in", "recent", "years", "."], ["One", "."], ["Since", "then", ",", "this", "area", "has", "become", "a", "prohibited", "zone", "in", "Hong", "Kong", "."], ["As", "its", "neighbor", "on", "Lantau", "Island", ",", "Hong", "Kong", "International", "Airport", "had", "to", "change", "its", "flight", "routes", "to", "make", "this", "area", "a", "no", "-", "fly", "zone", "."], ["Mickey", "Mouse", "'s", "new", "home", ",", "settling", "on", "Chinese", "land", "for", "the", "first", "time", ",", "has", "captured", "worldwide", "attention", "."], ["There", "'s", "only", "one", "month", "left", "before", "the", "opening", "of", "Hong", "Kong", "Disneyland", "on", 
"September", "12", "."], ["The", "subway", "to", "Disney", "has", "already", "been", "constructed", "."], ["At", "subway", "stations", ",", "passengers", "will", "frequently", "press", "the", "station", "for", "Disney", "on", "ticket", "machines", ",", "trying", "to", "purchase", "tickets", "to", "enjoy", "the", "park", "when", "it", "first", "opens", "."], ["Meanwhile", ",", "the", "Disney", "subway", "station", "is", "scheduled", "to", "open", "on", "the", "same", "day", "as", "the", "park", "."], ["For", "two", "years", ",", "Disney", "has", "constantly", "maintained", "its", "mystery", "."], ["No", "media", "have", "been", "allowed", "to", "enter", "for", "photos", "."], ["We", "took", "a", "taxi", "along", "the", "path", "of", "the", "highway", "that", "heads", "toward", "Disney", ",", "trying", "to", "experience", "this", "mysterious", "park", "from", "close", "by", "."], ["However", ",", "before", "any", "of", "the", "Disney", "symbols", "were", "in", "sight", ",", "the", "car", "was", "stopped", "by", "a", "security", "guard", "at", "the", "intersection", "of", "the", "road", "towards", "Disney", "."], ["On", "our", "way", "back", ",", "the", "taxi", "driver", "gave", "us", "an", "explanation", "after", "understanding", "our", "intentions", "."], ["Er", ",", "according", "to", "what", "the", "security", "guard", "said", ",", "for", "the", "time", "before", "everything", "is", "officially", ",", "opened", ",", ",", "no", "cars", "can", "enter", "unless", "they", "have", "special", "permission", "."], ["No", "one", "can", "enter", "otherwise", "."], ["Video", "recording", "is", "especially", "forbidden", "."], ["Ah", ",", "everything", "is", "top", "secret", "."], ["If", "pictures", "are", "taken", "without", "permission", ",", "%pw", "that", "is", "to", "say", ",", "it", "will", "at", "all", "times", "be", "pursued", "by", "legal", "action", ",", "a", "big", "hassle", "."], ["Although", "Disney", "Corporation", "chose", "Hong", "Kong", "as", "the", "venue", 
"for", "the", "Chinese", "Disney", "park", ",", "what", "they", "are", "actually", "most", "excited", "about", "is", "the", "mainland", "China", "tourist", "market", "."]]}
-{"doc_key": "bc/cctv/00/cctv_0000_1", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi", "Zhou_liangshuyi"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[24, 25], [121, 122], [44, 45], [83, 84], [9, 10], [233, 235], [199, 200]]], "sentences": [["Since", "the", "implementation", "of", "the", "Individual", "Visit", "Scheme", "between", "Hong", "Kong", "and", "the", "mainland", ",", "more", "and", "more", "mainland", "tourists", "are", "coming", "to", "visit", "Hong", "Kong", "."], ["From", "the", 
"beginning", "up", "till", "now", ",", "more", "than", "seven", "million", "individual", "tourists", ",", "have", "come", "to", "Hong", "Kong", "."], ["Well", ",", "we", "now", ",", "er", ",", "believe", "more", "will", "be", "coming", "."], ["At", "this", "point", ",", "it", "has", "been", "about", "two", "years", "."], ["Also", ",", "the", "current", "number", "of", "34", "cities", "will", "be", "increased", "."], ["Hong", "Kong", "was", "developed", "from", "a", "fishing", "harbor", "one", "hundred", "years", "ago", "to", "become", "today", "'s", "international", "metropolis", "."], ["Here", ",", "eastern", "and", "western", "cultures", "have", "gathered", ",", "and", "the", "new", "and", "the", "old", "coexist", "."], ["When", "in", "Hong", "Kong", ",", "you", "can", "wander", "among", "skyscrapers", ",", "heartily", "enjoy", "shopping", "sprees", "in", "well", "-", "known", "stores", "and", "malls", "for", "goods", "from", "various", "countries", ",", "and", "taste", "delicious", "snacks", "from", "all", "over", "the", "world", "at", "tea", "shops", "or", "at", "street", "stands", "in", "Mong", "Kok", "."], ["You", "can", "go", "to", "burn", "incense", "and", "make", "a", "vow", "at", "the", "Repulse", "Bay", ",", "where", "all", "deities", "gather", "."], ["You", "can", "enjoy", "the", "most", "charming", "sun", "-", "filled", "sandy", "beaches", "in", "Hong", "Kong", "."], ["You", "can", "ascend", "Victoria", "Peak", "to", "get", "a", "panoramic", "view", "of", "Victoria", "Harbor", "'s", "beautiful", "scenery", "."], ["Or", "hop", "onto", "a", "trolley", "with", "over", "a", "century", "of", "history", ",", "and", "feel", "the", "city", "'s", "blend", "of", "the", "old", "and", "the", "modern", "in", "slow", "motion", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0000_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[70, 70], [485, 486], [500, 500], [73, 73], [55, 55], [153, 154], [366, 366]]], "sentences": [["In", "the", "summer", "of", "2005", ",", "a", "picture", "that", "people", "have", "long", "been", "looking", "forward", "to", "started", "emerging", "with", "frequency", "in", "various", "major", "Hong", "Kong", "media", "."], ["With", "their", "unique", "charm", ",", "these", "well", "-", "known", "cartoon", "images", "once", "again", "caused", "Hong", "Kong", "to", "be", "a", "focus", "of", "worldwide", "attention", "."]]}
diff --git a/test/data_for_tests/coreference/coreference_test.json b/test/data_for_tests/coreference/coreference_test.json
index 399b8cc5..9577da0e 100644
--- a/test/data_for_tests/coreference/coreference_test.json
+++ b/test/data_for_tests/coreference/coreference_test.json
@@ -1,2 +1 @@
-{"doc_key": "bc/cctv/00/cctv_0005_0", "speakers": [["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"], ["Xu_li", "Xu_li", "Xu_li", "Xu_li", "Xu_li"]], "clusters": [[[57, 59], [25, 27], [42, 44]], [[19, 23], [16, 16]], [[83, 83], [82, 82]]], "sentences": [["--", "basically", ",", "it", "was", "unanimously", "agreed", "upon", "by", "the", "various", "relevant", "parties", "."], ["To", "express", "its", "determination", ",", "the", "Chinese", "securities", "regulatory", "department", "compares", "this", "stock", "reform", "to", "a", "die", "that", "has", "been", "cast", "."], ["It", "takes", "time", "to", "prove", "whether", "the", "stock", "reform", "can", "really", "meet", "expectations", ",", "and", "whether", "any", "deviations", "that", "arise", "during", "the", "stock", "reform", "can", "be", "promptly", "corrected", "."], ["Dear", "viewers", ",", "the", "China", 
"News", "program", "will", "end", "here", "."], ["This", "is", "Xu", "Li", "."], ["Thank", "you", "everyone", "for", "watching", "."], ["Coming", "up", "is", "the", "Focus", "Today", "program", "hosted", "by", "Wang", "Shilin", "."], ["Good-bye", ",", "dear", "viewers", "."]]}
-{"doc_key": "bc/cctv/00/cctv_0005_1", "speakers": [["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Wang_shilin", "Wang_shilin"], ["Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua", "Zhou_hanhua"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", 
"Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", 
"Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"], ["Yang_yang", "Yang_yang", "Yang_yang", "Yang_yang"], ["Wang_shilin", "Wang_shilin", "Wang_shilin", "Wang_shilin"]], "clusters": [[[233, 234], [7, 8]], [[253, 254], [438, 439]], [[411, 412], [64, 67], [18, 30], [259, 260], [516, 516]], [[432, 433], [190, 204], [272, 272], [325, 325], [314, 314], [292, 292], [281, 281], [334, 334]], [[310, 311], [299, 300], [321, 321]], [[172, 172], [10, 10]], [[372, 373], [392, 393], [216, 219], [418, 419]], [[29, 30], [108, 109], [112, 113]], [[72, 73], [59, 60], [27, 27]], [[305, 305], [377, 377]], [[502, 503], [444, 447], [459, 460]], [[352, 353], [387, 387], [362, 362], [408, 408], [210, 219], [375, 375], [360, 360], [350, 350]], [[182, 185], [166, 168], [247, 250], [224, 226]], [[383, 384], [51, 60]], [[367, 368], [268, 268], [35, 36], [256, 260]], [[523, 523], [500, 500], [493, 493], [435, 435], [238, 238]], [[228, 229], [187, 188], [170, 171]]], "sentences": [["Hello", ",", "dear", "viewers", "."], ["Welcome", "to", "Focus", "Today", "."], ["Today", ",", "let", "'s", "turn", "our", "attention", "to", "a", "road", "cave", "-", "in", "accident", "that", "happened", "in", "Beijing", "over", "the", "holiday", "."], ["Before", "dawn", "on", "January", "3", ",", "a", "sewage", "pipe", "leakage", "accident", "occurred", "at", "the", "main", "and", "side", "roads", "of", "Jingguang", "Bridge", ",", "East", "Third", "Ring", "Road", ",", "Beijing", "Municipality", ",", "resulting", "in", "the", "road", "caving", "in", "."], ["Relevant", "departments", "from", "Beijing", "Municipality", "promptly", "activated", "emergency", "contingency", "plans", "."], ["The", "traffic", "administration", "department", "carried", "out", "traffic", "supervision", "near", "the", "accident", "scene", "."], ["Well", ",", "how", "did", "the", "emergency", "response", "mechanisms", "activated", 
"by", "governmental", "departments", "operate", "effectively", "during", "the", "holiday", "?"], ["After", "the", "holiday", ",", "what", "will", "be", "done", "to", "handle", "citizens", "'", "peak", "commute", "?"], ["In", "addition", ",", "what", "measures", "did", "relevant", "departments", "take", "to", "resolve", "issues", "such", "as", "waste", "discharge", ",", "heating", ",", "and", "communication", ",", "in", "order", "to", "ensure", "that", "the", "lives", "of", "citizens", "were", "not", "affected", "?"], ["Well", ",", "we", "have", "invited", "two", "honorable", "guests", "to", "the", "studio", "today", "to", "follow", "this", "topic", "with", "us", "."], ["One", "of", "the", "two", "honorable", "guests", "in", "the", "studio", "is", "Professor", "Zhou", "Hanhua", "from", "the", "Institute", "of", "Law", "of", "the", "Chinese", "Academy", "of", "Social", "Sciences", "."], ["Hello", "."], ["Next", "is", "Yang", "Yang", ",", "a", "host", "of", "Beijing", "Traffic", "Radio", "Station", "."], ["Hello", "."], ["Welcome", "both", "of", "you", "to", "the", "studio", "to", "participate", "in", "our", "program", "."], ["Well", ",", "I", "especially", "want", "to", "know", ",", "ha", ",", "how", "the", "two", "of", "you", "found", "out", "the", "news", "on", "the", "day", "of", "the", "accident", "?"], ["Ah", ",", ",", "about", "11:00", "m.", "yesterday", ",", "ah", ",", "I", "happened", "to", "find", "out", "through", "an", "SMS", "when", "I", "was", "outside", "."], ["Uh-huh", "."], ["Uh-huh", "."], ["It", "happened", "that", "I", "was", "going", "to", "have", "lunch", "with", "a", "friend", ",", "um", ",", "at", "noon", "."], ["And", "then", ",", "the", "friend", "first", "sent", "me", "an", "SMS", ",", "Uh-huh", ".", "saying", "he", "would", "come", "pick", "me", "up", "to", "go", "together", "."], ["After", "that", ",", "I", "received", "an", "SMS", "from", "1860", "."], ["Uh-huh", ",", "it", "was", "through", "an", "SMS", "."], ["And", "you", ",", "Yang", 
"Yang", "?"], ["A", "friend", "happened", "to", "call", "me", "."], ["You", "were", "not", "at", "work", "that", "day", "?"], ["No", "."], ["The", "station", "called", "me", "at", "noon", "and", "said", "something", "happened", "at", "Jingguang", "Bridge", "and", "that", "I", "had", "to", "go", "to", "the", "station", "immediately", "to", "research", "the", "upcoming", "program", "."], ["Uh-huh", ",", "that", "means", ",", "er", ",", "you", "found", "out", "the", "accident", "through", "an", "information", "source", "at", "the", "station", "."], ["Right", ",", "right", ",", "right", "."], ["Uh-huh", "."], ["Well", ",", "like", "Professor", "Zhou", ",", "I", "also", "received", "this", "news", ",", "ha", ",", "through", "a", "mobile", "phone", "SMS", "."], ["At", "that", "time", ",", ",", "it", "can", "be", "said", "that", "this", "SMS", "was", "among", "the", "many", ",", "ha", ",", "SMS", "containing", "New", "Year", "wishes", ",", "like", "Happy", "New", "Year", ",", "received", "after", "the", "start", "of", "the", "New", "Year", "."], ["Uh-huh", "."], ["Ah", ",", "actually", "I", "felt", "a", "lot", "of", "warmth", "when", "I", "received", "that", "SMS", "."], ["Although", "we", "live", "in", "the", "west", "instead", "of", "the", "east", "and", "it", "did", "not", "affect", "us", "much", ",", "I", "think", "it", "is", "very", "useful", ",", "ah", ",", "to", "inform", "people", "of", "this", "kind", "of", "news", "."], ["Yes", ",", "exceptionally", "."], ["Yes", ",", "exceptionally", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0005_0", "speakers": [["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"], ["speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1", "speaker#1"]], "clusters": [[[57, 59], [25, 27], [42, 44]]], "sentences": [["--", "basically", ",", "it", "was", "unanimously", "agreed", "upon", "by", "the", "various", "relevant", "parties", "."], ["To", "express", "its", "determination", ",", "the", "Chinese", "securities", "regulatory", "department", "compares", "this", "stock", "reform", "to", "a", "die", "that", "has", "been", "cast", "."]]}
\ No newline at end of file
diff --git a/test/data_for_tests/coreference/coreference_train.json b/test/data_for_tests/coreference/coreference_train.json
index 6932bbb7..0c2940df 100644
--- a/test/data_for_tests/coreference/coreference_train.json
+++ b/test/data_for_tests/coreference/coreference_train.json
@@ -1,2 +1 @@
-{"doc_key": "bc/cctv/00/cctv_0001_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"]], "clusters": [[[113, 114], [42, 45], [88, 91]], [[288, 288], [293, 293]], [[185, 189], [162, 165], [101, 104]], [[232, 233], [209, 209], [253, 253]], [[36, 37], [31, 32]], [[55, 56], [79, 81]], [[283, 283], [269, 275]], [[39, 45], [47, 47]], [[285, 285], [298, 298], [235, 237], [258, 260], [117, 120], [267, 267]], [[75, 77], [51, 53]], [[310, 310], [289, 289], [295, 295]], [[135, 136], [273, 273], [26, 26]], [[200, 201], [182, 183], [179, 180]]], "sentences": [["What", "kind", "of", "memory", "?"], ["We", "respectfully", "invite", "you", "to", "watch", "a", "special", "edition", "of", "Across", "China", "."], ["WW", "II", "Landmarks", "on", "the", "Great", "Earth", "of", "China", ":", "Eternal", "Memories", "of", "Taihang", "Mountain"], 
["Standing", "tall", "on", "Taihang", "Mountain", "is", "the", "Monument", "to", "the", "Hundred", "Regiments", "Offensive", "."], ["It", "is", "composed", "of", "a", "primary", "stele", ",", "secondary", "steles", ",", "a", "huge", "round", "sculpture", "and", "beacon", "tower", ",", "and", "the", "Great", "Wall", ",", "among", "other", "things", "."], ["A", "primary", "stele", ",", "three", "secondary", "steles", ",", "and", "two", "inscribed", "steles", "."], ["The", "Hundred", "Regiments", "Offensive", "was", "the", "campaign", "of", "the", "largest", "scale", "launched", "by", "the", "Eighth", "Route", "Army", "during", "the", "War", "of", "Resistance", "against", "Japan", "."], ["This", "campaign", "broke", "through", "the", "Japanese", "army", "'s", "blockade", "to", "reach", "base", "areas", "behind", "enemy", "lines", ",", "stirring", "up", "anti-Japanese", "spirit", "throughout", "the", "nation", "and", "influencing", "the", "situation", "of", "the", "anti-fascist", "war", "of", "the", "people", "worldwide", "."], ["This", "is", "Zhuanbi", "Village", ",", "Wuxiang", "County", "of", "Shanxi", "Province", ",", "where", "the", "Eighth", "Route", "Army", "was", "headquartered", "back", "then", "."], ["On", "a", "wall", "outside", "the", "headquarters", "we", "found", "a", "map", "."], ["This", "map", "was", "the", "Eighth", "Route", "Army", "'s", "depiction", "of", "the", "Mediterranean", "Sea", "situation", "at", "that", "time", "."], ["This", "map", "reflected", "the", "European", "battlefield", "situation", "."], ["In", "1940", ",", "the", "German", "army", "invaded", "and", "occupied", "Czechoslovakia", ",", "Poland", ",", "the", "Netherlands", ",", "Belgium", ",", "and", "France", "."], ["It", "was", "during", "this", "year", "that", "the", "Japanese", "army", "developed", "a", "strategy", "to", "rapidly", "force", "the", "Chinese", "people", "into", "submission", "by", "the", "end", "of", "1940", "."], ["In", "May", ",", "the", "Japanese", "army", 
"launched", "--"], ["From", "one", "side", ",", "it", "seized", "an", "important", "city", "in", "China", "called", "Yichang", "."], ["Um", ",", ",", "uh", ",", "through", "Yichang", ",", "it", "could", "directly", "reach", "Chongqing", "."], ["Ah", ",", "that", "threatened", "Chongqing", "."], ["Then", "they", "would", ",", "ah", ",", "bomb", "these", "large", "rear", "areas", "such", "as", "Chongqing", "."], ["So", ",", "along", "with", "the", "coordinated", ",", "er", ",", "economic", "blockade", ",", "military", "offensives", ",", "and", "strategic", "bombings", ",", "er", ",", "a", "simultaneous", "attack", "was", "launched", "in", "Hong", "Kong", "to", "lure", "the", "KMT", "government", "into", "surrender", "."], ["The", "progress", "of", "this", "coordinated", "offensive", "was", "already", "very", "entrenched", "by", "then", "."]]}
-{"doc_key": "bc/cctv/00/cctv_0001_1", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", 
"Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", 
"Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang", "Luo_huanzhang"], ["Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1", "Luo_huanzhang,Speaker#1"]], "clusters": [[[129, 131], [167, 169]], [[495, 496], [446, 447], [183, 186]], [[433, 438], [314, 316], [318, 318]], [[154, 157], [531, 534], [436, 438], [139, 142], [43, 45]], [[560, 561], [547, 554], [279, 288]], [[309, 309], [374, 374], [21, 23], [9, 9], [312, 312], [385, 385]], [[212, 213], [193, 197]], [[577, 578], [581, 582]], [[262, 267], [591, 592], [523, 524], [565, 568], [424, 431]], [[255, 256], [28, 32]], [[492, 493], [175, 181], [443, 444]], [[124, 127], [449, 451], [250, 253], [29, 31], [188, 191], [407, 416], [71, 74], [510, 513], [129, 129]], [[63, 67], [139, 146], [76, 78]], [[443, 452], [175, 191]], [[485, 487], [596, 598]], [[517, 524], [556, 556], [526, 526]], [[81, 98], [133, 134]], [[47, 48], [109, 112]], [[348, 353], [365, 365], [388, 390]], [[1, 1], [477, 
477], [267, 267]], [[550, 551], [288, 288], [3, 4], [18, 18]]], "sentences": [["By", "1940", ",", "China", "'s", "War", "of", "Resistance", "against", "Japan", "had", "entered", "a", "stalemate", "."], ["The", "situation", "on", "our", "side", "and", "the", "enemy", "'s", "side", "was", "intertwined", "."], ["The", "Eighth", "Route", "Army", "guerrillas", "were", "extraordinarily", "active", ",", "creating", "more", "and", "more", "trouble", "for", "the", "Japanese", "army", "in", "North", "China", "."], ["Hayao", "Tada", ",", "commander", "of", "the", "Japanese", "North", "China", "Area", "Army", ",", "adopted", "a", "strategy", "of", "siege", "warfare", "to", "deal", "with", "the", "Eighth", "Route", "Army", "."], ["The", "specific", "method", "was", "building", "a", "closely", "connected", "transport", "network", ",", "with", "a", "road", "for", "every", "village", "and", "defensive", "towers", "on", "every", "road", "."], ["Roads", "and", "railways", "were", "used", "as", "links", "to", "connect", "all", "of", "North", "China", "into", "a", "solid", ",", "widespread", "siege", ",", "in", "order", "to", "strangle", "the", "Eighth", "Route", "Army", "and", "its", "base", "areas", "in", "this", "net", "."], ["As", "part", "of", "the", "Japanese", "army", "'s", "strategy", "of", "siege", "warfare", ",", "railways", "and", "roads", "had", "actually", "become", "the", "Japanese", "army", "'s", "weapons", "of", "war", ",", "becoming", "a", "great", "threat", "to", "the", "base", "areas", "."], ["In", "December", "1939", ",", "Commander", "-", "in", "-", "chief", "Zhu", "De", "and", "Vice", "Commander", "Peng", "Dehuai", "of", "the", "Eighth", "Route", "Army", "received", "a", "top", "-", "secret", "telegram", "from", "Commander", "Lu", "Zhengcao", "of", "the", "Jizhong", "Military", "District", ",", "among", "other", "people", "."], ["The", "telegram", "said", "that", "the", "Japanese", "troops", "were", "building", "blockade", "trenches", "and", "chessboard", "-", 
"like", "roads", "to", "divide", "the", "Jizhong", "base", "area", "into", "small", "isolated", "blocks", "without", "the", "ability", "to", "mutually", "communicate", "and", "support", "each", "other", ",", "causing", "the", "Eighth", "Route", "Army", "and", "the", "guerrillas", "to", "lose", "maneuverability", "."], ["Before", "the", "Hundred", "Regiments", "Offensive", "in", "1940", ",", "an", "inclination", "to", "compromise", ",", "ah", ",", "surrender", ",", "was", "an", "extremely", "serious", "crisis", "in", "the", "frontline", "situation", "in", "China", "."], ["Well", ",", "on", "the", "battlefield", "behind", "enemy", "lines", ",", "in", "order", "to", "take", "over", ",", "consolidate", "the", "area", "under", "its", "occupation", ",", "Japan", "began", "a", "new", "strategy", "."], ["That", "was", "to", "use", "railways", "as", "a", "pillar", ",", "roads", "as", "a", "chain", ",", "and", "strongholds", "as", "a", "lock", ",", "to", "carry", "out", "siege", "warfare", "in", "an", "attempt", "to", "divide", "the", "base", "areas", "behind", "enemy", "lines", ",", "ah", ",", "so", "as", ",", "er", ",", "to", "cut", "off", "their", "communication", "with", "one", "another", "."], ["In", "addition", ",", "it", "relied", "on", "this", "cage", ",", "ah", ",", "to", "further", "strengthen", "its", "assaults", "against", "the", "base", "areas", "."], ["Er", "."], ["So", ",", "it", "was", "amidst", "such", "a", "grave", "international", "and", "domestic", "situation", "that", "the", "Eighth", "Route", "Army", "led", "by", "the", "Chinese", "Communist", "Party", ",", "ah", ",", "launched", ",", "ah", ",", "a", "strategic", "offensive", "called", "the", "Hundred", "Regiments", "Offensive", "."], ["This", "plot", "of", "the", "Japanese", "army", "drew", "great", "attention", "from", "Zhu", "De", "and", "Peng", "Dehuai", "of", "Eighth", "Route", "Army", "headquarters", "."], ["After", "meticulous", "studies", "and", "painstaking", "preparations", "by", "many", 
"parties", ",", "a", "battle", "plan", "based", "on", "surprise", "was", "formulated", "."], ["On", "July", "22", ",", "1940", ",", "a", "campaign", "preparation", "order", "to", "attack", "the", "Zhengtai", "Railway", ",", "jointly", "signed", "by", "Zhu", "De", ",", "Peng", "Dehuai", ",", "and", "Zuo", "Quan", ",", "was", "sent", "to", "Yan'an", "and", "all", "units", "of", "the", "Eighth", "Route", "Army", "."], ["What", "was", "the", ",", "purpose", "and", "goal", "of", "this", "campaign", "?"], ["It", "was", "to", "break", "through", "the", "Japanese", "army", "'s", "siege", "policy", "against", "base", "areas", "behind", "enemy", "lines", ",", "and", "to", "avert", "the", "crisis", "of", "China", "'s", "compromise", "and", "surrender", "."], ["It", "was", "to", "overcome", "this", "crisis", "."], ["Well", ",", "the", "Hundred", "Regiments", "Offensive", "was", "divided", "into", "three", "phases", "."], ["Beginning", "from", "August", "20", ",", "from", "August", "20", "to", "September", "10", ",", "the", "main", "purpose", "of", "the", "campaign", "was", "to", "sabotage", "the", "Zhengtai", "Railway", "."]]}
+{"doc_key": "bc/cctv/00/cctv_0001_0", "speakers": [["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"], ["Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1", "Speaker#1"]], "clusters": [[[113, 114], [42, 45], [88, 91]]], "sentences": [["What", "kind", "of", "memory", "?"], ["We", "respectfully", "invite", "you", "to", "watch", "a", "special", "edition", "of", "Across", "China", "."]]}