增加注释

增加测试文件及测试样例修改部分变量命名
5 years ago · ea5fbc8881
--- a/fastNLP/io/loader/coreference.py
+++ b/fastNLP/io/loader/coreference.py
@@ -22,7 +22,10 @@ class CRLoader(JsonLoader):
        """
    def __init__(self, fields=None, dropna=False):
        super().__init__(fields, dropna)
        self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
        # self.fields = {"doc_key":Const.INPUTS(0),"speakers":Const.INPUTS(1),"clusters":Const.TARGET,"sentences":Const.INPUTS(2)}
        # TODO check 1
        self.fields = {"doc_key": "raw_key", "speakers": "raw_speakers", "clusters": "raw_clusters",
                       "sentences": "raw_words"}

    def _load(self, path):
        """
--- a/fastNLP/io/pipe/coreference.py
+++ b/fastNLP/io/pipe/coreference.py
@@ -22,21 +22,56 @@ class CoreferencePipe(Pipe):
        self.config = config

    def process(self, data_bundle: DataBundle):
        """
        对load进来的数据进一步处理
        原始数据包含：raw_key,raw_speaker,raw_words,raw_clusters
        .. csv-table::
           :header: "raw_key", "raw_speaker","raw_words","raw_clusters"

           "bc/cctv/00/cctv_0000_0", "[["Speaker#1", "Speaker#1"],[]]","[["I","am"],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
           "bc/cctv/00/cctv_0000_1"", "[["Speaker#1", "Speaker#1"],[]]","[["He","is"],[]]","[[[2,3],[6,7]],[[10,12],[20,22]]]"
           "[...]", "[...]","[...]","[...]"

        处理完成后数据包含文章类别、speaker信息、句子信息、句子对应的index、char、句子长度、target：
        .. csv-table::
           :header: "words1", "words2","words3","words4","chars","seq_len","target"

           "bc", "[[0,0],[1,1]]","[["I","am"],[]]",[[1,2],[]],[[[1],[2,3]],[]],[2,3],"[[[2,3],[6,7]],[[10,12],[20,22]]]"
           "[...]", "[...]","[...]","[...]","[...]","[...]","[...]"


        :param data_bundle:
        :return:
        """
        genres = {g: i for i, g in enumerate(["bc", "bn", "mz", "nw", "pt", "tc", "wb"])}
        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name=Const.INPUTS(2))
        vocab = Vocabulary().from_dataset(*data_bundle.datasets.values(), field_name="raw_words")
        vocab.build_vocab()
        word2id = vocab.word2idx
        data_bundle.vocabs = {"vocab":vocab}
        char_dict = get_char_dict(self.config.char_path)
        data_bundle.set_vocab(vocab,"vocab")
        if self.config.char_path:
            char_dict = get_char_dict(self.config.char_path)
        else:
            char_set = set()
            for i,w in enumerate(word2id):
                if i < 2:
                    continue
                for c in w:
                    char_set.add(c)

            char_dict = collections.defaultdict(int)
            char_dict.update({c: i for i, c in enumerate(char_set)})

        for name, ds in data_bundle.datasets.items():
            # genre
            ds.apply(lambda x: genres[x[Const.INPUTS(0)][:2]], new_field_name=Const.INPUTS(0))
            ds.apply(lambda x: genres[x["raw_key"][:2]], new_field_name=Const.INPUTS(0))

            # speaker_ids_np
            ds.apply(lambda x: speaker2numpy(x[Const.INPUTS(1)], self.config.max_sentences, is_train=name == 'train'),
            ds.apply(lambda x: speaker2numpy(x["raw_speakers"], self.config.max_sentences, is_train=name == 'train'),
                     new_field_name=Const.INPUTS(1))

            # sentences
            ds.rename_field("raw_words",Const.INPUTS(2))

            # doc_np
            ds.apply(lambda x: doc2numpy(x[Const.INPUTS(2)], word2id, char_dict, max(self.config.filter),
                                                    self.config.max_sentences, is_train=name == 'train')[0],
@@ -50,6 +85,9 @@ class CoreferencePipe(Pipe):
                                                    self.config.max_sentences, is_train=name == 'train')[2],
                     new_field_name=Const.INPUT_LEN)

            # clusters
            ds.rename_field("raw_clusters", Const.TARGET)


            ds.set_ignore_type(Const.TARGET)
            ds.set_padder(Const.TARGET, None)
--- a/reproduction/coreference_resolution/train.py
+++ b/reproduction/coreference_resolution/train.py
@@ -37,15 +37,15 @@ if __name__ == "__main__":

    print(config)

    @cache_results('cache.pkl')
    # @cache_results('cache.pkl')
    def cache():
        bundle = CoreferencePipe(Config()).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
        bundle = CoreferencePipe(config).process_from_file({'train': config.train_path, 'dev': config.dev_path,'test': config.test_path})
        return bundle
    data_info = cache()
    print("数据集划分：\ntrain:", str(len(data_info.datasets["train"])),
          "\ndev:" + str(len(data_info.datasets["dev"])) + "\ntest:" + str(len(data_info.datasets["test"])))
    data_bundle = cache()
    print("数据集划分：\ntrain:", str(len(data_bundle.get_dataset("train"))),
          "\ndev:" + str(len(data_bundle.get_dataset("dev"))) + "\ntest:" + str(len(data_bundle.get_dataset('test'))))
    # print(data_info)
    model = Model(data_info.vocabs['vocab'], config)
    model = Model(data_bundle.vocabs['vocab'], config)
    print(model)

    loss = SoftmaxLoss()
@@ -56,8 +56,8 @@ if __name__ == "__main__":

    lr_decay_callback = LRCallback(optim.param_groups, config.lr_decay)

    trainer = Trainer(model=model, train_data=data_info.datasets["train"], dev_data=data_info.datasets["dev"],
                      loss=loss, metrics=metric, check_code_level=-1,sampler=None,
    trainer = Trainer(model=model, train_data=data_bundle.datasets["train"], dev_data=data_bundle.datasets["dev"],
                      loss=loss, metrics=metric, check_code_level=-1, sampler=None,
                      batch_size=1, device=torch.device("cuda:" + config.cuda), metric_key='f', n_epochs=config.epoch,
                      optimizer=optim,
                      save_path='/remote-home/xxliu/pycharm/fastNLP/fastNLP/reproduction/coreference_resolution/save',
--- a/test/data_for_tests/coreference/coreference_dev.json
+++ b/test/data_for_tests/coreference/coreference_dev.json
--- a/test/data_for_tests/coreference/coreference_test.json
+++ b/test/data_for_tests/coreference/coreference_test.json
--- a/test/data_for_tests/coreference/coreference_train.json
+++ b/test/data_for_tests/coreference/coreference_train.json
--- a/test/io/loader/test_coreference_loader.py
+++ b/test/io/loader/test_coreference_loader.py
@@ -0,0 +1,16 @@
 from fastNLP.io.loader.coreference import CRLoader
 import unittest

 class TestCR(unittest.TestCase):
    def test_load(self):

        test_root = "../../data_for_tests/coreference/"
        train_path = test_root+"coreference_train.json"
        dev_path = test_root+"coreference_dev.json"
        test_path = test_root+"coreference_test.json"
        paths = {"train": train_path,"dev":dev_path,"test":test_path}

        bundle1 = CRLoader().load(paths)
        bundle2 = CRLoader().load(test_root)
        print(bundle1)
        print(bundle2)
--- a/test/io/pipe/test_coreference.py
+++ b/test/io/pipe/test_coreference.py
@@ -0,0 +1,24 @@
 import unittest
 from fastNLP.io.pipe.coreference import CoreferencePipe


 class TestCR(unittest.TestCase):

    def test_load(self):
        class Config():
            max_sentences = 50
            filter = [3, 4, 5]
            char_path = None
        config = Config()

        file_root_path = "../../data_for_tests/coreference/"
        train_path = file_root_path + "coreference_train.json"
        dev_path = file_root_path + "coreference_dev.json"
        test_path = file_root_path + "coreference_test.json"

        paths = {"train": train_path, "dev": dev_path, "test": test_path}

        bundle1 = CoreferencePipe(config).process_from_file(paths)
        bundle2 = CoreferencePipe(config).process_from_file(file_root_path)
        print(bundle1)
        print(bundle2)