@@ -0,0 +1,56 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# __author__="Danqing Wang"

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os
import sys
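
# NOTE: hard-coded path to a local FastNLP development checkout; adjust to your environment.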
sys.path.append('/remote-home/dqwang/FastNLP/fastNLP_brxx/')

from fastNLP.core.const import Const

from data.dataloader import SummarizationLoader
from tools.data import ExampleSet, Vocab
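
# Preprocessing settings shared by both loading pipelines below.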
vocab_size = 100000
vocab_path = "test/testdata/vocab"
sent_max_len = 100
doc_max_timesteps = 50
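
# Toggle between the small bundled test data and the full CNN/DM labelled splits.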
# paths = {"train": "test/testdata/train.jsonl", "valid": "test/testdata/val.jsonl"}
paths = {"train": "/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl", "valid": "/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl"}
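
# First pipeline: run the file through the fastNLP SummarizationLoader under test.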
sum_loader = SummarizationLoader()
dataInfo = sum_loader.process(paths=paths, vocab_size=vocab_size, vocab_path=vocab_path, sent_max_len=sent_max_len, doc_max_timesteps=doc_max_timesteps, load_vocab_file=True)
trainset = dataInfo.datasets["train"]
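
# Second pipeline: the same file loaded with the original Vocab/ExampleSet code,
# which the asserts below treat as the reference.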
vocab = Vocab(vocab_path, vocab_size)
dataset = ExampleSet(paths["train"], vocab, doc_max_timesteps, sent_max_len)
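
# Single-example spot checks, left commented out for debugging.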
# print(trainset[0]["text"])
# print(dataset.get_example(0).original_article_sents)
# print(trainset[0]["words"])
# print(dataset[0][0].numpy().tolist())
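
# Walk the whole training set and require that both pipelines agree
# example-by-example on word ids, sequence lengths, and labels.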
b_size = len(trainset)
for i in range(b_size):
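    # Skip the first 7328 examples; this looks like a hard-coded resume point
    # left over from an earlier, partially completed run.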
    if i <= 7327:
        continue
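    # Print both encodings so a mismatching example can be inspected by eye.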
    print(trainset[i][Const.INPUT])
    print(dataset[i][0].numpy().tolist())
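    # ExampleSet items index as 0 = input word ids, 1 = labels, 2 = input lengths,
    # matching the fastNLP INPUT / TARGET / INPUT_LEN fields.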
    assert trainset[i][Const.INPUT] == dataset[i][0].numpy().tolist(), i
    assert trainset[i][Const.INPUT_LEN] == dataset[i][2].numpy().tolist(), i
    assert trainset[i][Const.TARGET] == dataset[i][1].numpy().tolist(), i