@@ -0,0 +1,56 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

# __author__="Danqing Wang"

#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import os
import sys
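
# NOTE: hard-coded path to a local FastNLP development checkout; adjust to your environment.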
sys.path.append('/remote-home/dqwang/FastNLP/fastNLP_brxx/')

from fastNLP.core.const import Const

from data.dataloader import SummarizationLoader
from tools.data import ExampleSet, Vocab
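
# Preprocessing settings shared by both loading pipelines below.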
vocab_size = 100000
vocab_path = "test/testdata/vocab"
sent_max_len = 100
doc_max_timesteps = 50
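
# Toggle between the small bundled test data and the full CNN/DM labelled splits.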
# paths = {"train": "test/testdata/train.jsonl", "valid": "test/testdata/val.jsonl"}
paths = {"train": "/remote-home/dqwang/Datasets/CNNDM/train.label.jsonl", "valid": "/remote-home/dqwang/Datasets/CNNDM/val.label.jsonl"}
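
# First pipeline: run the file through the fastNLP SummarizationLoader under test.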
sum_loader = SummarizationLoader()
dataInfo = sum_loader.process(paths=paths, vocab_size=vocab_size, vocab_path=vocab_path, sent_max_len=sent_max_len, doc_max_timesteps=doc_max_timesteps, load_vocab_file=True)
trainset = dataInfo.datasets["train"]
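
# Second pipeline: the same file loaded with the original Vocab/ExampleSet code,
# which the asserts below treat as the reference.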
vocab = Vocab(vocab_path, vocab_size)
dataset = ExampleSet(paths["train"], vocab, doc_max_timesteps, sent_max_len)
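
# Single-example spot checks, left commented out for debugging.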
# print(trainset[0]["text"])
# print(dataset.get_example(0).original_article_sents)
# print(trainset[0]["words"])
# print(dataset[0][0].numpy().tolist())
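
# Walk the whole training set and require that both pipelines agree
# example-by-example on word ids, sequence lengths, and labels.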
b_size = len(trainset)
for i in range(b_size):
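    # Skip the first 7328 examples; this looks like a hard-coded resume point
    # left over from an earlier, partially completed run.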
    if i <= 7327:
        continue
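    # Print both encodings so a mismatching example can be inspected by eye.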
    print(trainset[i][Const.INPUT])
    print(dataset[i][0].numpy().tolist())
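    # ExampleSet items index as 0 = input word ids, 1 = labels, 2 = input lengths,
    # matching the fastNLP INPUT / TARGET / INPUT_LEN fields.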
    assert trainset[i][Const.INPUT] == dataset[i][0].numpy().tolist(), i
    assert trainset[i][Const.INPUT_LEN] == dataset[i][2].numpy().tolist(), i
    assert trainset[i][Const.TARGET] == dataset[i][1].numpy().tolist(), i