
Delete unnecessary files

tags/v1.0.0alpha
x54-729 2 years ago
parent
commit
2797c0a961
2 changed files with 0 additions and 347 deletions
  1. +0
    -237
      tests/core/controllers/_test_trainer_jittor.py
  2. +0
    -110
      tests/core/controllers/imdb.py

+ 0
- 237
tests/core/controllers/_test_trainer_jittor.py

@@ -1,237 +0,0 @@
import os
import sys
import time
# os.environ["cuda_archs"] = "61"
# os.environ["FAS"]
os.environ["log_silent"] = "1"
sys.path.append("../../../")
from datasets import load_dataset
from datasets import DatasetDict
import jittor as jt
from jittor import nn, Module
from jittor.dataset import Dataset
jt.flags.use_cuda = True
from fastNLP.core.controllers.trainer import Trainer
from fastNLP.core.metrics.accuracy import Accuracy
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.callbacks.progress_callback import RichCallback
from fastNLP.core.callbacks.callback import Callback
from fastNLP.core.dataloaders.jittor_dataloader.fdl import JittorDataLoader
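# Adapter exposing a preprocessed HuggingFace dataset split through Jittor's Dataset interface.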
class TextClassificationDataset(Dataset):
    def __init__(self, dataset):
        super(TextClassificationDataset, self).__init__()
        self.dataset = dataset
        self.set_attrs(total_len=len(dataset))

    def __getitem__(self, idx):
        return {"x": self.dataset["input_ids"][idx], "y": self.dataset["label"][idx]}
class LSTM(Module):
    def __init__(self, num_of_words, hidden_size, features):
        self.embedding = nn.Embedding(num_of_words, features)
        self.lstm = nn.LSTM(features, hidden_size, batch_first=True)
        self.layer = nn.Linear(hidden_size, 2)
        self.softmax = nn.Softmax(dim=1)
        self.loss_fn = nn.CrossEntropyLoss()
        self.hidden_size = hidden_size
        self.features = features

    def init_hidden(self, x):
        # batch_first
        batch_size = x.shape[0]
        h0 = jt.randn(1, batch_size, self.hidden_size)
        c0 = jt.randn(1, batch_size, self.hidden_size)
        return h0, c0

    def execute(self, input_ids):
        output = self.embedding(input_ids)
        # TODO: strip padding before feeding the LSTM
        output, (h, c) = self.lstm(output, self.init_hidden(output))
        # output: (batch, seq_len, hidden_size) because batch_first=True; take the last timestep
        output = self.layer(output[:, -1])
        return output

    def train_step(self, x, y):
        x = self(x)
        outputs = self.loss_fn(x, y)
        return {"loss": outputs}

    def evaluate_step(self, x, y):
        x = self(x)
        return {"pred": x, "target": y.reshape((-1,))}
class PrintWhileTrainingCallBack(Callback):
    """
    Callback that prints the loss during training.
    """
    def __init__(self, print_every_epoch, print_every_batch):
        self.print_every_epoch = print_every_epoch
        self.print_every_batch = print_every_batch
        self.loss = 0
        self.start = 0
        self.epoch_start = 0

    def on_train_begin(self, trainer):
        """
        Print a summary before training starts.
        """
        print("Start training. Total {} epochs and {} batches in each epoch.".format(
            trainer.n_epochs, trainer.num_batches_per_epoch
        ))
        self.start = time.time()

    def on_before_backward(self, trainer, outputs):
        """
        Accumulate the loss before each backward pass so an average can be computed.
        """
        loss = trainer.extract_loss_from_outputs(outputs)
        loss = trainer.driver.tensor_to_numeric(loss)
        self.loss += loss

    def on_train_epoch_begin(self, trainer):
        self.epoch_start = time.time()

    def on_train_epoch_end(self, trainer):
        """
        Every `print_every_epoch` epochs, and at the last epoch, print the average loss
        of the current epoch and the time it took.
        """
        if trainer.cur_epoch_idx % self.print_every_epoch == 0 \
                or trainer.cur_epoch_idx == trainer.n_epochs:
            print("Epoch: {} Loss: {} Current epoch training time: {}s".format(
                trainer.cur_epoch_idx, self.loss / trainer.num_batches_per_epoch, time.time() - self.epoch_start
            ))
        # reset the accumulated loss
        self.loss = 0

    def on_train_batch_end(self, trainer):
        """
        Every `print_every_batch` batches, and at the last batch of the epoch, print the
        average loss of the current epoch so far.
        """
        if trainer.batch_idx_in_epoch % self.print_every_batch == 0 \
                or trainer.batch_idx_in_epoch == trainer.num_batches_per_epoch:
            print("\tBatch: {} Loss: {}".format(
                trainer.batch_idx_in_epoch, self.loss / trainer.batch_idx_in_epoch
            ))

    def on_train_end(self, trainer):
        print("Total training time: {}s".format(time.time() - self.start))
def process_data(ds: DatasetDict, vocabulary: Vocabulary, max_len=256) -> DatasetDict:
    # tokenize and map each word to its index
    ds = ds.map(lambda x: {"input_ids": text_to_id(vocabulary, x["text"], max_len)})
    ds.set_format(type="numpy", columns=ds.column_names)
    return ds

def set_vocabulary(vocab, dataset):
    for data in dataset:
        vocab.update(data["text"].split())
    return vocab

def text_to_id(vocab, text: str, max_len):
    text = text.split()
    # to index
    ids = [vocab.to_index(word) for word in text]
    # pad to max_len, then truncate longer sequences
    ids += [vocab.padding_idx] * (max_len - len(text))
    return ids[:max_len]

def get_dataset(name, max_len, train_format="", test_format=""):
    # load and shuffle the datasets
    train_dataset = load_dataset(name, split="train" + train_format).shuffle(seed=123)
    test_dataset = load_dataset(name, split="test" + test_format).shuffle(seed=321)
    split = train_dataset.train_test_split(test_size=0.2, seed=123)
    train_dataset = split["train"]
    val_dataset = split["test"]
    vocab = Vocabulary()
    vocab = set_vocabulary(vocab, train_dataset)
    vocab = set_vocabulary(vocab, val_dataset)
    train_dataset = process_data(train_dataset, vocab, max_len)
    val_dataset = process_data(val_dataset, vocab, max_len)
    test_dataset = process_data(test_dataset, vocab, max_len)
    return TextClassificationDataset(train_dataset), TextClassificationDataset(val_dataset), \
        TextClassificationDataset(test_dataset), vocab
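# Entry point: build the SST-2 data pipeline and train the LSTM with fastNLP's Trainer on the Jittor driver.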
if __name__ == "__main__":
    # training hyperparameters
    max_len = 20
    epochs = 40
    lr = 1
    batch_size = 64
    features = 100
    hidden_size = 128

    # load the datasets
    # imdb.py SetFit/sst2
    train_data, val_data, test_data, vocab = get_dataset("SetFit/sst2", max_len, "", "")

    # wrap them in dataloaders
    train_dataloader = JittorDataLoader(
        dataset=train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    val_dataloader = JittorDataLoader(
        dataset=val_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    test_dataloader = JittorDataLoader(
        dataset=test_data,
        batch_size=1,
        shuffle=False,
    )

    # initialize the model
    model = LSTM(len(vocab), hidden_size, features)

    # optimizer
    # can also be a list of several optimizers
    optimizer = nn.SGD(model.parameters(), lr)

    # metrics
    metrics = {"acc": Accuracy()}

    # callbacks
    callbacks = [
        PrintWhileTrainingCallBack(print_every_epoch=1, print_every_batch=10),
        # RichCallback(),  # print_every defaults to 1, i.e. the progress bar updates after every batch
    ]

    trainer = Trainer(
        model=model,
        driver="jittor",
        device=[0, 1, 2, 3, 4],
        optimizers=optimizer,
        train_dataloader=train_dataloader,
        validate_dataloaders=val_dataloader,
        validate_every=-1,
        input_mapping=None,
        output_mapping=None,
        metrics=metrics,
        n_epochs=epochs,
        callbacks=callbacks,
        # progress_bar="raw"
    )
    trainer.run()

+ 0
- 110
tests/core/controllers/imdb.py

@@ -1,110 +0,0 @@
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Lint as: python3
"""IMDB movie reviews dataset."""

import datasets
from datasets.tasks import TextClassification


_DESCRIPTION = """\
Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially \
more data than previous benchmark datasets. We provide a set of 25,000 highly \
polar movie reviews for training, and 25,000 for testing. There is additional \
unlabeled data for use as well.\
"""

_CITATION = """\
@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}
"""

_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"


class IMDBReviewsConfig(datasets.BuilderConfig):
    """BuilderConfig for IMDBReviews."""

    def __init__(self, **kwargs):
        """BuilderConfig for IMDBReviews.
        Args:
            **kwargs: keyword arguments forwarded to super.
        """
        super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs)


class Imdb(datasets.GeneratorBasedBuilder):
    """IMDB movie reviews dataset."""

    BUILDER_CONFIGS = [
        IMDBReviewsConfig(
            name="plain_text",
            description="Plain text",
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])}
            ),
            supervised_keys=None,
            homepage="http://ai.stanford.edu/~amaas/data/sentiment/",
            citation=_CITATION,
            task_templates=[TextClassification(text_column="text", label_column="label")],
        )

    def _split_generators(self, dl_manager):
        archive = dl_manager.download(_DOWNLOAD_URL)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
            ),
            datasets.SplitGenerator(
                name=datasets.Split("unsupervised"),
                gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
            ),
        ]

    def _generate_examples(self, files, split, labeled=True):
        """Generate aclImdb examples."""
        # For labeled examples, extract the label from the path.
        if labeled:
            label_mapping = {"pos": 1, "neg": 0}
            for path, f in files:
                if path.startswith(f"aclImdb/{split}"):
                    label = label_mapping.get(path.split("/")[2])
                    if label is not None:
                        yield path, {"text": f.read().decode("utf-8"), "label": label}
        else:
            for path, f in files:
                if path.startswith(f"aclImdb/{split}"):
                    if path.split("/")[2] == "unsup":
                        yield path, {"text": f.read().decode("utf-8"), "label": -1}
