diff --git a/tests/core/controllers/_test_trainer_jittor.py b/tests/core/controllers/_test_trainer_jittor.py
deleted file mode 100644
index d132c99c..00000000
--- a/tests/core/controllers/_test_trainer_jittor.py
+++ /dev/null
@@ -1,237 +0,0 @@
-import os
-import sys
-import time
-# os.environ["cuda_archs"] = "61"
-# os.environ["FAS"]
-os.environ["log_silent"] = "1"
-sys.path.append("../../../")
-
-from datasets import load_dataset
-from datasets import DatasetDict
-import jittor as jt
-from jittor import nn, Module
-from jittor.dataset import Dataset
-jt.flags.use_cuda = True
-
-from fastNLP.core.controllers.trainer import Trainer
-from fastNLP.core.metrics.accuracy import Accuracy
-from fastNLP.core.vocabulary import Vocabulary
-from fastNLP.core.callbacks.progress_callback import RichCallback
-from fastNLP.core.callbacks.callback import Callback
-from fastNLP.core.dataloaders.jittor_dataloader.fdl import JittorDataLoader
-
-class TextClassificationDataset(Dataset):
-    def __init__(self, dataset):
-        super(TextClassificationDataset, self).__init__()
-        self.dataset = dataset
-        self.set_attrs(total_len=len(dataset))
-
-    def __getitem__(self, idx):
-        return {"x": self.dataset["input_ids"][idx], "y": self.dataset["label"][idx]}
-
-
-class LSTM(Module):
-
-    def __init__(self, num_of_words, hidden_size, features):
-
-        self.embedding = nn.Embedding(num_of_words, features)
-        self.lstm = nn.LSTM(features, hidden_size, batch_first=True)
-        self.layer = nn.Linear(hidden_size, 2)
-        self.softmax = nn.Softmax(dim=1)
-        self.loss_fn = nn.CrossEntropyLoss()
-
-        self.hidden_size = hidden_size
-        self.features = features
-
-    def init_hidden(self, x):
-        # batch_first
-        batch_size = x.shape[0]
-        h0 = jt.randn(1, batch_size, self.hidden_size)
-        c0 = jt.randn(1, batch_size, self.hidden_size)
-
-        return h0, c0
-
-    def execute(self, input_ids):
-
-        output = self.embedding(input_ids)
-        # TODO: strip padding tokens
-        output, (h, c) = self.lstm(output, self.init_hidden(output))
-        # len, batch, hidden_size
-        output = self.layer(output[-1])
-
-        return output
-
-    def train_step(self, x, y):
-        x = self(x)
-        outputs = self.loss_fn(x, y)
-        return {"loss": outputs}
-
-    def evaluate_step(self, x, y):
-        x = self(x)
-        return {"pred": x, "target": y.reshape((-1,))}
-
-
-class PrintWhileTrainingCallBack(Callback):
-    """
-    A Callback that prints the loss during training.
-    """
-
-    def __init__(self, print_every_epoch, print_every_batch):
-        self.print_every_epoch = print_every_epoch
-        self.print_every_batch = print_every_batch
-
-        self.loss = 0
-        self.start = 0
-        self.epoch_start = 0
-
-    def on_train_begin(self, trainer):
-        """
-        Print basic information before training starts.
-        """
-        print("Start training. Total {} epochs and {} batches in each epoch.".format(
-            trainer.n_epochs, trainer.num_batches_per_epoch
-        ))
-        self.start = time.time()
-
-    def on_before_backward(self, trainer, outputs):
-        """
-        Accumulate the loss before every backward pass so the average can be computed.
-        """
-        loss = trainer.extract_loss_from_outputs(outputs)
-        loss = trainer.driver.tensor_to_numeric(loss)
-        self.loss += loss
-
-    def on_train_epoch_begin(self, trainer):
-        self.epoch_start = time.time()
-
-    def on_train_epoch_end(self, trainer):
-        """
-        Every `print_every_epoch` epochs, and at the last epoch, print the
-        average loss and the elapsed time of the current epoch.
-        """
-        if trainer.cur_epoch_idx % self.print_every_epoch == 0 \
-                or trainer.cur_epoch_idx == trainer.n_epochs:
-            print("Epoch: {} Loss: {} Current epoch training time: {}s".format(
-                trainer.cur_epoch_idx, self.loss / trainer.num_batches_per_epoch, time.time() - self.epoch_start
-            ))
-        # reset the accumulated loss
-        self.loss = 0
-
-    def on_train_batch_end(self, trainer):
-        """
-        Every `print_every_batch` batches, and at the last batch of the epoch,
-        print the average loss of the current epoch so far.
-        """
-        if trainer.batch_idx_in_epoch % self.print_every_batch == 0 \
-                or trainer.batch_idx_in_epoch == trainer.num_batches_per_epoch:
-            print("\tBatch: {} Loss: {}".format(
-                trainer.batch_idx_in_epoch, self.loss / trainer.batch_idx_in_epoch
-            ))
-
-    def on_train_end(self, trainer):
-        print("Total training time: {}s".format(time.time() - self.start))
-
-
-def process_data(ds: DatasetDict, vocabulary: Vocabulary, max_len=256) -> DatasetDict:
-    # tokenize and convert tokens to ids
-    ds = ds.map(lambda x: {"input_ids": text_to_id(vocabulary, x["text"], max_len)})
-    ds.set_format(type="numpy", columns=ds.column_names)
-    return ds
-
-def set_vocabulary(vocab, dataset):
-
-    for data in dataset:
-        vocab.update(data["text"].split())
-    return vocab
-
-def text_to_id(vocab, text: str, max_len):
-    text = text.split()
-    # to index
-    ids = [vocab.to_index(word) for word in text]
-    # padding
-    ids += [vocab.padding_idx] * (max_len - len(text))
-    return ids[:max_len]
-
-def get_dataset(name, max_len, train_format="", test_format=""):
-
-    # datasets
-    train_dataset = load_dataset(name, split="train" + train_format).shuffle(seed=123)
-    test_dataset = load_dataset(name, split="test" + test_format).shuffle(seed=321)
-    split = train_dataset.train_test_split(test_size=0.2, seed=123)
-    train_dataset = split["train"]
-    val_dataset = split["test"]
-
-    vocab = Vocabulary()
-    vocab = set_vocabulary(vocab, train_dataset)
-    vocab = set_vocabulary(vocab, val_dataset)
-
-    train_dataset = process_data(train_dataset, vocab, max_len)
-    val_dataset = process_data(val_dataset, vocab, max_len)
-    test_dataset = process_data(test_dataset, vocab, max_len)
-
-    return TextClassificationDataset(train_dataset), TextClassificationDataset(val_dataset), \
-           TextClassificationDataset(test_dataset), vocab
-
-if __name__ == "__main__":
-
-    # training hyper-parameters
-    max_len = 20
-    epochs = 40
-    lr = 1
-    batch_size = 64
-
-    features = 100
-    hidden_size = 128
-
-    # build the datasets
-    # imdb.py SetFit/sst2
-    train_data, val_data, test_data, vocab = get_dataset("SetFit/sst2", max_len, "", "")
-    # wrap them in dataloaders
-    train_dataloader = JittorDataLoader(
-        dataset=train_data,
-        batch_size=batch_size,
-        shuffle=True,
-        num_workers=4,
-    )
-    val_dataloader = JittorDataLoader(
-        dataset=val_data,
-        batch_size=batch_size,
-        shuffle=True,
-        num_workers=4,
-    )
-    test_dataloader = JittorDataLoader(
-        dataset=test_data,
-        batch_size=1,
-        shuffle=False,
-    )
-
-    # initialize the model
-    model = LSTM(len(vocab), hidden_size, features)
-
-    # optimizer
-    # a list of optimizers is also accepted
-    optimizer = nn.SGD(model.parameters(), lr)
-
-    # Metrics
-    metrics = {"acc": Accuracy()}
-
-    # callbacks
-    callbacks = [
-        PrintWhileTrainingCallBack(print_every_epoch=1, print_every_batch=10),
-        # RichCallback(),  # print_every defaults to 1, i.e. the progress bar is updated every batch
-    ]
-
-    trainer = Trainer(
-        model=model,
-        driver="jittor",
-        device=[0,1,2,3,4],
-        optimizers=optimizer,
-        train_dataloader=train_dataloader,
-        validate_dataloaders=val_dataloader,
-        validate_every=-1,
-        input_mapping=None,
-        output_mapping=None,
-        metrics=metrics,
-        n_epochs=epochs,
-        callbacks=callbacks,
-        # progress_bar="raw"
-    )
-    trainer.run()
\ No newline at end of file
diff --git a/tests/core/controllers/imdb.py b/tests/core/controllers/imdb.py
deleted file mode 100644
index cdf59047..00000000
--- a/tests/core/controllers/imdb.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Lint as: python3
-"""IMDB movie reviews dataset."""
-
-import datasets
-from datasets.tasks import TextClassification
-
-
-_DESCRIPTION = """\
-Large Movie Review Dataset.
-This is a dataset for binary sentiment classification containing substantially \
-more data than previous benchmark datasets. We provide a set of 25,000 highly \
-polar movie reviews for training, and 25,000 for testing. There is additional \
-unlabeled data for use as well.\
-"""
-
-_CITATION = """\
-@InProceedings{maas-EtAl:2011:ACL-HLT2011,
-  author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
-  title = {Learning Word Vectors for Sentiment Analysis},
-  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
-  month = {June},
-  year = {2011},
-  address = {Portland, Oregon, USA},
-  publisher = {Association for Computational Linguistics},
-  pages = {142--150},
-  url = {http://www.aclweb.org/anthology/P11-1015}
-}
-"""
-
-_DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
-
-
-class IMDBReviewsConfig(datasets.BuilderConfig):
-    """BuilderConfig for IMDBReviews."""
-
-    def __init__(self, **kwargs):
-        """BuilderConfig for IMDBReviews.
-        Args:
-          **kwargs: keyword arguments forwarded to super.
- """ - super(IMDBReviewsConfig, self).__init__(version=datasets.Version("1.0.0", ""), **kwargs) - - -class Imdb(datasets.GeneratorBasedBuilder): - """IMDB movie reviews dataset.""" - - BUILDER_CONFIGS = [ - IMDBReviewsConfig( - name="plain_text", - description="Plain text", - ) - ] - - def _info(self): - return datasets.DatasetInfo( - description=_DESCRIPTION, - features=datasets.Features( - {"text": datasets.Value("string"), "label": datasets.features.ClassLabel(names=["neg", "pos"])} - ), - supervised_keys=None, - homepage="http://ai.stanford.edu/~amaas/data/sentiment/", - citation=_CITATION, - task_templates=[TextClassification(text_column="text", label_column="label")], - ) - - def _split_generators(self, dl_manager): - archive = dl_manager.download(_DOWNLOAD_URL) - return [ - datasets.SplitGenerator( - name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"} - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"} - ), - datasets.SplitGenerator( - name=datasets.Split("unsupervised"), - gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False}, - ), - ] - - def _generate_examples(self, files, split, labeled=True): - """Generate aclImdb examples.""" - # For labeled examples, extract the label from the path. - if labeled: - label_mapping = {"pos": 1, "neg": 0} - for path, f in files: - if path.startswith(f"aclImdb/{split}"): - label = label_mapping.get(path.split("/")[2]) - if label is not None: - yield path, {"text": f.read().decode("utf-8"), "label": label} - else: - for path, f in files: - if path.startswith(f"aclImdb/{split}"): - if path.split("/")[2] == "unsup": - yield path, {"text": f.read().decode("utf-8"), "label": -1} \ No newline at end of file