import os
import sys
import time

# os.environ["cuda_archs"] = "61"
# os.environ["FAS"]
os.environ["log_silent"] = "1"
sys.path.append("../../../")

from datasets import load_dataset
from datasets import DatasetDict

import jittor as jt
from jittor import nn, Module
from jittor.dataset import Dataset

# run all jittor computation on the GPU
jt.flags.use_cuda = True

from fastNLP.core.controllers.trainer import Trainer
from fastNLP.core.metrics.accuracy import Accuracy
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.callbacks.progress_callback import RichCallback
from fastNLP.core.callbacks.callback import Callback
from fastNLP.core.dataloaders.jittor_dataloader.fdl import JittorDataLoader

class TextClassificationDataset(Dataset):
    """Wraps a processed huggingface dataset split for use with JittorDataLoader."""

    def __init__(self, dataset):
        super(TextClassificationDataset, self).__init__()
        self.dataset = dataset
        self.set_attrs(total_len=len(dataset))

    def __getitem__(self, idx):
        return {"x": self.dataset["input_ids"][idx], "y": self.dataset["label"][idx]}

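# Example (hypothetical usage, not part of the pipeline): reading one sample
# from the wrapper after get_dataset() below has produced the splits.
#   ds = TextClassificationDataset(train_dataset)
#   ds[0]  # -> {"x": array of token ids, "y": label}
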
class LSTM(Module):

    def __init__(self, num_of_words, hidden_size, features):
        self.embedding = nn.Embedding(num_of_words, features)
        self.lstm = nn.LSTM(features, hidden_size, batch_first=True)
        self.layer = nn.Linear(hidden_size, 2)
        # unused in execute(); CrossEntropyLoss operates on the raw logits
        self.softmax = nn.Softmax(dim=1)
        self.loss_fn = nn.CrossEntropyLoss()

        self.hidden_size = hidden_size
        self.features = features

    def init_hidden(self, x):
        # batch_first: x is (batch, len, features)
        batch_size = x.shape[0]
        # randomly initialized hidden/cell states, shape (num_layers, batch, hidden_size)
        h0 = jt.randn(1, batch_size, self.hidden_size)
        c0 = jt.randn(1, batch_size, self.hidden_size)

        return h0, c0

    def execute(self, input_ids):

        output = self.embedding(input_ids)
        # TODO: strip the padding tokens
        output, (h, c) = self.lstm(output, self.init_hidden(output))
        # with batch_first=True, output is (batch, len, hidden_size);
        # classify on the output of the last timestep
        output = self.layer(output[:, -1, :])

        return output

    def train_step(self, x, y):
        x = self(x)
        outputs = self.loss_fn(x, y)
        return {"loss": outputs}

    def evaluate_step(self, x, y):
        x = self(x)
        return {"pred": x, "target": y.reshape((-1,))}
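
# A quick sanity check of the model above (a hypothetical snippet, not part of
# the training pipeline): run a fake batch through the network and confirm the
# logits come out as (batch, 2).
#   model = LSTM(num_of_words=1000, hidden_size=128, features=100)
#   fake_batch = jt.randint(0, 1000, (4, 20))  # (batch, seq_len) token ids
#   print(model(fake_batch).shape)             # expected: [4, 2]
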
class PrintWhileTrainingCallBack(Callback):
    """
    A Callback that prints the loss during training.
    """

    def __init__(self, print_every_epoch, print_every_batch):
        self.print_every_epoch = print_every_epoch
        self.print_every_batch = print_every_batch

        self.loss = 0
        self.start = 0
        self.epoch_start = 0

    def on_train_begin(self, trainer):
        """
        Print a summary before training starts.
        """
        print("Start training. Total {} epochs and {} batches in each epoch.".format(
            trainer.n_epochs, trainer.num_batches_per_epoch
        ))
        self.start = time.time()

    def on_before_backward(self, trainer, outputs):
        """
        Accumulate the loss before every backward pass, so the average
        can be computed later.
        """
        loss = trainer.extract_loss_from_outputs(outputs)
        loss = trainer.driver.tensor_to_numeric(loss)
        self.loss += loss

    def on_train_epoch_begin(self, trainer):
        self.epoch_start = time.time()

    def on_train_epoch_end(self, trainer):
        """
        Every `print_every_epoch` epochs, and at the last epoch, print the
        average loss and the wall-clock time of the current epoch.
        """
        if trainer.cur_epoch_idx % self.print_every_epoch == 0 \
                or trainer.cur_epoch_idx == trainer.n_epochs:
            print("Epoch: {} Loss: {} Current epoch training time: {}s".format(
                trainer.cur_epoch_idx, self.loss / trainer.num_batches_per_epoch, time.time() - self.epoch_start
            ))
        # reset the accumulated loss
        self.loss = 0

    def on_train_batch_end(self, trainer):
        """
        Every `print_every_batch` batches, and at the last batch of the epoch,
        print the running average loss of the current epoch so far.
        """
        if trainer.batch_idx_in_epoch % self.print_every_batch == 0 \
                or trainer.batch_idx_in_epoch == trainer.num_batches_per_epoch:
            print("\tBatch: {} Loss: {}".format(
                trainer.batch_idx_in_epoch, self.loss / trainer.batch_idx_in_epoch
            ))

    def on_train_end(self, trainer):
        print("Total training time: {}s".format(time.time() - self.start))


def process_data(ds: DatasetDict, vocabulary: Vocabulary, max_len=256) -> DatasetDict:
    # tokenize the text and convert the tokens to padded id sequences
    ds = ds.map(lambda x: {"input_ids": text_to_id(vocabulary, x["text"], max_len)})
    ds.set_format(type="numpy", columns=ds.column_names)
    return ds


def set_vocabulary(vocab, dataset):
    # build the vocabulary from whitespace-tokenized text
    for data in dataset:
        vocab.update(data["text"].split())
    return vocab


def text_to_id(vocab, text: str, max_len):
    text = text.split()
    # to index
    ids = [vocab.to_index(word) for word in text]
    # pad with the padding index, then truncate to max_len
    ids += [vocab.padding_idx] * (max_len - len(text))
    return ids[:max_len]
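
# Illustration of the padding/truncation behavior (hypothetical indices,
# assuming vocab.padding_idx == 0):
#   text_to_id(vocab, "a b", 4)        -> [id_a, id_b, 0, 0]
#   text_to_id(vocab, "a b c d e", 4)  -> [id_a, id_b, id_c, id_d]  (truncated)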


def get_dataset(name, max_len, train_format="", test_format=""):
    # load and shuffle the huggingface datasets, then carve a 20%
    # validation split out of the training set
    train_dataset = load_dataset(name, split="train" + train_format).shuffle(seed=123)
    test_dataset = load_dataset(name, split="test" + test_format).shuffle(seed=321)
    split = train_dataset.train_test_split(test_size=0.2, seed=123)
    train_dataset = split["train"]
    val_dataset = split["test"]

    # the vocabulary is built from the train and validation splits only
    vocab = Vocabulary()
    vocab = set_vocabulary(vocab, train_dataset)
    vocab = set_vocabulary(vocab, val_dataset)

    train_dataset = process_data(train_dataset, vocab, max_len)
    val_dataset = process_data(val_dataset, vocab, max_len)
    test_dataset = process_data(test_dataset, vocab, max_len)

    return TextClassificationDataset(train_dataset), TextClassificationDataset(val_dataset), \
        TextClassificationDataset(test_dataset), vocab


if __name__ == "__main__":

    # training hyperparameters
    max_len = 20
    epochs = 40
    lr = 1
    batch_size = 64

    features = 100
    hidden_size = 128

    # load the dataset
    # imdb.py SetFit/sst2
    train_data, val_data, test_data, vocab = get_dataset("SetFit/sst2", max_len, "", "")
    # build the dataloaders
    train_dataloader = JittorDataLoader(
        dataset=train_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    val_dataloader = JittorDataLoader(
        dataset=val_data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
    )
    test_dataloader = JittorDataLoader(
        dataset=test_data,
        batch_size=1,
        shuffle=False,
    )
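
    # Each batch yielded by JittorDataLoader is a dict with the same keys as
    # TextClassificationDataset.__getitem__, e.g.
    #   {"x": (batch_size, max_len) int array, "y": (batch_size,) labels}
    # fastNLP matches these keys by name to the x/y parameters of the model's
    # train_step/evaluate_step.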

    # initialize the model
    model = LSTM(len(vocab), hidden_size, features)

    # optimizer
    # a list of several optimizers is also accepted
    optimizer = nn.SGD(model.parameters(), lr)

    # metrics
    metrics = {"acc": Accuracy()}

    # callbacks
    callbacks = [
        PrintWhileTrainingCallBack(print_every_epoch=1, print_every_batch=10),
        # RichCallback(),  # print_every defaults to 1, i.e. the progress bar updates every batch
    ]

    trainer = Trainer(
        model=model,
        driver="jittor",
        device=[0, 1, 2, 3, 4],
        optimizers=optimizer,
        train_dataloader=train_dataloader,
        validate_dataloaders=val_dataloader,
        validate_every=-1,
        input_mapping=None,
        output_mapping=None,
        metrics=metrics,
        n_epochs=epochs,
        callbacks=callbacks,
        # progress_bar="raw"
    )
    trainer.run()
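
    # Note: test_dataloader is built above but never passed to the Trainer.
    # A separate evaluation pass could be run after training; a sketch,
    # assuming fastNLP's Evaluator API:
    #   from fastNLP.core.controllers.evaluator import Evaluator
    #   evaluator = Evaluator(model=model, driver="jittor",
    #                         dataloaders=test_dataloader, metrics=metrics)
    #   evaluator.run()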