| @@ -1,14 +1,14 @@ | |||||
| #!/bin/bash | #!/bin/bash | ||||
| module load anaconda/2020.11 | |||||
| module load cuda/10.2 | |||||
| module load cudnn/8.1.1.33_CUDA10.2 | |||||
| #cd run | |||||
| #cd machineLearningTemplate | |||||
| #conda create --name py37 python=3.7 | |||||
| source activate py37 | |||||
| module load anaconda/2020.11 | |||||
| module load cuda/11.1 | |||||
| module load cudnn/8.1.1.33_CUDA11.1 | |||||
| cd run | |||||
| cd machineLearningScaffold | |||||
| #conda create --name py39 python=3.9 | |||||
| source activate py39 | |||||
| #pip install -r requirements.txt | #pip install -r requirements.txt | ||||
| @@ -1,92 +0,0 @@ | |||||
| { | |||||
| "cells": [ | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": null, | |||||
| "id": "4c4d04db-6fe0-4511-abb3-c746ba869863", | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "!apt-get update\n", | |||||
| "!apt-get install p7zip-full -y" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": null, | |||||
| "id": "7c027a62-8116-4c55-9a90-95c4d90dd087", | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "!7z x dataset.zip -o/hy-tmp\n", | |||||
| "!7z x machineLearningScaffold.zip -o/hy-tmp\n", | |||||
| "!7z x vit_pre-train_checkpoint.zip -o/hy-tmp\n", | |||||
| "!mv -f /hy-tmp/pre-train_checkpoint/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz /hy-tmp/pre-train_checkpoint" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": 8, | |||||
| "id": "4c6dc6a1-0208-4efb-beaf-b1264a94ba71", | |||||
| "metadata": {}, | |||||
| "outputs": [ | |||||
| { | |||||
| "name": "stdout", | |||||
| "output_type": "stream", | |||||
| "text": [ | |||||
| "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.2.4 is available.\n", | |||||
| "You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\n" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "source": [ | |||||
| "!pip install -r /hy-tmp/requirements.txt > /dev/null" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": null, | |||||
| "id": "2c074f52-d27e-4605-8b4f-45b11a2f1eb1", | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "import main\n", | |||||
| "%load_ext tensorboard" | |||||
| ] | |||||
| }, | |||||
| { | |||||
| "cell_type": "code", | |||||
| "execution_count": null, | |||||
| "id": "a5a8d64e-d90e-467f-a3b4-db8b31874044", | |||||
| "metadata": {}, | |||||
| "outputs": [], | |||||
| "source": [ | |||||
| "main.main('fit', gpus=1, dataset_path='./dataset/MFNet(RGB-T)', num_workers=2, max_epochs=60, batch_size=16, precision=16, seed=1234, # tpu_cores=8,\n", | |||||
| " checkpoint_every_n_val=1, save_name='version_0', #checkpoint_path='/version_15/checkpoints/epoch=59-step=4739.ckpt',\n", | |||||
| " path_final_save='./drive/MyDrive', save_top_k=1\n", | |||||
| " )" | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "metadata": { | |||||
| "kernelspec": { | |||||
| "display_name": "Python 3", | |||||
| "language": "python", | |||||
| "name": "python3" | |||||
| }, | |||||
| "language_info": { | |||||
| "codemirror_mode": { | |||||
| "name": "ipython", | |||||
| "version": 3 | |||||
| }, | |||||
| "file_extension": ".py", | |||||
| "mimetype": "text/x-python", | |||||
| "name": "python", | |||||
| "nbconvert_exporter": "python", | |||||
| "pygments_lexer": "ipython3", | |||||
| "version": "3.8.10" | |||||
| } | |||||
| }, | |||||
| "nbformat": 4, | |||||
| "nbformat_minor": 5 | |||||
| } | |||||
| @@ -1,3 +1,5 @@ | |||||
| import torch | |||||
| from save_checkpoint import SaveCheckpoint | from save_checkpoint import SaveCheckpoint | ||||
| from data_module import DataModule | from data_module import DataModule | ||||
| from pytorch_lightning import loggers as pl_loggers | from pytorch_lightning import loggers as pl_loggers | ||||
| @@ -27,6 +29,11 @@ def main(stage, | |||||
| 框架的入口函数. 包含设置超参数, 划分数据集, 选择训练或测试等流程 | 框架的入口函数. 包含设置超参数, 划分数据集, 选择训练或测试等流程 | ||||
| 该函数的参数为训练过程中需要经常改动的参数 | 该函数的参数为训练过程中需要经常改动的参数 | ||||
| 经常改动的 参数 作为main的输入参数 | |||||
| 不常改动的 非通用参数 存放在config | |||||
| 不常改动的 通用参数 直接进行声明 | |||||
| * 通用参数指的是所有网络中共有的参数, 如time_sum等 | |||||
| :param stage: 表示处于训练阶段还是测试阶段, fit表示训练, test表示测试 | :param stage: 表示处于训练阶段还是测试阶段, fit表示训练, test表示测试 | ||||
| :param max_epochs: | :param max_epochs: | ||||
| :param batch_size: | :param batch_size: | ||||
| @@ -44,14 +51,11 @@ def main(stage, | |||||
| 非重载训练的情况下, 可以通过调整该值控制训练的次数; | 非重载训练的情况下, 可以通过调整该值控制训练的次数; | ||||
| :param k_fold: | :param k_fold: | ||||
| """ | """ | ||||
| # 经常改动的 参数 作为main的输入参数 | |||||
| # 不常改动的 非通用参数 存放在config | |||||
| # 不常改动的 通用参数 直接进行声明 | |||||
| # 通用参数指的是所有网络中共有的参数, 如time_sum等 | |||||
| # 处理输入数据 | # 处理输入数据 | ||||
| precision = 32 if (gpus is None and tpu_cores is None) else precision | precision = 32 if (gpus is None and tpu_cores is None) else precision | ||||
| # 获得通用参数 | |||||
| # 自动处理:param gpus | |||||
| gpus = 1 if torch.cuda.is_available() and gpus is None and tpu_cores is None else None | |||||
| # 定义不常改动的通用参数 | |||||
| # TODO 获得最优的batch size | # TODO 获得最优的batch size | ||||
| num_workers = cpu_count() | num_workers = cpu_count() | ||||
| # 获得非通用参数 | # 获得非通用参数 | ||||
| @@ -100,8 +104,7 @@ def main(stage, | |||||
| if __name__ == "__main__": | if __name__ == "__main__": | ||||
| main('fit', max_epochs=2, batch_size=32, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, | |||||
| main('fit', max_epochs=200, batch_size=128, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, | |||||
| kth_fold_start=9, | kth_fold_start=9, | ||||
| # gpus=1, | |||||
| # version_nth=8, | # version_nth=8, | ||||
| ) | ) | ||||
| @@ -17,6 +17,7 @@ class TrainModule(pl.LightningModule): | |||||
| self.net = resnet56() | self.net = resnet56() | ||||
| self.loss = nn.CrossEntropyLoss() | self.loss = nn.CrossEntropyLoss() | ||||
| # 返回值必须包含loss, loss可以作为dict中的key, 或者直接返回loss | |||||
| def training_step(self, batch, batch_idx): | def training_step(self, batch, batch_idx): | ||||
| _, input, label = batch | _, input, label = batch | ||||
| label = label.flatten() | label = label.flatten() | ||||