From ef272dfb51a64e0cd15f1855440f2fde7b4c59ee Mon Sep 17 00:00:00 2001 From: shenyan <23357320@qq.com> Date: Thu, 21 Oct 2021 12:36:52 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=81=92=E6=BA=90=E4=BA=91?= =?UTF-8?q?=E7=9A=84=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=A0=81;=E4=BF=AE?= =?UTF-8?q?=E6=94=B9=E5=8C=97=E4=BA=AC=E4=BA=91=E7=9A=84=E8=AE=AD=E7=BB=83?= =?UTF-8?q?=E8=84=9A=E6=9C=AC;=20main=E4=B8=AD=E4=BF=AE=E6=94=B9=E4=BA=86?= =?UTF-8?q?=E9=83=A8=E5=88=86=E5=8F=82=E6=95=B0=E5=B9=B6=E5=9C=A8=E8=AE=BE?= =?UTF-8?q?=E7=BD=AEgpu=E7=9A=84=E6=83=85=E5=86=B5=E4=B8=8B=E8=87=AA?= =?UTF-8?q?=E5=8A=A8=E8=B0=83=E7=94=A8gpu;=20=E6=B3=A8=E6=98=8Etrain=20mod?= =?UTF-8?q?el=E4=B8=AD=E8=AE=AD=E7=BB=83=E6=AD=A5=E7=9A=84=E8=BF=94?= =?UTF-8?q?=E5=9B=9E=E5=80=BC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- free_servers/bscc-run.sh | 14 +++--- free_servers/hy-scaffold.ipynb | 92 ---------------------------------- main.py | 19 ++++--- train_model.py | 1 + 4 files changed, 19 insertions(+), 107 deletions(-) delete mode 100644 free_servers/hy-scaffold.ipynb diff --git a/free_servers/bscc-run.sh b/free_servers/bscc-run.sh index 84b07c6..40d6a4d 100644 --- a/free_servers/bscc-run.sh +++ b/free_servers/bscc-run.sh @@ -1,14 +1,14 @@ #!/bin/bash -module load anaconda/2020.11 -module load cuda/10.2 -module load cudnn/8.1.1.33_CUDA10.2 +#cd run +#cd machineLearningTemplate -#conda create --name py37 python=3.7 -source activate py37 +module load anaconda/2020.11 +module load cuda/11.1 +module load cudnn/8.1.1.33_CUDA11.1 -cd run -cd machineLearningScaffold +#conda create --name py39 python=3.9 +source activate py39 #pip install -r requirements.txt diff --git a/free_servers/hy-scaffold.ipynb b/free_servers/hy-scaffold.ipynb deleted file mode 100644 index 067788c..0000000 --- a/free_servers/hy-scaffold.ipynb +++ /dev/null @@ -1,92 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4c4d04db-6fe0-4511-abb3-c746ba869863", - "metadata": {}, - "outputs": [], - "source": [ - "!apt-get update\n", - "!apt-get install p7zip-full -y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c027a62-8116-4c55-9a90-95c4d90dd087", - "metadata": {}, - "outputs": [], - "source": [ - "!7z x dataset.zip -o/hy-tmp\n", - "!7z x machineLearningScaffold.zip -o/hy-tmp\n", - "!7z x vit_pre-train_checkpoint.zip -o/hy-tmp\n", - "!mv -f /hy-tmp/pre-train_checkpoint/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz /hy-tmp/pre-train_checkpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4c6dc6a1-0208-4efb-beaf-b1264a94ba71", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.2.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install -r /hy-tmp/requirements.txt > /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c074f52-d27e-4605-8b4f-45b11a2f1eb1", - "metadata": {}, - "outputs": [], - "source": [ - "import main\n", - "%load_ext tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5a8d64e-d90e-467f-a3b4-db8b31874044", - "metadata": {}, - "outputs": [], - "source": [ - "main.main('fit', gpus=1, dataset_path='./dataset/MFNet(RGB-T)', num_workers=2, max_epochs=60, batch_size=16, precision=16, seed=1234, # tpu_cores=8,\n", - " checkpoint_every_n_val=1, save_name='version_0', #checkpoint_path='/version_15/checkpoints/epoch=59-step=4739.ckpt',\n", - " path_final_save='./drive/MyDrive', save_top_k=1\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/main.py b/main.py index b9e1e89..b7e62f4 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ +import torch + from save_checkpoint import SaveCheckpoint from data_module import DataModule from pytorch_lightning import loggers as pl_loggers @@ -27,6 +29,11 @@ def main(stage, 框架的入口函数. 包含设置超参数, 划分数据集, 选择训练或测试等流程 该函数的参数为训练过程中需要经常改动的参数 + 经常改动的 参数 作为main的输入参数 + 不常改动的 非通用参数 存放在config + 不常改动的 通用参数 直接进行声明 + * 通用参数指的是所有网络中共有的参数, 如time_sum等 + :param stage: 表示处于训练阶段还是测试阶段, fit表示训练, test表示测试 :param max_epochs: :param batch_size: @@ -44,14 +51,11 @@ def main(stage, 非重载训练的情况下, 可以通过调整该值控制训练的次数; :param k_fold: """ - # 经常改动的 参数 作为main的输入参数 - # 不常改动的 非通用参数 存放在config - # 不常改动的 通用参数 直接进行声明 - # 通用参数指的是所有网络中共有的参数, 如time_sum等 - # 处理输入数据 precision = 32 if (gpus is None and tpu_cores is None) else precision - # 获得通用参数 + # 自动处理:param gpus + gpus = 1 if torch.cuda.is_available() and gpus is None and tpu_cores is None else None + # 定义不常改动的通用参数 # TODO 获得最优的batch size num_workers = cpu_count() # 获得非通用参数 @@ -100,8 +104,7 @@ def main(stage, if __name__ == "__main__": - main('fit', max_epochs=2, batch_size=32, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, + main('fit', max_epochs=200, batch_size=128, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, kth_fold_start=9, - # gpus=1, # version_nth=8, ) diff --git a/train_model.py b/train_model.py index 2e44a64..631fc84 100644 --- a/train_model.py +++ b/train_model.py @@ -17,6 +17,7 @@ class TrainModule(pl.LightningModule): self.net = resnet56() self.loss = nn.CrossEntropyLoss() + # 返回值必须包含loss, loss可以作为dict中的key, 或者直接返回loss def training_step(self, batch, batch_idx): _, input, label = batch label = label.flatten()