diff --git a/free_servers/bscc-run.sh b/free_servers/bscc-run.sh index 84b07c6..40d6a4d 100644 --- a/free_servers/bscc-run.sh +++ b/free_servers/bscc-run.sh @@ -1,14 +1,14 @@ #!/bin/bash -module load anaconda/2020.11 -module load cuda/10.2 -module load cudnn/8.1.1.33_CUDA10.2 +#cd run +#cd machineLearningTemplate -#conda create --name py37 python=3.7 -source activate py37 +module load anaconda/2020.11 +module load cuda/11.1 +module load cudnn/8.1.1.33_CUDA11.1 -cd run -cd machineLearningScaffold +#conda create --name py39 python=3.9 +source activate py39 #pip install -r requirements.txt diff --git a/free_servers/hy-scaffold.ipynb b/free_servers/hy-scaffold.ipynb deleted file mode 100644 index 067788c..0000000 --- a/free_servers/hy-scaffold.ipynb +++ /dev/null @@ -1,92 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "4c4d04db-6fe0-4511-abb3-c746ba869863", - "metadata": {}, - "outputs": [], - "source": [ - "!apt-get update\n", - "!apt-get install p7zip-full -y" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c027a62-8116-4c55-9a90-95c4d90dd087", - "metadata": {}, - "outputs": [], - "source": [ - "!7z x dataset.zip -o/hy-tmp\n", - "!7z x machineLearningScaffold.zip -o/hy-tmp\n", - "!7z x vit_pre-train_checkpoint.zip -o/hy-tmp\n", - "!mv -f /hy-tmp/pre-train_checkpoint/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz /hy-tmp/pre-train_checkpoint" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4c6dc6a1-0208-4efb-beaf-b1264a94ba71", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.2.4 is available.\n", - "You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\n" - ] - } - ], - "source": [ - "!pip install -r /hy-tmp/requirements.txt > /dev/null" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2c074f52-d27e-4605-8b4f-45b11a2f1eb1", - "metadata": {}, - "outputs": [], - "source": [ - "import main\n", - "%load_ext tensorboard" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5a8d64e-d90e-467f-a3b4-db8b31874044", - "metadata": {}, - "outputs": [], - "source": [ - "main.main('fit', gpus=1, dataset_path='./dataset/MFNet(RGB-T)', num_workers=2, max_epochs=60, batch_size=16, precision=16, seed=1234, # tpu_cores=8,\n", - " checkpoint_every_n_val=1, save_name='version_0', #checkpoint_path='/version_15/checkpoints/epoch=59-step=4739.ckpt',\n", - " path_final_save='./drive/MyDrive', save_top_k=1\n", - " )" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/main.py b/main.py index b9e1e89..b7e62f4 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ +import torch + from save_checkpoint import SaveCheckpoint from data_module import DataModule from pytorch_lightning import loggers as pl_loggers @@ -27,6 +29,11 @@ def main(stage, 框架的入口函数. 包含设置超参数, 划分数据集, 选择训练或测试等流程 该函数的参数为训练过程中需要经常改动的参数 + 经常改动的 参数 作为main的输入参数 + 不常改动的 非通用参数 存放在config + 不常改动的 通用参数 直接进行声明 + * 通用参数指的是所有网络中共有的参数, 如time_sum等 + :param stage: 表示处于训练阶段还是测试阶段, fit表示训练, test表示测试 :param max_epochs: :param batch_size: @@ -44,14 +51,11 @@ def main(stage, 非重载训练的情况下, 可以通过调整该值控制训练的次数; :param k_fold: """ - # 经常改动的 参数 作为main的输入参数 - # 不常改动的 非通用参数 存放在config - # 不常改动的 通用参数 直接进行声明 - # 通用参数指的是所有网络中共有的参数, 如time_sum等 - # 处理输入数据 precision = 32 if (gpus is None and tpu_cores is None) else precision - # 获得通用参数 + # 自动处理:param gpus + gpus = 1 if torch.cuda.is_available() and gpus is None and tpu_cores is None else None + # 定义不常改动的通用参数 # TODO 获得最优的batch size num_workers = cpu_count() # 获得非通用参数 @@ -100,8 +104,7 @@ def main(stage, if __name__ == "__main__": - main('fit', max_epochs=2, batch_size=32, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, + main('fit', max_epochs=200, batch_size=128, precision=16, seed=1234, dataset_path='./dataset', k_fold=10, kth_fold_start=9, - # gpus=1, # version_nth=8, ) diff --git a/train_model.py b/train_model.py index 2e44a64..631fc84 100644 --- a/train_model.py +++ b/train_model.py @@ -17,6 +17,7 @@ class TrainModule(pl.LightningModule): self.net = resnet56() self.loss = nn.CrossEntropyLoss() + # 返回值必须包含loss, loss可以作为dict中的key, 或者直接返回loss def training_step(self, batch, batch_idx): _, input, label = batch label = label.flatten()