From ef272dfb51a64e0cd15f1855440f2fde7b4c59ee Mon Sep 17 00:00:00 2001
From: shenyan <23357320@qq.com>
Date: Thu, 21 Oct 2021 12:36:52 +0800
Subject: [PATCH] =?UTF-8?q?=E5=88=A0=E9=99=A4=E6=81=92=E6=BA=90=E4=BA=91?=
 =?UTF-8?q?=E7=9A=84=E4=BD=BF=E7=94=A8=E4=BB=A3=E7=A0=81;=E4=BF=AE?=
 =?UTF-8?q?=E6=94=B9=E5=8C=97=E4=BA=AC=E4=BA=91=E7=9A=84=E8=AE=AD=E7=BB=83?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC;=20main=E4=B8=AD=E4=BF=AE=E6=94=B9=E4=BA=86?=
 =?UTF-8?q?=E9=83=A8=E5=88=86=E5=8F=82=E6=95=B0=E5=B9=B6=E5=9C=A8=E8=AE=BE?=
 =?UTF-8?q?=E7=BD=AEgpu=E7=9A=84=E6=83=85=E5=86=B5=E4=B8=8B=E8=87=AA?=
 =?UTF-8?q?=E5=8A=A8=E8=B0=83=E7=94=A8gpu;=20=E6=B3=A8=E6=98=8Etrain=20mod?=
 =?UTF-8?q?el=E4=B8=AD=E8=AE=AD=E7=BB=83=E6=AD=A5=E7=9A=84=E8=BF=94?=
 =?UTF-8?q?=E5=9B=9E=E5=80=BC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 free_servers/bscc-run.sh       | 14 +++---
 free_servers/hy-scaffold.ipynb | 92 ----------------------------------
 main.py                        | 19 ++++---
 train_model.py                 |  1 +
 4 files changed, 19 insertions(+), 107 deletions(-)
 delete mode 100644 free_servers/hy-scaffold.ipynb

diff --git a/free_servers/bscc-run.sh b/free_servers/bscc-run.sh
index 84b07c6..40d6a4d 100644
--- a/free_servers/bscc-run.sh
+++ b/free_servers/bscc-run.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 
-module load anaconda/2020.11
-module load cuda/10.2
-module load cudnn/8.1.1.33_CUDA10.2
+#cd run
+#cd machineLearningTemplate
 
-#conda create --name py37 python=3.7
-source activate py37
+module load anaconda/2020.11
+module load cuda/11.1
+module load cudnn/8.1.1.33_CUDA11.1
 
-cd run
-cd machineLearningScaffold
+#conda create --name py39 python=3.9
+source activate py39
 
 #pip install -r requirements.txt
 
diff --git a/free_servers/hy-scaffold.ipynb b/free_servers/hy-scaffold.ipynb
deleted file mode 100644
index 067788c..0000000
--- a/free_servers/hy-scaffold.ipynb
+++ /dev/null
@@ -1,92 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c4d04db-6fe0-4511-abb3-c746ba869863",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!apt-get update\n",
-    "!apt-get install p7zip-full -y"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "7c027a62-8116-4c55-9a90-95c4d90dd087",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "!7z x dataset.zip -o/hy-tmp\n",
-    "!7z x machineLearningScaffold.zip -o/hy-tmp\n",
-    "!7z x vit_pre-train_checkpoint.zip -o/hy-tmp\n",
-    "!mv -f /hy-tmp/pre-train_checkpoint/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz /hy-tmp/pre-train_checkpoint"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "4c6dc6a1-0208-4efb-beaf-b1264a94ba71",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.2.4 is available.\n",
-      "You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\n"
-     ]
-    }
-   ],
-   "source": [
-    "!pip install -r /hy-tmp/requirements.txt > /dev/null"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2c074f52-d27e-4605-8b4f-45b11a2f1eb1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import main\n",
-    "%load_ext tensorboard"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a5a8d64e-d90e-467f-a3b4-db8b31874044",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "main.main('fit', gpus=1, dataset_path='./dataset/MFNet(RGB-T)', num_workers=2, max_epochs=60, batch_size=16, precision=16, seed=1234, # tpu_cores=8,\n",
-    "     checkpoint_every_n_val=1, save_name='version_0', #checkpoint_path='/version_15/checkpoints/epoch=59-step=4739.ckpt',\n",
-    "     path_final_save='./drive/MyDrive', save_top_k=1\n",
-    "     )"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.10"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/main.py b/main.py
index b9e1e89..b7e62f4 100644
--- a/main.py
+++ b/main.py
@@ -1,3 +1,5 @@
+import torch
+
 from save_checkpoint import SaveCheckpoint
 from data_module import DataModule
 from pytorch_lightning import loggers as pl_loggers
@@ -27,6 +29,11 @@ def main(stage,
     框架的入口函数. 包含设置超参数, 划分数据集, 选择训练或测试等流程
     该函数的参数为训练过程中需要经常改动的参数
 
+    经常改动的      参数    作为main的输入参数
+    不常改动的   非通用参数    存放在config
+    不常改动的    通用参数     直接进行声明
+    * 通用参数指的是所有网络中共有的参数, 如time_sum等
+
     :param stage: 表示处于训练阶段还是测试阶段, fit表示训练, test表示测试
     :param max_epochs:
     :param batch_size:
@@ -44,14 +51,11 @@ def main(stage,
                            非重载训练的情况下, 可以通过调整该值控制训练的次数;
     :param k_fold:
     """
-    # 经常改动的      参数    作为main的输入参数
-    # 不常改动的   非通用参数    存放在config
-    # 不常改动的    通用参数     直接进行声明
-    # 通用参数指的是所有网络中共有的参数, 如time_sum等
-
     # 处理输入数据
-    precision = 32 if (gpus is None and tpu_cores is None) else precision
-    # 获得通用参数
+    # Default to one GPU only when no accelerator was requested; an explicit gpus value is kept as-is
+    gpus = 1 if (gpus is None and tpu_cores is None and torch.cuda.is_available()) else gpus
+    precision = 32 if (gpus is None and tpu_cores is None) else precision  # CPU fallback, decided after GPU auto-selection
+    # 定义不常改动的通用参数
     # TODO 获得最优的batch size
     num_workers = cpu_count()
     # 获得非通用参数
@@ -100,8 +104,7 @@ def main(stage,
 
 
 if __name__ == "__main__":
-    main('fit', max_epochs=2, batch_size=32, precision=16, seed=1234, dataset_path='./dataset', k_fold=10,
+    main('fit', max_epochs=200, batch_size=128, precision=16, seed=1234, dataset_path='./dataset', k_fold=10,
          kth_fold_start=9,
-         # gpus=1,
          # version_nth=8,
          )
diff --git a/train_model.py b/train_model.py
index 2e44a64..631fc84 100644
--- a/train_model.py
+++ b/train_model.py
@@ -17,6 +17,7 @@ class TrainModule(pl.LightningModule):
         self.net = resnet56()
         self.loss = nn.CrossEntropyLoss()
 
+    # 返回值必须包含loss, loss可以作为dict中的key, 或者直接返回loss
     def training_step(self, batch, batch_idx):
         _, input, label = batch
         label = label.flatten()