
Remove the Hengyuan Cloud (恒源云) usage code; update the Beijing cloud training script (bscc-run.sh); change some parameters in main and use the GPU automatically when one is available; document the return value of the training step in train_model

master · shenyan, 4 years ago · commit ef272dfb51
4 changed files with 19 additions and 107 deletions:
  1. free_servers/bscc-run.sh        +7   -7
  2. free_servers/hy-scaffold.ipynb  +0   -92
  3. main.py                         +11  -8
  4. train_model.py                  +1   -0

+7 -7   free_servers/bscc-run.sh

@@ -1,14 +1,14 @@
 #!/bin/bash

-module load anaconda/2020.11
-module load cuda/10.2
-module load cudnn/8.1.1.33_CUDA10.2
-#cd run
-#cd machineLearningTemplate
-
-#conda create --name py37 python=3.7
-source activate py37
+module load anaconda/2020.11
+module load cuda/11.1
+module load cudnn/8.1.1.33_CUDA11.1
+
+cd run
+cd machineLearningScaffold
+#conda create --name py39 python=3.9
+source activate py39

 #pip install -r requirements.txt


+0 -92   free_servers/hy-scaffold.ipynb

@@ -1,92 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "4c4d04db-6fe0-4511-abb3-c746ba869863",
"metadata": {},
"outputs": [],
"source": [
"!apt-get update\n",
"!apt-get install p7zip-full -y"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c027a62-8116-4c55-9a90-95c4d90dd087",
"metadata": {},
"outputs": [],
"source": [
"!7z x dataset.zip -o/hy-tmp\n",
"!7z x machineLearningScaffold.zip -o/hy-tmp\n",
"!7z x vit_pre-train_checkpoint.zip -o/hy-tmp\n",
"!mv -f /hy-tmp/pre-train_checkpoint/vit_checkpoint/imagenet21k/R50+ViT-B_16.npz /hy-tmp/pre-train_checkpoint"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "4c6dc6a1-0208-4efb-beaf-b1264a94ba71",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33mWARNING: You are using pip version 21.0.1; however, version 21.2.4 is available.\n",
"You should consider upgrading via the '/usr/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\n"
]
}
],
"source": [
"!pip install -r /hy-tmp/requirements.txt > /dev/null"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2c074f52-d27e-4605-8b4f-45b11a2f1eb1",
"metadata": {},
"outputs": [],
"source": [
"import main\n",
"%load_ext tensorboard"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5a8d64e-d90e-467f-a3b4-db8b31874044",
"metadata": {},
"outputs": [],
"source": [
"main.main('fit', gpus=1, dataset_path='./dataset/MFNet(RGB-T)', num_workers=2, max_epochs=60, batch_size=16, precision=16, seed=1234, # tpu_cores=8,\n",
" checkpoint_every_n_val=1, save_name='version_0', #checkpoint_path='/version_15/checkpoints/epoch=59-step=4739.ckpt',\n",
" path_final_save='./drive/MyDrive', save_top_k=1\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
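
For anyone who was using the deleted notebook: the training entry point it called is unchanged, so an equivalent plain-Python invocation still works. The sketch below is not part of the commit; its keyword arguments are copied from the removed cell, and the paths, batch size and epoch count are environment-specific and will likely need adjusting.

import main

# Mirrors the call in the deleted hy-scaffold.ipynb cell; adjust the paths
# and sizes for the target machine before running.
main.main('fit',
          gpus=1,
          dataset_path='./dataset/MFNet(RGB-T)',
          num_workers=2,
          max_epochs=60,
          batch_size=16,
          precision=16,
          seed=1234,
          checkpoint_every_n_val=1,
          save_name='version_0',
          path_final_save='./drive/MyDrive',
          save_top_k=1)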

+11 -8   main.py

@@ -1,3 +1,5 @@
+import torch
+
 from save_checkpoint import SaveCheckpoint
 from data_module import DataModule
 from pytorch_lightning import loggers as pl_loggers
@@ -27,6 +29,11 @@ def main(stage,
     Entry function of the framework: sets the hyperparameters, splits the dataset, and selects training or testing.
     Its arguments are the parameters that need frequent changes during training.

+    Frequently changed parameters are passed as arguments to main.
+    Rarely changed, non-general parameters are stored in config.
+    Rarely changed, general parameters are declared directly.
+    * General parameters are those shared by every network, such as time_sum.
+
     :param stage: whether this is the training or the testing stage; 'fit' means training, 'test' means testing
     :param max_epochs:
     :param batch_size:
@@ -44,14 +51,11 @@ def main(stage,
        when training is not resumed, this value can be adjusted to control how many times training runs;
     :param k_fold:
     """
-    # Frequently changed parameters are passed as arguments to main
-    # Rarely changed, non-general parameters are stored in config
-    # Rarely changed, general parameters are declared directly
-    # General parameters are those shared by every network, such as time_sum
-
     # Process the input arguments
     precision = 32 if (gpus is None and tpu_cores is None) else precision
-    # Get the general parameters
+    # Automatically handle :param gpus
+    gpus = 1 if torch.cuda.is_available() and gpus is None and tpu_cores is None else None
+    # Define the rarely changed general parameters
     # TODO find the optimal batch size
     num_workers = cpu_count()
     # Get the non-general parameters
@@ -100,8 +104,7 @@


 if __name__ == "__main__":
-    main('fit', max_epochs=2, batch_size=32, precision=16, seed=1234, dataset_path='./dataset', k_fold=10,
+    main('fit', max_epochs=200, batch_size=128, precision=16, seed=1234, dataset_path='./dataset', k_fold=10,
          kth_fold_start=9,
          # gpus=1,
          # version_nth=8,
          )
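
A side note on the new auto-GPU line: the sketch below is standalone and not part of the project (the helper name resolve_gpus is hypothetical); it only shows what the added expression evaluates to.

import torch

def resolve_gpus(gpus=None, tpu_cores=None):
    # Same expression as the line added in main.py: fall back to a single GPU
    # only when CUDA is available and the caller asked for no accelerator.
    # As written, the else branch also resets an explicitly passed gpus value
    # to None; the commit does not say whether that is intended.
    return 1 if torch.cuda.is_available() and gpus is None and tpu_cores is None else None

# On a CUDA machine:  resolve_gpus() -> 1,  resolve_gpus(tpu_cores=8) -> None
# Without CUDA:       resolve_gpus() -> None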

+1 -0   train_model.py

@@ -17,6 +17,7 @@ class TrainModule(pl.LightningModule):
         self.net = resnet56()
         self.loss = nn.CrossEntropyLoss()

+    # The return value must contain the loss: return either a dict with a 'loss' key or the loss itself
     def training_step(self, batch, batch_idx):
         _, input, label = batch
         label = label.flatten()
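
The new comment restates the PyTorch Lightning contract for training_step. A minimal sketch with a hypothetical module (not the project's resnet56 setup) showing both accepted return forms:

import torch
import torch.nn as nn
import pytorch_lightning as pl

class TinyModule(pl.LightningModule):
    # Hypothetical module, only to illustrate the return-value contract.
    def __init__(self):
        super().__init__()
        self.net = nn.Linear(4, 2)
        self.loss = nn.CrossEntropyLoss()

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = self.loss(self.net(x), y)
        # Option 1: return the loss tensor directly
        # return loss
        # Option 2: return a dict whose 'loss' key holds the loss
        return {'loss': loss}

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=0.1)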

