From 3c76ea37531a524eb555576e30ce465834b9c613 Mon Sep 17 00:00:00 2001
From: zouap
Date: Fri, 8 Jul 2022 10:48:42 +0800
Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81=E3=80=82?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: zouap
---
 modules/setting/setting.go       | 39 ++++++++++++++++++++
 routers/repo/ai_model_convert.go | 61 ++++++++++++++++----------------
 2 files changed, 69 insertions(+), 31 deletions(-)

diff --git a/modules/setting/setting.go b/modules/setting/setting.go
index 3ea206fbc..bd6570bb8 100755
--- a/modules/setting/setting.go
+++ b/modules/setting/setting.go
@@ -613,6 +613,24 @@ var (
 		OrgName  string
 		TeamName string
 	}{}
+
+	ModelConvert = struct {
+		GPU_PYTORCH_IMAGE       string
+		GpuQueue                string
+		GPU_TENSORFLOW_IMAGE    string
+		NPU_MINDSPORE_16_IMAGE  string
+		PytorchOnnxBootFile     string
+		PytorchTrTBootFile      string
+		MindsporeBootFile       string
+		TensorFlowNpuBootFile   string
+		TensorFlowGpuBootFile   string
+		ConvertRepoPath         string
+		GPU_Resource_Specs_ID   int
+		NPU_FlavorCode          string
+		NPU_PoolID              string
+		NPU_MINDSPORE_IMAGE_ID  int
+		NPU_TENSORFLOW_IMAGE_ID int
+	}{}
 )
 
 // DateLang transforms standard language locale name to corresponding value in datetime plugin.
@@ -1408,6 +1426,27 @@ func NewContext() {
 	Course.TeamName = sec.Key("team_name").MustString("")
 
 	GetGrampusConfig()
+
+	getModelConvertConfig()
+}
+
+func getModelConvertConfig() {
+	sec := Cfg.Section("model_convert")
+	ModelConvert.GPU_PYTORCH_IMAGE = sec.Key("GPU_PYTORCH_IMAGE").MustString("dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap")
+	ModelConvert.GpuQueue = sec.Key("GpuQueue").MustString("openidgx")
+	ModelConvert.GPU_TENSORFLOW_IMAGE = sec.Key("GPU_TENSORFLOW_IMAGE").MustString("dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx")
+	ModelConvert.NPU_MINDSPORE_16_IMAGE = sec.Key("NPU_MINDSPORE_16_IMAGE").MustString("swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend")
+	ModelConvert.PytorchOnnxBootFile = sec.Key("PytorchOnnxBootFile").MustString("convert_pytorch.py")
+	ModelConvert.PytorchTrTBootFile = sec.Key("PytorchTrTBootFile").MustString("convert_pytorch_tensorrt.py")
+	ModelConvert.MindsporeBootFile = sec.Key("MindsporeBootFile").MustString("convert_mindspore.py")
+	ModelConvert.TensorFlowNpuBootFile = sec.Key("TensorFlowNpuBootFile").MustString("convert_tensorflow.py")
+	ModelConvert.TensorFlowGpuBootFile = sec.Key("TensorFlowGpuBootFile").MustString("convert_tensorflow_gpu.py")
+	ModelConvert.ConvertRepoPath = sec.Key("ConvertRepoPath").MustString("https://git.openi.org.cn/zouap/npu_test")
+	ModelConvert.GPU_Resource_Specs_ID = sec.Key("GPU_Resource_Specs_ID").MustInt(1)
+	ModelConvert.NPU_FlavorCode = sec.Key("NPU_FlavorCode").MustString("modelarts.bm.910.arm.public.1")
+	ModelConvert.NPU_PoolID = sec.Key("NPU_PoolID").MustString("pool7908321a")
+	ModelConvert.NPU_MINDSPORE_IMAGE_ID = sec.Key("NPU_MINDSPORE_IMAGE_ID").MustInt(35)    // default matches const NPU_MINDSPORE_IMAGE_ID in routers/repo/ai_model_convert.go
+	ModelConvert.NPU_TENSORFLOW_IMAGE_ID = sec.Key("NPU_TENSORFLOW_IMAGE_ID").MustInt(121) // default matches const NPU_TENSORFLOW_IMAGE_ID in routers/repo/ai_model_convert.go
 }
 
 func GetGrampusConfig() {
diff --git a/routers/repo/ai_model_convert.go b/routers/repo/ai_model_convert.go
index bd1c63314..5129d7eb7 100644
--- a/routers/repo/ai_model_convert.go
+++ b/routers/repo/ai_model_convert.go
@@ -36,19 +36,18 @@ const (
 	LogFile                = "log.txt"
 	DefaultBranchName      = "master"
 	SubTaskName            = "task1"
-	GpuQueue               = "openidgx"
-	Success                = "S000"
-	GPU_PYTORCH_IMAGE      = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
-	GPU_TENSORFLOW_IMAGE   = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"
-	NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend"
-	PytorchOnnxBootFile    = "convert_pytorch.py"
-	PytorchTrTBootFile     = "convert_pytorch_tensorrt.py"
-	MindsporeBootFile      = "convert_mindspore.py"
-	TensorFlowNpuBootFile  = "convert_tensorflow.py"
-	TensorFlowGpuBootFile  = "convert_tensorflow_gpu.py"
-
-	ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test"
-	REPO_ID         = 33267
+	//GpuQueue = "openidgx"
+	Success = "S000"
+	//GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap"
+	//GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx"
+	//NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend"
+	//PytorchOnnxBootFile = "convert_pytorch.py"
+	//PytorchTrTBootFile = "convert_pytorch_tensorrt.py"
+	//MindsporeBootFile = "convert_mindspore.py"
+	//TensorFlowNpuBootFile = "convert_tensorflow.py"
+	//TensorFlowGpuBootFile = "convert_tensorflow_gpu.py"
+
+	//ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test"
 
 	CONVERT_FORMAT_ONNX = 0
 	CONVERT_FORMAT_TRT  = 1
@@ -59,10 +58,10 @@ const (
 	NPU_MINDSPORE_IMAGE_ID  = 35
 	NPU_TENSORFLOW_IMAGE_ID = 121
 
-	GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1
+	//GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1
 
-	NPU_FlavorCode = "modelarts.bm.910.arm.public.1"
-	NPU_PoolID     = "pool7908321a"
+	//NPU_FlavorCode = "modelarts.bm.910.arm.public.1"
+	//NPU_PoolID = "pool7908321a"
 )
 
 var (
@@ -142,8 +141,8 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
 	if err == nil {
 		os.RemoveAll(codeLocalPath)
 	}
-	if err := downloadConvertCode(ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil {
-		log.Error("downloadCode failed, server timed out: %s (%v)", ConvertRepoPath, err)
+	if err := downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil {
+		log.Error("downloadCode failed, server timed out: %s (%v)", setting.ModelConvert.ConvertRepoPath, err)
 		return err
 	}
 	if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"); err != nil {
@@ -172,10 +171,10 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
 
 	var engineId int64
 	engineId = int64(NPU_MINDSPORE_IMAGE_ID)
-	bootfile := MindsporeBootFile
+	bootfile := setting.ModelConvert.MindsporeBootFile
 	if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
 		engineId = int64(NPU_TENSORFLOW_IMAGE_ID)
-		bootfile = TensorFlowNpuBootFile
+		bootfile = setting.ModelConvert.TensorFlowNpuBootFile
 	}
 	userCommand := "/bin/bash /home/work/run_train.sh 's3://" + codeObsPath + "' 'code/" + bootfile + "' '/tmp/log/train.log' --'data_url'='s3://" + dataPath + "' --'train_url'='s3://" + outputObsPath + "'"
 	userCommand += " --'model'='" + modelConvert.ModelPath + "'"
@@ -193,15 +192,15 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
 		BootFileUrl:      codeObsPath + bootfile,
 		BootFile:         bootfile,
 		TrainUrl:         outputObsPath,
-		FlavorCode:       NPU_FlavorCode,
+		FlavorCode:       setting.ModelConvert.NPU_FlavorCode,
 		WorkServerNumber: 1,
 		IsLatestVersion:  modelarts.IsLatestVersion,
 		EngineID:         engineId,
 		LogUrl:           logObsPath,
-		PoolID:           NPU_PoolID,
+		PoolID:           setting.ModelConvert.NPU_PoolID,
 		//Parameters: param,
 		BranchName:       DefaultBranchName,
-		UserImageUrl:     NPU_MINDSPORE_16_IMAGE,
+		UserImageUrl:     setting.ModelConvert.NPU_MINDSPORE_16_IMAGE,
 		UserCommand:      userCommand,
 	}
 	result, err := modelarts.GenerateModelConvertTrainJob(req)
@@ -323,21 +322,21 @@ func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error
 func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error {
 	modelRelativePath := model.Path
 	command := ""
-	IMAGE_URL := GPU_PYTORCH_IMAGE
+	IMAGE_URL := setting.ModelConvert.GPU_PYTORCH_IMAGE
 	dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath
 
 	if modelConvert.SrcEngine == PYTORCH_ENGINE {
 		if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
-			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchOnnxBootFile)
+			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile)
 		} else if modelConvert.DestFormat == CONVERT_FORMAT_TRT {
-			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchTrTBootFile)
+			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile)
 		} else {
 			return errors.New("Not support the format.")
 		}
 	} else if modelConvert.SrcEngine == TENSORFLOW_ENGINE {
-		IMAGE_URL = GPU_TENSORFLOW_IMAGE
+		IMAGE_URL = setting.ModelConvert.GPU_TENSORFLOW_IMAGE
 		if modelConvert.DestFormat == CONVERT_FORMAT_ONNX {
-			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile)
+			command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile)
 		} else {
 			return errors.New("Not support the format.")
 		}
@@ -355,7 +354,7 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
 	log.Info("command=" + command)
 
 	codePath := setting.JobPath + modelConvert.ID + CodeMountPath
-	downloadConvertCode(ConvertRepoPath, codePath, DefaultBranchName)
+	downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codePath, DefaultBranchName)
 
 	uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/")
 
@@ -373,11 +372,11 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context
 	if TrainResourceSpecs == nil {
 		json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs)
 	}
-	resourceSpec := TrainResourceSpecs.ResourceSpec[GPU_Resource_Specs_ID]
+	resourceSpec := TrainResourceSpecs.ResourceSpec[setting.ModelConvert.GPU_Resource_Specs_ID]
 	jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{
 		JobName:    modelConvert.ID,
 		RetryCount: 1,
-		GpuType:    GpuQueue,
+		GpuType:    setting.ModelConvert.GpuQueue,
 		Image:      IMAGE_URL,
 		TaskRoles: []models.TaskRole{
 			{