|
|
@@ -36,19 +36,18 @@ const ( |
|
|
|
LogFile = "log.txt" |
|
|
|
DefaultBranchName = "master" |
|
|
|
SubTaskName = "task1" |
|
|
|
GpuQueue = "openidgx" |
|
|
|
Success = "S000" |
|
|
|
GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap" |
|
|
|
GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx" |
|
|
|
NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend" |
|
|
|
PytorchOnnxBootFile = "convert_pytorch.py" |
|
|
|
PytorchTrTBootFile = "convert_pytorch_tensorrt.py" |
|
|
|
MindsporeBootFile = "convert_mindspore.py" |
|
|
|
TensorFlowNpuBootFile = "convert_tensorflow.py" |
|
|
|
TensorFlowGpuBootFile = "convert_tensorflow_gpu.py" |
|
|
|
|
|
|
|
ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test" |
|
|
|
REPO_ID = 33267 |
|
|
|
//GpuQueue = "openidgx" |
|
|
|
Success = "S000" |
|
|
|
//GPU_PYTORCH_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tensorRT_7_zouap" |
|
|
|
//GPU_TENSORFLOW_IMAGE = "dockerhub.pcl.ac.cn:5000/user-images/openi:tf2onnx" |
|
|
|
//NPU_MINDSPORE_16_IMAGE = "swr.cn-south-222.ai.pcl.cn/openi/mindspore1.6.1_train_v1_openi:v3_ascend" |
|
|
|
//PytorchOnnxBootFile = "convert_pytorch.py" |
|
|
|
//PytorchTrTBootFile = "convert_pytorch_tensorrt.py" |
|
|
|
//MindsporeBootFile = "convert_mindspore.py" |
|
|
|
//TensorFlowNpuBootFile = "convert_tensorflow.py" |
|
|
|
//TensorFlowGpuBootFile = "convert_tensorflow_gpu.py" |
|
|
|
|
|
|
|
//ConvertRepoPath = "https://git.openi.org.cn/zouap/npu_test" |
|
|
|
|
|
|
|
CONVERT_FORMAT_ONNX = 0 |
|
|
|
CONVERT_FORMAT_TRT = 1 |
|
|
@@ -59,10 +58,10 @@ const ( |
|
|
|
NPU_MINDSPORE_IMAGE_ID = 35 |
|
|
|
NPU_TENSORFLOW_IMAGE_ID = 121 |
|
|
|
|
|
|
|
GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1 |
|
|
|
//GPU_Resource_Specs_ID = 1 //cpu 1, gpu 1 |
|
|
|
|
|
|
|
NPU_FlavorCode = "modelarts.bm.910.arm.public.1" |
|
|
|
NPU_PoolID = "pool7908321a" |
|
|
|
//NPU_FlavorCode = "modelarts.bm.910.arm.public.1" |
|
|
|
//NPU_PoolID = "pool7908321a" |
|
|
|
) |
|
|
|
|
|
|
|
var ( |
|
|
@@ -142,8 +141,8 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
if err == nil { |
|
|
|
os.RemoveAll(codeLocalPath) |
|
|
|
} |
|
|
|
if err := downloadConvertCode(ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil { |
|
|
|
log.Error("downloadCode failed, server timed out: %s (%v)", ConvertRepoPath, err) |
|
|
|
if err := downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codeLocalPath, DefaultBranchName); err != nil { |
|
|
|
log.Error("downloadCode failed, server timed out: %s (%v)", setting.ModelConvert.ConvertRepoPath, err) |
|
|
|
return err |
|
|
|
} |
|
|
|
if err := obsMkdir(setting.CodePathPrefix + modelConvert.ID + modelarts.OutputPath + VersionOutputPath + "/"); err != nil { |
|
|
@@ -172,10 +171,10 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
|
|
|
|
var engineId int64 |
|
|
|
engineId = int64(NPU_MINDSPORE_IMAGE_ID) |
|
|
|
bootfile := MindsporeBootFile |
|
|
|
bootfile := setting.ModelConvert.MindsporeBootFile |
|
|
|
if modelConvert.SrcEngine == TENSORFLOW_ENGINE { |
|
|
|
engineId = int64(NPU_TENSORFLOW_IMAGE_ID) |
|
|
|
bootfile = TensorFlowNpuBootFile |
|
|
|
bootfile = setting.ModelConvert.TensorFlowNpuBootFile |
|
|
|
} |
|
|
|
userCommand := "/bin/bash /home/work/run_train.sh 's3://" + codeObsPath + "' 'code/" + bootfile + "' '/tmp/log/train.log' --'data_url'='s3://" + dataPath + "' --'train_url'='s3://" + outputObsPath + "'" |
|
|
|
userCommand += " --'model'='" + modelConvert.ModelPath + "'" |
|
|
@@ -193,15 +192,15 @@ func createNpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
BootFileUrl: codeObsPath + bootfile, |
|
|
|
BootFile: bootfile, |
|
|
|
TrainUrl: outputObsPath, |
|
|
|
FlavorCode: NPU_FlavorCode, |
|
|
|
FlavorCode: setting.ModelConvert.NPU_FlavorCode, |
|
|
|
WorkServerNumber: 1, |
|
|
|
IsLatestVersion: modelarts.IsLatestVersion, |
|
|
|
EngineID: engineId, |
|
|
|
LogUrl: logObsPath, |
|
|
|
PoolID: NPU_PoolID, |
|
|
|
PoolID: setting.ModelConvert.NPU_PoolID, |
|
|
|
//Parameters: param, |
|
|
|
BranchName: DefaultBranchName, |
|
|
|
UserImageUrl: NPU_MINDSPORE_16_IMAGE, |
|
|
|
UserImageUrl: setting.ModelConvert.NPU_MINDSPORE_16_IMAGE, |
|
|
|
UserCommand: userCommand, |
|
|
|
} |
|
|
|
result, err := modelarts.GenerateModelConvertTrainJob(req) |
|
|
@@ -323,21 +322,21 @@ func downloadFromObsToLocal(task *models.AiModelManage, localPath string) error |
|
|
|
func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context, model *models.AiModelManage) error { |
|
|
|
modelRelativePath := model.Path |
|
|
|
command := "" |
|
|
|
IMAGE_URL := GPU_PYTORCH_IMAGE |
|
|
|
IMAGE_URL := setting.ModelConvert.GPU_PYTORCH_IMAGE |
|
|
|
dataActualPath := setting.Attachment.Minio.RealPath + modelRelativePath |
|
|
|
|
|
|
|
if modelConvert.SrcEngine == PYTORCH_ENGINE { |
|
|
|
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX { |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchOnnxBootFile) |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchOnnxBootFile) |
|
|
|
} else if modelConvert.DestFormat == CONVERT_FORMAT_TRT { |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, PytorchTrTBootFile) |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.PytorchTrTBootFile) |
|
|
|
} else { |
|
|
|
return errors.New("Not support the format.") |
|
|
|
} |
|
|
|
} else if modelConvert.SrcEngine == TENSORFLOW_ENGINE { |
|
|
|
IMAGE_URL = GPU_TENSORFLOW_IMAGE |
|
|
|
IMAGE_URL = setting.ModelConvert.GPU_TENSORFLOW_IMAGE |
|
|
|
if modelConvert.DestFormat == CONVERT_FORMAT_ONNX { |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, TensorFlowGpuBootFile) |
|
|
|
command = getGpuModelConvertCommand(modelConvert.ID, modelConvert.ModelPath, modelConvert, setting.ModelConvert.TensorFlowGpuBootFile) |
|
|
|
} else { |
|
|
|
return errors.New("Not support the format.") |
|
|
|
} |
|
|
@@ -355,7 +354,7 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
log.Info("command=" + command) |
|
|
|
|
|
|
|
codePath := setting.JobPath + modelConvert.ID + CodeMountPath |
|
|
|
downloadConvertCode(ConvertRepoPath, codePath, DefaultBranchName) |
|
|
|
downloadConvertCode(setting.ModelConvert.ConvertRepoPath, codePath, DefaultBranchName) |
|
|
|
|
|
|
|
uploadCodeToMinio(codePath+"/", modelConvert.ID, CodeMountPath+"/") |
|
|
|
|
|
|
@@ -373,11 +372,11 @@ func createGpuTrainJob(modelConvert *models.AiModelConvert, ctx *context.Context |
|
|
|
if TrainResourceSpecs == nil { |
|
|
|
json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) |
|
|
|
} |
|
|
|
resourceSpec := TrainResourceSpecs.ResourceSpec[GPU_Resource_Specs_ID] |
|
|
|
resourceSpec := TrainResourceSpecs.ResourceSpec[setting.ModelConvert.GPU_Resource_Specs_ID] |
|
|
|
jobResult, err := cloudbrain.CreateJob(modelConvert.ID, models.CreateJobParams{ |
|
|
|
JobName: modelConvert.ID, |
|
|
|
RetryCount: 1, |
|
|
|
GpuType: GpuQueue, |
|
|
|
GpuType: setting.ModelConvert.GpuQueue, |
|
|
|
Image: IMAGE_URL, |
|
|
|
TaskRoles: []models.TaskRole{ |
|
|
|
{ |
|
|
|