From 6a9154af31a9bf9344644901be1b2e2267b67930 Mon Sep 17 00:00:00 2001 From: lewis <747342561@qq.com> Date: Tue, 7 Jun 2022 18:09:30 +0800 Subject: [PATCH] gpu train --- modules/grampus/grampus.go | 2 +- modules/setting/setting.go | 2 ++ routers/repo/grampus.go | 33 +++++++++++++++++++++++++-------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/modules/grampus/grampus.go b/modules/grampus/grampus.go index 1250f0d0d..1375ecd48 100755 --- a/modules/grampus/grampus.go +++ b/modules/grampus/grampus.go @@ -15,7 +15,7 @@ const ( ProcessorTypeNPU = "npu.huawei.com/NPU" ProcessorTypeGPU = "nvidia.com/gpu" - CommandPrepareScript = "pwd;cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/lewis/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs;" + CommandPrepareScript = "pwd;cd /tmp;mkdir -p output;mkdir -p code;mkdir -p dataset;wget -q https://git.openi.org.cn/OpenIOSSG/script_for_grampus/archive/master.zip;unzip -q master.zip;cd script_for_grampus;chmod 777 downloader_for_obs uploader_for_obs downloader_for_minio uploader_for_minio;" CodeArchiveName = "master.zip" ) diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 945a7c6f8..abee77579 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -530,6 +530,7 @@ var ( //grampus config Grampus = struct { + Env string Host string UserName string Password string @@ -1395,6 +1396,7 @@ func NewContext() { func GetGrampusConfig() { sec := Cfg.Section("grampus") + Grampus.Env = sec.Key("ENV").MustString("TEST") Grampus.Host = sec.Key("SERVER_HOST").MustString("") Grampus.UserName = sec.Key("USERNAME").MustString("") Grampus.Password = sec.Key("PASSWORD").MustString("") diff --git a/routers/repo/grampus.go b/routers/repo/grampus.go index b348e92e7..983f98d14 100755 --- a/routers/repo/grampus.go +++ b/routers/repo/grampus.go @@ -114,14 +114,16 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain bootFile := form.BootFile params := form.Params repo := ctx.Repo.Repository - codeLocalPath := setting.JobPath + jobName + modelarts.CodePath - //codeObsPath := grampus.JobPath + jobName + modelarts.CodePath - //dataObsPath := setting.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" + codeLocalPath := setting.JobPath + jobName + cloudbrain.CodeMountPath + "/" + codeMinioPath := setting.CBCodePathPrefix + jobName + cloudbrain.CodeMountPath + "/" + dataMinioPath := setting.Attachment.Minio.BasePath + path.Join(uuid[0:1], uuid[1:2]) + "/" + uuid + "/" branchName := form.BranchName flavorName := form.FlavorName engineName := form.EngineName image := strings.TrimSpace(form.Image) + jobName = displayJobName + if !jobNamePattern.MatchString(displayJobName) { ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew, &form) return @@ -192,6 +194,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //todo: upload code (send to file_server todo this work?) + //upload code if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) @@ -207,6 +210,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain return } + //init model readme if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil { log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) @@ -215,8 +219,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain } //prepare command - var codeObsPath, dataObsPath string - command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) + command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name) if err != nil { log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) @@ -242,6 +245,8 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain FlavorName: flavorName, EngineName: engineName, DatasetName: attachment.Name, + IsLatestVersion: modelarts.IsLatestVersion, + VersionCount: modelarts.VersionCount, } err = grampus.GenerateTrainJob(ctx, req) @@ -357,6 +362,12 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain //prepare command command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+attachment.Name, bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) + if err != nil { + log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) + grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeNPU) + ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form) + return + } commitID, _ := ctx.Repo.GitRepo.GetBranchCommitID(branchName) @@ -597,7 +608,8 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo commandDownload := "./downloader_for_obs " + setting.Bucket + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " " + datasetName + ";" command += commandDownload } else if processorType == grampus.ProcessorTypeGPU { - + commandDownload := "./downloader_for_minio " + setting.Grampus.Env + " " + codeRemotePath + " " + grampus.CodeArchiveName + " " + dataRemotePath + " " + datasetName + ";" + command += commandDownload } //unzip code & dataset @@ -636,8 +648,13 @@ func generateCommand(repoName, processorType, codeRemotePath, dataRemotePath, bo command += commandGetRes //upload models - commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_obs " + setting.Bucket + " " + outputRemotePath + " " + "/tmp/output/;" - command += commandUpload + if processorType == grampus.ProcessorTypeNPU { + commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_obs " + setting.Bucket + " " + outputRemotePath + " " + "/tmp/output/;" + command += commandUpload + } else if processorType == grampus.ProcessorTypeGPU { + commandUpload := "cd /tmp/script_for_grampus/;./uploader_for_minio " + setting.Grampus.Env + " " + outputRemotePath + " " + "/tmp/output/;" + command += commandUpload + } //check exec result commandCheckRes := " [[ result -eq 0 ]] && echo success || ls failed;"