|
|
|
@@ -138,93 +138,6 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err |
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func grampusTrainJobErrorPrepare(ctx *context.Context, processType string, form auth.CreateGrampusTrainJobForm) error { |
|
|
|
ctx.Data["PageIsCloudBrain"] = true |
|
|
|
|
|
|
|
//get valid images |
|
|
|
images, err := grampus.GetImages(processType) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetImages failed:", err.Error()) |
|
|
|
} else { |
|
|
|
ctx.Data["images"] = images.Infos |
|
|
|
} |
|
|
|
|
|
|
|
grampus.InitSpecialPool() |
|
|
|
|
|
|
|
ctx.Data["GPUEnabled"] = true |
|
|
|
ctx.Data["NPUEnabled"] = true |
|
|
|
includeCenters := make(map[string]struct{}) |
|
|
|
excludeCenters := make(map[string]struct{}) |
|
|
|
if grampus.SpecialPools != nil { |
|
|
|
for _, pool := range grampus.SpecialPools.Pools { |
|
|
|
if pool.IsExclusive { |
|
|
|
if !IsUserInOrgPool(ctx.User.ID, pool) { |
|
|
|
ctx.Data[pool.Type+"Enabled"] = false |
|
|
|
} |
|
|
|
} else { |
|
|
|
if strings.Contains(strings.ToLower(processType), strings.ToLower(pool.Type)) { |
|
|
|
if IsUserInOrgPool(ctx.User.ID, pool) { |
|
|
|
for _, center := range pool.Pool { |
|
|
|
includeCenters[center.Queue] = struct{}{} |
|
|
|
} |
|
|
|
} else { |
|
|
|
for _, center := range pool.Pool { |
|
|
|
excludeCenters[center.Queue] = struct{}{} |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
//get valid resource specs |
|
|
|
specs, err := grampus.GetResourceSpecs(processType) |
|
|
|
|
|
|
|
grampusSpecs := getFilterSpecBySpecialPool(specs, includeCenters, excludeCenters) |
|
|
|
|
|
|
|
if err != nil { |
|
|
|
log.Error("GetResourceSpecs failed:", err.Error()) |
|
|
|
} else { |
|
|
|
ctx.Data["flavor_infos"] = grampusSpecs |
|
|
|
} |
|
|
|
|
|
|
|
if processType == grampus.ProcessorTypeGPU { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainOne |
|
|
|
} else if processType == grampus.ProcessorTypeNPU { |
|
|
|
ctx.Data["datasetType"] = models.TypeCloudBrainTwo |
|
|
|
} |
|
|
|
|
|
|
|
var Parameters modelarts.Parameters |
|
|
|
if err := json.Unmarshal([]byte(form.Params), &Parameters); err != nil { |
|
|
|
ctx.ServerError("json.Unmarshal failed:", err) |
|
|
|
return err |
|
|
|
} |
|
|
|
ctx.Data["params"] = Parameters.Parameter |
|
|
|
ctx.Data["boot_file"] = form.BootFile |
|
|
|
ctx.Data["attachment"] = form.Attachment |
|
|
|
_, datasetNames, err := models.GetDatasetInfo(form.Attachment) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"]) |
|
|
|
return nil |
|
|
|
} |
|
|
|
ctx.Data["dataset_name"] = datasetNames |
|
|
|
ctx.Data["branch_name"] = form.BranchName |
|
|
|
ctx.Data["image_id"] = form.ImageID |
|
|
|
|
|
|
|
ctx.Data["display_job_name"] = form.DisplayJobName |
|
|
|
ctx.Data["image"] = form.Image |
|
|
|
ctx.Data["flavor"] = form.FlavorID |
|
|
|
ctx.Data["flavor_name"] = form.FlavorName |
|
|
|
ctx.Data["description"] = form.Description |
|
|
|
ctx.Data["engine_name"] = form.EngineName |
|
|
|
ctx.Data["work_server_number"] = form.WorkServerNumber |
|
|
|
|
|
|
|
return nil |
|
|
|
} |
|
|
|
|
|
|
|
func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec { |
|
|
|
if len(includeCenters) == 0 && len(excludeCenters) == 0 { |
|
|
|
return specs.Infos |
|
|
|
@@ -295,14 +208,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
image := strings.TrimSpace(form.Image) |
|
|
|
|
|
|
|
if !jobNamePattern.MatchString(displayJobName) { |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errStr := checkSpecialPool(ctx, "GPU") |
|
|
|
if errStr != "" { |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(errStr, tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -311,13 +224,13 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.GPUResource) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} else { |
|
|
|
if count >= 1 { |
|
|
|
log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -326,7 +239,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
//check param |
|
|
|
if err := grampusParamCheckCreateTrainJob(form); err != nil { |
|
|
|
log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -336,14 +249,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
if err == nil { |
|
|
|
if len(tasks) != 0 { |
|
|
|
log.Error("the job name did already exist", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} else { |
|
|
|
if !models.IsErrJobNotExist(err) { |
|
|
|
log.Error("system error, %v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -353,7 +266,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
attachment, err := models.GetAttachmentByUUID(uuid) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -366,7 +279,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
|
|
|
|
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { |
|
|
|
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -375,7 +288,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
//upload code |
|
|
|
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -383,7 +296,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" |
|
|
|
if err := mkModelPath(modelPath); err != nil { |
|
|
|
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -391,7 +304,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
//init model readme |
|
|
|
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -400,7 +313,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -432,7 +345,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
err = grampus.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -479,14 +392,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
engineName := form.EngineName |
|
|
|
|
|
|
|
if !jobNamePattern.MatchString(displayJobName) { |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
errStr := checkSpecialPool(ctx, "NPU") |
|
|
|
if errStr != "" { |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(errStr, tplGrampusTrainJobGPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -495,13 +408,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} else { |
|
|
|
if count >= 1 { |
|
|
|
log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -510,7 +423,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
//check param |
|
|
|
if err := grampusParamCheckCreateTrainJob(form); err != nil { |
|
|
|
log.Error("paramCheckCreateTrainJob failed:(%v)", err) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -520,14 +433,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
if err == nil { |
|
|
|
if len(tasks) != 0 { |
|
|
|
log.Error("the job name did already exist", ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
} else { |
|
|
|
if !models.IsErrJobNotExist(err) { |
|
|
|
log.Error("system error, %v", err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -537,7 +450,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
attachment, err := models.GetAttachmentByUUID(uuid) |
|
|
|
if err != nil { |
|
|
|
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -550,7 +463,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
|
|
|
|
if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil { |
|
|
|
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -558,14 +471,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
//todo: upload code (send to file_server todo this work?) |
|
|
|
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil { |
|
|
|
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|
|
|
|
if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil { |
|
|
|
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -574,7 +487,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+"'"+attachment.Name+"'", bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name) |
|
|
|
if err != nil { |
|
|
|
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"]) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
@@ -610,7 +523,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain |
|
|
|
err = grampus.GenerateTrainJob(ctx, req) |
|
|
|
if err != nil { |
|
|
|
log.Error("GenerateTrainJob failed:%v", err.Error()) |
|
|
|
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form) |
|
|
|
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU) |
|
|
|
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form) |
|
|
|
return |
|
|
|
} |
|
|
|
|