Browse Source

fix-1968

tags/v1.22.7.2^2
liuzx 3 years ago
parent
commit
0b4c4b4c22
2 changed files with 45 additions and 380 deletions
  1. +18
    -266
      routers/repo/cloudbrain.go
  2. +27
    -114
      routers/repo/grampus.go

+ 18
- 266
routers/repo/cloudbrain.go View File

@@ -2,7 +2,6 @@ package repo

import (
"bufio"
"code.gitea.io/gitea/modules/notification"
"encoding/json"
"errors"
"fmt"
@@ -16,6 +15,8 @@ import (
"time"
"unicode/utf8"

"code.gitea.io/gitea/modules/notification"

"code.gitea.io/gitea/modules/grampus"

"code.gitea.io/gitea/modules/timeutil"
@@ -219,255 +220,6 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {

return nil
}
// cloudBrainTrainJobErrorPrepare repopulates ctx.Data so that the cloudbrain
// job creation page can be rendered again after a failed submission. It loads
// the benchmark, GPU and resource-spec option lists, applies the special-pool
// overrides of the first pool whose organization the current user belongs to,
// and echoes the values submitted in form back to the template.
//
// It returns a non-nil error only when form.Params is not valid JSON; in that
// case ctx.ServerError has already been called. A failed dataset lookup is
// logged and merely leaves "dataset_name" unset, so the remaining form fields
// are still restored.
func cloudBrainTrainJobErrorPrepare(ctx *context.Context, form auth.CreateCloudBrainForm) error {
	ctx.Data["PageIsCloudBrain"] = true

	// loadSetting decodes a JSON setting into dst and logs a decode failure
	// instead of dropping it. The target may still be nil afterwards, which
	// is why every cache below is nil-checked before it is dereferenced.
	loadSetting := func(name, raw string, dst interface{}) {
		if err := json.Unmarshal([]byte(raw), dst); err != nil {
			log.Error("json.Unmarshal %s failed: %v", name, err)
		}
	}

	if categories == nil {
		loadSetting("BenchmarkCategory", setting.BenchmarkCategory, &categories)
	}
	if categories != nil {
		ctx.Data["benchmark_categories"] = categories.Category
	}

	ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType

	// Queue details are optional for rendering; a lookup error simply leaves
	// "QueuesDetail" unset.
	queuesDetail, _ := cloudbrain.GetQueuesDetail()
	if queuesDetail != nil {
		ctx.Data["QueuesDetail"] = queuesDetail
	}

	cloudbrain.InitSpecialPool()

	if gpuInfos == nil {
		loadSetting("GpuTypes", setting.GpuTypes, &gpuInfos)
	}
	if gpuInfos != nil {
		ctx.Data["gpu_types"] = gpuInfos.GpuInfo
	}

	if trainGpuInfos == nil {
		loadSetting("TrainGpuTypes", setting.TrainGpuTypes, &trainGpuInfos)
	}
	if trainGpuInfos != nil {
		ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo
	}

	if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" {
		loadSetting("InferenceGpuTypes", setting.InferenceGpuTypes, &inferenceGpuInfos)
	}
	if inferenceGpuInfos != nil {
		ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo
	}

	if benchmarkGpuInfos == nil {
		loadSetting("BenchmarkGpuTypes", setting.BenchmarkGpuTypes, &benchmarkGpuInfos)
	}
	if benchmarkGpuInfos != nil {
		ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo
	}

	if benchmarkResourceSpecs == nil {
		loadSetting("BenchmarkResourceSpecs", setting.BenchmarkResourceSpecs, &benchmarkResourceSpecs)
	}
	if benchmarkResourceSpecs != nil {
		ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec
	}

	if cloudbrain.ResourceSpecs == nil {
		loadSetting("ResourceSpecs", setting.ResourceSpecs, &cloudbrain.ResourceSpecs)
	}
	if cloudbrain.ResourceSpecs != nil {
		ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec
	}

	if cloudbrain.TrainResourceSpecs == nil {
		loadSetting("TrainResourceSpecs", setting.TrainResourceSpecs, &cloudbrain.TrainResourceSpecs)
	}
	if cloudbrain.TrainResourceSpecs != nil {
		ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
	}

	if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" {
		loadSetting("InferenceResourceSpecs", setting.InferenceResourceSpecs, &cloudbrain.InferenceResourceSpecs)
	}
	if cloudbrain.InferenceResourceSpecs != nil {
		ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec
	}

	if cloudbrain.SpecialPools != nil {
		var debugGpuTypes []*models.GpuInfo
		var trainGpuTypes []*models.GpuInfo

		for _, pool := range cloudbrain.SpecialPools.Pools {
			// Lookup errors are treated the same as "not a member": the
			// pool is skipped and the default lists stay in place.
			org, _ := models.GetOrgByName(pool.Org)
			if org == nil {
				continue
			}
			isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
			if !isOrgMember {
				continue
			}

			for _, jobType := range pool.JobType {
				switch jobType {
				case string(models.JobTypeDebug):
					debugGpuTypes = append(debugGpuTypes, pool.Pool...)
					if pool.ResourceSpec != nil {
						ctx.Data["resource_specs"] = pool.ResourceSpec
					}
				case string(models.JobTypeTrain):
					trainGpuTypes = append(trainGpuTypes, pool.Pool...)
					if pool.ResourceSpec != nil {
						ctx.Data["train_resource_specs"] = pool.ResourceSpec
					}
				}
			}
			// Only the first pool the user is a member of is applied.
			break
		}

		if len(debugGpuTypes) > 0 {
			ctx.Data["gpu_types"] = debugGpuTypes
		}
		if len(trainGpuTypes) > 0 {
			ctx.Data["train_gpu_types"] = trainGpuTypes
		}
	}

	var Parameters modelarts.Parameters
	if err := json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
		ctx.ServerError("json.Unmarshal failed:", err)
		return err
	}
	ctx.Data["params"] = Parameters.Parameter
	ctx.Data["boot_file"] = form.BootFile
	ctx.Data["attachment"] = form.Attachment

	// A dataset lookup failure must not prevent the rest of the submitted
	// values from being restored, so only "dataset_name" is skipped.
	_, datasetNames, err := models.GetDatasetInfo(form.Attachment)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
	} else {
		ctx.Data["dataset_name"] = datasetNames
	}
	ctx.Data["branch_name"] = form.BranchName
	ctx.Data["datasetType"] = models.TypeCloudBrainOne

	ctx.Data["display_job_name"] = form.DisplayJobName
	ctx.Data["image"] = form.Image
	ctx.Data["job_type"] = form.JobType
	ctx.Data["gpu_type"] = form.GpuType
	ctx.Data["resource_spec_id"] = form.ResourceSpecId
	return nil
}

// cloudBrainInferenceJobErrorPrepare repopulates ctx.Data so that the
// cloudbrain inference job creation page can be rendered again after a failed
// submission. It loads the benchmark, GPU and resource-spec option lists,
// applies the special-pool overrides of the first pool whose organization the
// current user belongs to, and echoes the values submitted in form (including
// the model and checkpoint selection) back to the template.
//
// It returns a non-nil error only when form.Params is not valid JSON; in that
// case ctx.ServerError has already been called. A failed dataset lookup is
// logged and merely leaves "dataset_name" unset, so the remaining form fields
// are still restored.
func cloudBrainInferenceJobErrorPrepare(ctx *context.Context, form auth.CreateCloudBrainInferencForm) error {
	ctx.Data["PageIsCloudBrain"] = true

	// loadSetting decodes a JSON setting into dst and logs a decode failure
	// instead of dropping it. The target may still be nil afterwards, which
	// is why every cache below is nil-checked before it is dereferenced.
	loadSetting := func(name, raw string, dst interface{}) {
		if err := json.Unmarshal([]byte(raw), dst); err != nil {
			log.Error("json.Unmarshal %s failed: %v", name, err)
		}
	}

	if categories == nil {
		loadSetting("BenchmarkCategory", setting.BenchmarkCategory, &categories)
	}
	if categories != nil {
		ctx.Data["benchmark_categories"] = categories.Category
	}

	ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType

	// Queue details are optional for rendering; a lookup error simply leaves
	// "QueuesDetail" unset.
	queuesDetail, _ := cloudbrain.GetQueuesDetail()
	if queuesDetail != nil {
		ctx.Data["QueuesDetail"] = queuesDetail
	}

	cloudbrain.InitSpecialPool()

	if gpuInfos == nil {
		loadSetting("GpuTypes", setting.GpuTypes, &gpuInfos)
	}
	if gpuInfos != nil {
		ctx.Data["gpu_types"] = gpuInfos.GpuInfo
	}

	if trainGpuInfos == nil {
		loadSetting("TrainGpuTypes", setting.TrainGpuTypes, &trainGpuInfos)
	}
	if trainGpuInfos != nil {
		ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo
	}

	if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" {
		loadSetting("InferenceGpuTypes", setting.InferenceGpuTypes, &inferenceGpuInfos)
	}
	if inferenceGpuInfos != nil {
		ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo
	}

	if benchmarkGpuInfos == nil {
		loadSetting("BenchmarkGpuTypes", setting.BenchmarkGpuTypes, &benchmarkGpuInfos)
	}
	if benchmarkGpuInfos != nil {
		ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo
	}

	if benchmarkResourceSpecs == nil {
		loadSetting("BenchmarkResourceSpecs", setting.BenchmarkResourceSpecs, &benchmarkResourceSpecs)
	}
	if benchmarkResourceSpecs != nil {
		ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec
	}

	if cloudbrain.ResourceSpecs == nil {
		loadSetting("ResourceSpecs", setting.ResourceSpecs, &cloudbrain.ResourceSpecs)
	}
	if cloudbrain.ResourceSpecs != nil {
		ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec
	}

	if cloudbrain.TrainResourceSpecs == nil {
		loadSetting("TrainResourceSpecs", setting.TrainResourceSpecs, &cloudbrain.TrainResourceSpecs)
	}
	if cloudbrain.TrainResourceSpecs != nil {
		ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
	}

	if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" {
		loadSetting("InferenceResourceSpecs", setting.InferenceResourceSpecs, &cloudbrain.InferenceResourceSpecs)
	}
	if cloudbrain.InferenceResourceSpecs != nil {
		ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec
	}

	if cloudbrain.SpecialPools != nil {
		var debugGpuTypes []*models.GpuInfo
		var trainGpuTypes []*models.GpuInfo

		for _, pool := range cloudbrain.SpecialPools.Pools {
			// Lookup errors are treated the same as "not a member": the
			// pool is skipped and the default lists stay in place.
			org, _ := models.GetOrgByName(pool.Org)
			if org == nil {
				continue
			}
			isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
			if !isOrgMember {
				continue
			}

			for _, jobType := range pool.JobType {
				switch jobType {
				case string(models.JobTypeDebug):
					debugGpuTypes = append(debugGpuTypes, pool.Pool...)
					if pool.ResourceSpec != nil {
						ctx.Data["resource_specs"] = pool.ResourceSpec
					}
				case string(models.JobTypeTrain):
					trainGpuTypes = append(trainGpuTypes, pool.Pool...)
					if pool.ResourceSpec != nil {
						ctx.Data["train_resource_specs"] = pool.ResourceSpec
					}
				}
			}
			// Only the first pool the user is a member of is applied.
			break
		}

		if len(debugGpuTypes) > 0 {
			ctx.Data["gpu_types"] = debugGpuTypes
		}
		if len(trainGpuTypes) > 0 {
			ctx.Data["train_gpu_types"] = trainGpuTypes
		}
	}

	var Parameters modelarts.Parameters
	if err := json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
		ctx.ServerError("json.Unmarshal failed:", err)
		return err
	}
	ctx.Data["params"] = Parameters.Parameter
	ctx.Data["boot_file"] = form.BootFile
	ctx.Data["attachment"] = form.Attachment

	// A dataset lookup failure must not prevent the rest of the submitted
	// values from being restored, so only "dataset_name" is skipped.
	_, datasetNames, err := models.GetDatasetInfo(form.Attachment)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
	} else {
		ctx.Data["dataset_name"] = datasetNames
	}
	ctx.Data["branch_name"] = form.BranchName
	ctx.Data["datasetType"] = models.TypeCloudBrainOne

	ctx.Data["display_job_name"] = form.DisplayJobName
	ctx.Data["image"] = form.Image
	ctx.Data["job_type"] = form.JobType
	ctx.Data["gpu_type"] = form.GpuType
	ctx.Data["resource_spec_id"] = form.ResourceSpecId

	// Inference-specific selections.
	ctx.Data["label_names"] = form.LabelName
	ctx.Data["train_url"] = form.TrainUrl
	ctx.Data["ckpt_name"] = form.CkptName
	ctx.Data["model_name"] = form.ModelName
	ctx.Data["model_version"] = form.ModelVersion
	ctx.Data["description"] = form.Description
	return nil
}

func CloudBrainNew(ctx *context.Context) {
err := cloudBrainNewDataPrepare(ctx)
@@ -500,28 +252,28 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("the job name did already exist", tpl, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("system error", tpl, &form)
return
}
}

if !jobNamePattern.MatchString(displayJobName) {
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}

if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeTrain) {
log.Error("jobtype error:", jobType, ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("jobtype error", tpl, &form)
return
}
@@ -529,13 +281,13 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("system error", tpl, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpl, &form)
return
}
@@ -544,7 +296,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuids)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
@@ -565,7 +317,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)

if errStr != "" {
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(errStr, tpl, &form)
return
}
@@ -611,7 +363,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {

err = cloudbrain.GenerateTask(req)
if err != nil {
cloudBrainTrainJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}
@@ -651,21 +403,21 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("the job name did already exist", tpl, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("system error", tpl, &form)
return
}
}

if !jobNamePattern.MatchString(displayJobName) {
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form)
return
}
@@ -673,13 +425,13 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, jobType)
if err != nil {
log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"])
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("system error", tpl, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain.morethanonejob"), tpl, &form)
return
}
@@ -699,7 +451,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
datasetInfos, datasetNames, err := models.GetDatasetInfo(uuid)
if err != nil {
log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr("cloudbrain.error.dataset_select"), tpl, &form)
return
}
@@ -736,7 +488,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra

err = cloudbrain.GenerateTask(req)
if err != nil {
cloudBrainInferenceJobErrorPrepare(ctx, form)
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(err.Error(), tpl, &form)
return
}


+ 27
- 114
routers/repo/grampus.go View File

@@ -138,93 +138,6 @@ func grampusTrainJobNewDataPrepare(ctx *context.Context, processType string) err
return nil
}

// grampusTrainJobErrorPrepare repopulates ctx.Data so that the grampus train
// job creation page for processType can be rendered again after a failed
// submission. It loads the available images and resource specs, applies the
// special-pool rules for the current user, and echoes the values submitted in
// form back to the template.
//
// Failures to fetch images, resource specs or dataset names are logged and
// only leave the corresponding key unset. A non-nil error is returned only
// when form.Params is not valid JSON; in that case ctx.ServerError has already
// been called.
func grampusTrainJobErrorPrepare(ctx *context.Context, processType string, form auth.CreateGrampusTrainJobForm) error {
	ctx.Data["PageIsCloudBrain"] = true

	//get valid images
	images, err := grampus.GetImages(processType)
	if err != nil {
		log.Error("GetImages failed: %v", err)
	} else {
		ctx.Data["images"] = images.Infos
	}

	grampus.InitSpecialPool()

	ctx.Data["GPUEnabled"] = true
	ctx.Data["NPUEnabled"] = true
	includeCenters := make(map[string]struct{})
	excludeCenters := make(map[string]struct{})
	if grampus.SpecialPools != nil {
		// Loop-invariant: lower-case the processor type once.
		lowerProcessType := strings.ToLower(processType)
		for _, pool := range grampus.SpecialPools.Pools {
			if pool.IsExclusive {
				// Exclusive pools disable the whole processor type for
				// users outside the pool's organizations.
				if !IsUserInOrgPool(ctx.User.ID, pool) {
					ctx.Data[pool.Type+"Enabled"] = false
				}
				continue
			}
			if !strings.Contains(lowerProcessType, strings.ToLower(pool.Type)) {
				continue
			}

			// Non-exclusive pools: members get the pool's centers added to
			// the include set, everyone else gets them excluded.
			centers := excludeCenters
			if IsUserInOrgPool(ctx.User.ID, pool) {
				centers = includeCenters
			}
			for _, center := range pool.Pool {
				centers[center.Queue] = struct{}{}
			}
		}
	}

	//get valid resource specs
	// The error is checked before specs is used: the filter reads
	// specs.Infos, so it must not run on a failed lookup.
	specs, err := grampus.GetResourceSpecs(processType)
	if err != nil {
		log.Error("GetResourceSpecs failed: %v", err)
	} else {
		ctx.Data["flavor_infos"] = getFilterSpecBySpecialPool(specs, includeCenters, excludeCenters)
	}

	switch processType {
	case grampus.ProcessorTypeGPU:
		ctx.Data["datasetType"] = models.TypeCloudBrainOne
	case grampus.ProcessorTypeNPU:
		ctx.Data["datasetType"] = models.TypeCloudBrainTwo
	}

	var Parameters modelarts.Parameters
	if err := json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
		ctx.ServerError("json.Unmarshal failed:", err)
		return err
	}
	ctx.Data["params"] = Parameters.Parameter
	ctx.Data["boot_file"] = form.BootFile
	ctx.Data["attachment"] = form.Attachment

	// A dataset lookup failure must not prevent the rest of the submitted
	// values from being restored, so only "dataset_name" is skipped.
	_, datasetNames, err := models.GetDatasetInfo(form.Attachment)
	if err != nil {
		log.Error("GetDatasetInfo failed: %v", err, ctx.Data["MsgID"])
	} else {
		ctx.Data["dataset_name"] = datasetNames
	}
	ctx.Data["branch_name"] = form.BranchName
	ctx.Data["image_id"] = form.ImageID

	ctx.Data["display_job_name"] = form.DisplayJobName
	ctx.Data["image"] = form.Image
	ctx.Data["flavor"] = form.FlavorID
	ctx.Data["flavor_name"] = form.FlavorName
	ctx.Data["description"] = form.Description
	ctx.Data["engine_name"] = form.EngineName
	ctx.Data["work_server_number"] = form.WorkServerNumber

	return nil
}

func getFilterSpecBySpecialPool(specs *models.GetGrampusResourceSpecsResult, includeCenters map[string]struct{}, excludeCenters map[string]struct{}) []models.GrampusSpec {
if len(includeCenters) == 0 && len(excludeCenters) == 0 {
return specs.Infos
@@ -295,14 +208,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
image := strings.TrimSpace(form.Image)

if !jobNamePattern.MatchString(displayJobName) {
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobGPUNew, &form)
return
}

errStr := checkSpecialPool(ctx, "GPU")
if errStr != "" {
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(errStr, tplGrampusTrainJobGPUNew, &form)
return
}
@@ -311,13 +224,13 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.GPUResource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -326,7 +239,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//check param
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form)
return
}
@@ -336,14 +249,14 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobGPUNew, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -353,7 +266,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
attachment, err := models.GetAttachmentByUUID(uuid)
if err != nil {
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -366,7 +279,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain

if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -375,7 +288,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//upload code
if err := uploadCodeToMinio(codeLocalPath+"/", jobName, cloudbrain.CodeMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -383,7 +296,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/"
if err := mkModelPath(modelPath); err != nil {
log.Error("Failed to mkModelPath: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -391,7 +304,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//init model readme
if err := uploadCodeToMinio(modelPath, jobName, cloudbrain.ModelMountPath+"/"); err != nil {
log.Error("Failed to uploadCodeToMinio: %s (%v)", repo.FullName(), err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -400,7 +313,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
command, err := generateCommand(repo.Name, grampus.ProcessorTypeGPU, codeMinioPath+cloudbrain.DefaultBranchName+".zip", dataMinioPath, bootFile, params, setting.CBCodePathPrefix+jobName+cloudbrain.ModelMountPath+"/", attachment.Name)
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobGPUNew, &form)
return
}
@@ -432,7 +345,7 @@ func GrampusTrainJobGpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error(), ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeGPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobGPUNew, &form)
return
}
@@ -479,14 +392,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
engineName := form.EngineName

if !jobNamePattern.MatchString(displayJobName) {
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplGrampusTrainJobNPUNew, &form)
return
}

errStr := checkSpecialPool(ctx, "NPU")
if errStr != "" {
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(errStr, tplGrampusTrainJobGPUNew, &form)
return
}
@@ -495,13 +408,13 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
count, err := models.GetGrampusCountByUserID(ctx.User.ID, string(models.JobTypeTrain), models.NPUResource)
if err != nil {
log.Error("GetGrampusCountByUserID failed:%v", err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
return
} else {
if count >= 1 {
log.Error("the user already has running or waiting task", ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -510,7 +423,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//check param
if err := grampusParamCheckCreateTrainJob(form); err != nil {
log.Error("paramCheckCreateTrainJob failed:(%v)", err)
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}
@@ -520,14 +433,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
if err == nil {
if len(tasks) != 0 {
log.Error("the job name did already exist", ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("the job name did already exist", tplGrampusTrainJobNPUNew, &form)
return
}
} else {
if !models.IsErrJobNotExist(err) {
log.Error("system error, %v", err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("system error", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -537,7 +450,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
attachment, err := models.GetAttachmentByUUID(uuid)
if err != nil {
log.Error("GetAttachmentByUUID failed:", err.Error(), ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("dataset is not exist", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -550,7 +463,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain

if err := downloadZipCode(ctx, codeLocalPath, branchName); err != nil {
log.Error("downloadZipCode failed, server timed out: %s (%v)", repo.FullName(), err)
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, server timed out", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -558,14 +471,14 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
//todo: upload code (send to file_server todo this work?)
if err := obsMkdir(setting.CodePathPrefix + jobName + modelarts.OutputPath); err != nil {
log.Error("Failed to obsMkdir_output: %s (%v)", repo.FullName(), err)
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Failed to obsMkdir_output", tplGrampusTrainJobNPUNew, &form)
return
}

if err := uploadCodeToObs(codeLocalPath, jobName, ""); err != nil {
log.Error("Failed to uploadCodeToObs: %s (%v)", repo.FullName(), err)
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Failed to uploadCodeToObs", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -574,7 +487,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
command, err := generateCommand(repo.Name, grampus.ProcessorTypeNPU, codeObsPath+cloudbrain.DefaultBranchName+".zip", dataObsPath+"'"+attachment.Name+"'", bootFile, params, setting.CodePathPrefix+jobName+modelarts.OutputPath, attachment.Name)
if err != nil {
log.Error("Failed to generateCommand: %s (%v)", displayJobName, err, ctx.Data["MsgID"])
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr("Create task failed, internal error", tplGrampusTrainJobNPUNew, &form)
return
}
@@ -610,7 +523,7 @@ func GrampusTrainJobNpuCreate(ctx *context.Context, form auth.CreateGrampusTrain
err = grampus.GenerateTrainJob(ctx, req)
if err != nil {
log.Error("GenerateTrainJob failed:%v", err.Error())
grampusTrainJobErrorPrepare(ctx, grampus.ProcessorTypeNPU, form)
grampusTrainJobNewDataPrepare(ctx, grampus.ProcessorTypeGPU)
ctx.RenderWithErr(err.Error(), tplGrampusTrainJobNPUNew, &form)
return
}


Loading…
Cancel
Save