diff --git a/models/cloudbrain.go b/models/cloudbrain.go
index 7e986be52..3c4a3ae98 100755
--- a/models/cloudbrain.go
+++ b/models/cloudbrain.go
@@ -2313,3 +2313,9 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
 		In("id", ids).
 		Find(&cloudbrains)
 }
+
+func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) {
+	cloudbrains := make([]*Cloudbrain, 0)
+	return cloudbrains, x.
+		In("id", ids).Unscoped().Find(&cloudbrains)
+}
diff --git a/models/cloudbrain_spec.go b/models/cloudbrain_spec.go
index 32f184037..c891c2fad 100644
--- a/models/cloudbrain_spec.go
+++ b/models/cloudbrain_spec.go
@@ -83,3 +83,26 @@ func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) {
 	}
 	return r, nil
 }
+
+func FindNoSpecHistoricTask(page, pageSize int) ([]*Cloudbrain, error) {
+	r := make([]*Cloudbrain, 0)
+	err := x.Unscoped().
+		Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
+		Limit(pageSize, (page-1)*pageSize).
+		OrderBy("cloudbrain.id").
+		Find(&r)
+	if err != nil {
+		return nil, err
+	}
+	return r, nil
+}
+
+func CountNoSpecHistoricTask() (int64, error) {
+	n, err := x.Unscoped().
+		Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
+		Count(&Cloudbrain{})
+	if err != nil {
+		return 0, err
+	}
+	return n, nil
+}
diff --git a/models/resource_specification.go b/models/resource_specification.go
index 8ef95a8da..b2eadf2a1 100644
--- a/models/resource_specification.go
+++ b/models/resource_specification.go
@@ -147,6 +147,21 @@ type FindSpecsOptions struct {
 	Cluster string
 	AiCenterCode string
 	SpecId int64
+	QueueCode string
+	SourceSpecId string
+	AccCardsNum int
+	UseAccCardsNum bool
+	AccCardType string
+	CpuCores int
+	UseCpuCores bool
+	MemGiB float32
+	UseMemGiB bool
+	GPUMemGiB float32
+	UseGPUMemGiB bool
+	ShareMemGiB float32
+	UseShareMemGiB bool
+	//if true,find specs no matter used or not used in scene. if false,only find specs used in scene
+	RequestAll bool
 }
 
 type Specification struct {
@@ -316,9 +331,10 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS
 	return sess.Commit()
 }
 
-func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
+//FindSpecs
+func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) {
 	var cond = builder.NewCond()
-	if opts.JobType != "" {
+	if !opts.RequestAll && opts.JobType != "" {
 		cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType})
 	}
 	if opts.ComputeResource != "" {
@@ -333,17 +349,108 @@ func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
 	if opts.SpecId > 0 {
 		cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId})
 	}
-	cond = cond.And(builder.Or(builder.Eq{"resource_scene.delete_time": 0}, builder.IsNull{"resource_scene.delete_time"}))
-
+	if opts.QueueCode != "" {
+		cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode})
+	}
+	if opts.SourceSpecId != "" {
+		cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId})
+	}
+	if opts.UseAccCardsNum {
+		cond = cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum})
+	}
+	if opts.AccCardType != "" {
+		cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType})
+	}
+	if opts.UseCpuCores {
+		cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores})
+	}
+	if opts.UseMemGiB {
+		cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB})
+	}
+	if opts.UseGPUMemGiB {
+		cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB})
+	}
+	if opts.UseShareMemGiB {
+		cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB})
+	}
 	r := make([]*Specification, 0)
-	err := x.Where(cond).
-		Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
-		Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id").
-		Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id").
-		OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
+	s := x.Where(cond).
+		Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id")
+
+	if !opts.RequestAll {
+		s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
+			Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id")
+	}
+	err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
 		Unscoped().Find(&r)
 	if err != nil {
 		return nil, err
 	}
 	return r, nil
 }
+
+func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) {
+	sess := x.NewSession()
+	defer sess.Close()
+
+	sess.Begin()
+	param := ResourceQueue{
+		QueueCode: queue.QueueCode,
+		Cluster: queue.Cluster,
+		AiCenterCode: queue.AiCenterCode,
+		ComputeResource: queue.ComputeResource,
+		AccCardType: queue.AccCardType,
+	}
+	_, err := sess.Get(&param)
+	if err != nil {
+		sess.Rollback()
+		return nil, err
+	}
+	if param.ID == 0 {
+		_, err = sess.InsertOne(&queue)
+		if err != nil {
+			sess.Rollback()
+			return nil, err
+		}
+	} else {
+		queue = param
+	}
+
+	spec.QueueId = queue.ID
+	_, err = sess.InsertOne(&spec)
+	if err != nil {
+		sess.Rollback()
+		return nil, err
+	}
+	sess.Commit()
+	return &Specification{
+		ID: spec.ID,
+		SourceSpecId: spec.SourceSpecId,
+		AccCardsNum: spec.AccCardsNum,
+		AccCardType: queue.AccCardType,
+		CpuCores: spec.CpuCores,
+		MemGiB: spec.MemGiB,
+		GPUMemGiB: spec.GPUMemGiB,
+		ShareMemGiB: spec.ShareMemGiB,
+		ComputeResource: queue.ComputeResource,
+		UnitPrice: spec.UnitPrice,
+		QueueId: queue.ID,
+		QueueCode: queue.QueueCode,
+		Cluster: queue.Cluster,
+		AiCenterCode: queue.AiCenterCode,
+		AiCenterName: queue.AiCenterName,
+	}, nil
+}
+
+func GetCloudbrainOneAccCardType(queueCode string) string {
+	switch queueCode {
+	case "a100":
+		return "A100"
+	case "openidebug":
+		return "T4"
+	case "openidgx":
+		return "V100"
+
+	}
+	return ""
+}
diff --git a/modules/auth/modelarts.go b/modules/auth/modelarts.go
index ce41f5d1e..23e1f325a 100755
--- a/modules/auth/modelarts.go
+++ b/modules/auth/modelarts.go
@@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct {
 	Description string `form:"description"`
 	Flavor string `form:"flavor" binding:"Required"`
 	ImageId string `form:"image_id" binding:"Required"`
+	SpecId int64 `form:"spec_id" binding:"Required"`
 }
 
 func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
@@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct {
 	VersionName string `form:"version_name" binding:"Required"`
 	FlavorName string `form:"flaver_names" binding:"Required"`
 	EngineName string `form:"engine_names" binding:"Required"`
+	SpecId int64 `form:"spec_id" binding:"Required"`
 }
 
 type CreateModelArtsInferenceJobForm struct {
@@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct {
 	ModelName string `form:"model_name" binding:"Required"`
 	ModelVersion string `form:"model_version" binding:"Required"`
 	CkptName string `form:"ckpt_name" binding:"Required"`
+	SpecId int64 `form:"spec_id" binding:"Required"`
 }
 
 func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go
index 8dcf1b1a9..637f7a3b5 100755
--- a/modules/modelarts/modelarts.go
+++ b/modules/modelarts/modelarts.go
@@ -84,7 +84,6 @@ type GenerateTrainJobReq struct {
 	BootFileUrl string
 	DataUrl string
 	TrainUrl string
-	FlavorCode string
 	LogUrl string
 	PoolID string
 	WorkServerNumber int
@@ -96,6 +95,7 @@ type GenerateTrainJobReq struct {
 	BranchName string
 	PreVersionId int64
 	PreVersionName string
+	FlavorCode string
 	FlavorName string
 	VersionCount int
 	EngineName string
@@ -103,6 +103,7 @@ type GenerateTrainJobReq struct {
 	UserImageUrl string
 	UserCommand string
 	DatasetName string
+	Spec *models.Specification
 }
 
 type GenerateInferenceJobReq struct {
@@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct {
 	BootFileUrl string
 	DataUrl string
 	TrainUrl string
-	FlavorCode string
 	LogUrl string
 	PoolID string
 	WorkServerNumber int
@@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct {
 	ModelVersion string
 	CkptName string
 	ResultUrl string
+	Spec *models.Specification
 }
 
 type VersionInfo struct {
@@ -256,7 +257,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin
 	return nil
 }
 
-func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
+func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
 	if poolInfos == nil {
 		json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
 	}
@@ -270,7 +271,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
 	jobResult, err := createNotebook2(models.CreateNotebook2Params{
 		JobName: jobName,
 		Description: description,
-		Flavor: flavor,
+		Flavor: spec.SourceSpecId,
 		Duration: autoStopDurationMs,
 		ImageID: imageId,
 		PoolID: poolInfos.PoolInfo[0].PoolId,
@@ -292,7 +293,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
 		RepoID: ctx.Repo.Repository.ID,
 		JobID: jobResult.ID,
 		JobName: jobName,
-		FlavorCode: flavor,
+		FlavorCode: spec.SourceSpecId,
 		DisplayJobName: displayJobName,
 		JobType: string(models.JobTypeDebug),
 		Type: models.TypeCloudBrainTwo,
@@ -302,6 +303,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
 		Description: description,
 		CreatedUnix: createTime,
 		UpdatedUnix: createTime,
+		Spec: spec,
 	})
 
 	if err != nil {
@@ -335,7 +337,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
 			PoolID: req.PoolID,
 			CreateVersion: true,
 			Flavor: models.Flavor{
-				Code: req.FlavorCode,
+				Code: req.Spec.SourceSpecId,
 			},
 			Parameter: req.Parameters,
 			UserImageUrl: req.UserImageUrl,
@@ -357,7 +359,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
 			PoolID: req.PoolID,
 			CreateVersion: true,
 			Flavor: models.Flavor{
-				Code: req.FlavorCode,
+				Code: req.Spec.SourceSpecId,
 			},
 			Parameter: req.Parameters,
 		},
@@ -391,7 +393,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
 		BootFile: req.BootFile,
 		DataUrl: req.DataUrl,
 		LogUrl: req.LogUrl,
-		FlavorCode: req.FlavorCode,
+		FlavorCode: req.Spec.SourceSpecId,
 		Description: req.Description,
 		WorkServerNumber: req.WorkServerNumber,
 		FlavorName: req.FlavorName,
@@ -400,6 +402,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
 		TotalVersionCount: req.TotalVersionCount,
 		CreatedUnix: createTime,
 		UpdatedUnix: createTime,
+		Spec: req.Spec,
 	})
 
 	if createErr != nil {
@@ -451,7 +454,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
 			LogUrl: req.LogUrl,
 			PoolID: req.PoolID,
 			Flavor: models.Flavor{
-				Code: req.FlavorCode,
+				Code: req.Spec.SourceSpecId,
 			},
 			Parameter: req.Parameters,
 			PreVersionId: req.PreVersionId,
@@ -472,7 +475,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
 			LogUrl: req.LogUrl,
 			PoolID: req.PoolID,
 			Flavor: models.Flavor{
-				Code: req.FlavorCode,
+				Code: req.Spec.SourceSpecId,
 			},
 			Parameter: req.Parameters,
 			PreVersionId: req.PreVersionId,
@@ -524,7 +527,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
 		DataUrl: req.DataUrl,
 		LogUrl: req.LogUrl,
 		PreVersionId: req.PreVersionId,
-		FlavorCode: req.FlavorCode,
+		FlavorCode: req.Spec.SourceSpecId,
 		Description: req.Description,
 		WorkServerNumber: req.WorkServerNumber,
 		FlavorName: req.FlavorName,
@@ -533,6 +536,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
 		VersionCount: VersionListCount + 1,
 		CreatedUnix: createTime,
 		UpdatedUnix: createTime,
+		Spec: req.Spec,
 	})
 	if createErr != nil {
 		log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
@@ -716,7 +720,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
 			PoolID: req.PoolID,
 			CreateVersion: true,
 			Flavor: models.Flavor{
-				Code: req.FlavorCode,
+				Code: req.Spec.SourceSpecId,
 			},
 			Parameter: req.Parameters,
 		},
@@ -753,7 +757,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
 		BootFile: req.BootFile,
 		DataUrl: req.DataUrl,
 		LogUrl: req.LogUrl,
-		FlavorCode: req.FlavorCode,
+		FlavorCode: req.Spec.SourceSpecId,
 		Description: req.Description,
 		WorkServerNumber: req.WorkServerNumber,
 		FlavorName: req.FlavorName,
@@ -769,6 +773,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
 		ResultUrl: req.ResultUrl,
 		CreatedUnix: createTime,
 		UpdatedUnix: createTime,
+		Spec: req.Spec,
 	})
 
 	if err != nil {
diff --git a/routers/admin/resources.go b/routers/admin/resources.go
index 7d267c19c..808c44f3f 100644
--- a/routers/admin/resources.go
+++ b/routers/admin/resources.go
@@ -8,6 +8,8 @@ import (
 	"code.gitea.io/gitea/routers/response"
 	"code.gitea.io/gitea/services/cloudbrain/resource"
 	"net/http"
+	"strconv"
+	"strings"
 )
 
 const (
@@ -246,3 +248,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) {
 	}
 	ctx.JSON(http.StatusOK, response.Success())
 }
+
+func RefreshHistorySpec(ctx *context.Context) {
+	scope := ctx.Query("scope")
+	list := ctx.Query("list")
+
+	var scopeAll = false
+	if scope == "all" {
+		scopeAll = true
+	}
+	var ids = make([]int64, 0)
+	if list != "" {
+		strs := strings.Split(list, "|")
+		for _, s := range strs {
+			i, err := strconv.ParseInt(s, 10, 64)
+			if err != nil {
+				ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
+				return
+			}
+			ids = append(ids, i)
+		}
+
+	}
+
+	total, success, err := resource.RefreshHistorySpec(scopeAll, ids)
+	if err != nil {
+		log.Error("RefreshHistorySpec error. %v", err)
+		ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
+		return
+	}
+	r := make(map[string]interface{}, 0)
+	r["success"] = success
+	r["total"] = total
+	ctx.JSON(http.StatusOK, response.SuccessWithData(r))
+}
diff --git a/routers/private/internal.go b/routers/private/internal.go
index 4731463b1..3e2eeab31 100755
--- a/routers/private/internal.go
+++ b/routers/private/internal.go
@@ -6,6 +6,7 @@
 package private
 
 import (
+	"code.gitea.io/gitea/routers/admin"
 	"strings"
 
 	"code.gitea.io/gitea/routers/repo"
@@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) {
 		m.Get("/tool/org_stat", OrgStatisticManually)
 		m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit)
 		m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration)
+		m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec)
 	}, CheckInternalToken)
 }
diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go
index 4a1df232a..a297ed133 100755
--- a/routers/repo/cloudbrain.go
+++ b/routers/repo/cloudbrain.go
@@ -122,89 +122,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
 		ctx.Data["QueuesDetail"] = queuesDetail
 	}
 
-	cloudbrain.InitSpecialPool()
-
-	if gpuInfos == nil {
-		json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
-	}
-	ctx.Data["gpu_types"] = gpuInfos.GpuInfo
-
-	if trainGpuInfos == nil {
-		json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos)
-	}
-	ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo
-
-	if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" {
-		json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos)
-	}
-	if inferenceGpuInfos != nil {
-		ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo
-	}
-
-	if benchmarkGpuInfos == nil {
-		json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
-	}
-	ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo
-
-	if benchmarkResourceSpecs == nil {
-		json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs)
-	}
-	ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec
-
-	if cloudbrain.ResourceSpecs == nil {
-		json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
-	}
-	ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec
-
-	if cloudbrain.TrainResourceSpecs == nil {
-		json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
-	}
-	ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec
-
-	if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" {
-		json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
-	}
-	if cloudbrain.InferenceResourceSpecs != nil {
-		ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec
-	}
-
 	prepareCloudbrainOneSpecs(ctx)
-	if cloudbrain.SpecialPools != nil {
-		var debugGpuTypes []*models.GpuInfo
-		var trainGpuTypes []*models.GpuInfo
-
-		for _, pool := range cloudbrain.SpecialPools.Pools {
-			isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID)
-			if isOrgMember {
-				for _, jobType := range pool.JobType {
-					if jobType == string(models.JobTypeDebug) {
-						debugGpuTypes = append(debugGpuTypes, pool.Pool...)
-						if pool.ResourceSpec != nil {
-							ctx.Data["resource_specs"] = pool.ResourceSpec
-						}
-					} else if jobType == string(models.JobTypeTrain) {
-						trainGpuTypes = append(trainGpuTypes, pool.Pool...)
-						if pool.ResourceSpec != nil {
-							ctx.Data["train_resource_specs"] = pool.ResourceSpec
-						}
-					}
-				}
-				break
-			}
-
-		}
-
-		if len(debugGpuTypes) > 0 {
-			ctx.Data["gpu_types"] = debugGpuTypes
-		}
-
-		if len(trainGpuTypes) > 0 {
-			ctx.Data["train_gpu_types"] = trainGpuTypes
-		}
-
-	}
-
 	ctx.Data["params"] = ""
 	ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -229,8 +148,6 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
 		AiCenterCode: models.AICenterOfCloudBrainOne,
 	})
 	ctx.Data["debug_specs"] = debugSpecs
-	b, _ := json.Marshal(debugSpecs)
-	log.Info("%s", string(b))
 
 	trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
 		JobType: models.JobTypeTrain,
@@ -247,6 +164,14 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
 		AiCenterCode: models.AICenterOfCloudBrainOne,
 	})
 	ctx.Data["inference_specs"] = inferenceSpecs
+
+	benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
+		JobType: models.JobTypeBenchmark,
+		ComputeResource: models.GPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainOne,
+	})
+	ctx.Data["benchmark_specs"] = benchmarkSpecs
 }
 
 func CloudBrainNew(ctx *context.Context) {
@@ -348,18 +273,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
 		command = commandTrain
 	}
 
-	errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)
-
-	if errStr != "" {
-		cloudBrainNewDataPrepare(ctx)
-		ctx.RenderWithErr(errStr, tpl, &form)
-		return
-	}
-
 	if branchName == "" {
 		branchName = cloudbrain.DefaultBranchName
 	}
-	errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
+	errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
 	if errStr != "" {
 		cloudBrainNewDataPrepare(ctx)
 		ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form)
@@ -375,7 +292,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
 		AiCenterCode: models.AICenterOfCloudBrainOne})
 	if err != nil || spec == nil {
 		cloudBrainNewDataPrepare(ctx)
-		ctx.RenderWithErr("Illegal resource specification", tpl, &form)
+		ctx.RenderWithErr("Resource specification not available", tpl, &form)
 		return
 	}
 
@@ -534,7 +451,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
 		AiCenterCode: models.AICenterOfCloudBrainOne})
 	if err != nil || spec == nil {
 		cloudBrainNewDataPrepare(ctx)
-		ctx.RenderWithErr("Illegal resource specification", tpl, &form)
+		ctx.RenderWithErr("Resource specification not available", tpl, &form)
 		return
 	}
 	req := cloudbrain.GenerateCloudBrainTaskReq{
@@ -2447,7 +2364,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
 		AiCenterCode: models.AICenterOfCloudBrainOne})
 	if err != nil || spec == nil {
 		cloudBrainNewDataPrepare(ctx)
-		ctx.RenderWithErr("Illegal resource specification", tplCloudBrainBenchmarkNew, &form)
+		ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form)
 		return
 	}
 
@@ -2587,7 +2504,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
 		AiCenterCode: models.AICenterOfCloudBrainOne})
 	if err != nil || spec == nil {
 		cloudBrainNewDataPrepare(ctx)
-		ctx.RenderWithErr("Illegal resource specification", tpl, &form)
+		ctx.RenderWithErr("Resource specification not available", tpl, &form)
 		return
 	}
 	req := cloudbrain.GenerateCloudBrainTaskReq{
diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go
index 948a0e751..cbe2589b2 100755
--- a/routers/repo/modelarts.go
+++ b/routers/repo/modelarts.go
@@ -2,6 +2,7 @@ package repo
 
 import (
 	"archive/zip"
+	"code.gitea.io/gitea/services/cloudbrain/resource"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -141,11 +142,7 @@ func notebookNewDataPrepare(ctx *context.Context) error {
 	}
 	ctx.Data["images"] = modelarts.ImageInfos.ImageInfo
 
-	if modelarts.FlavorInfos == nil {
-		json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
-	}
-	ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
-	setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug))
+	prepareCloudbrainTwoDebugSpecs(ctx)
 
 	ctx.Data["datasetType"] = models.TypeCloudBrainTwo
 
@@ -155,6 +152,16 @@ func notebookNewDataPrepare(ctx *context.Context) error {
 	return nil
 }
 
+func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) {
+	noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
+		JobType: models.JobTypeDebug,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo,
+	})
+	ctx.Data["Specs"] = noteBookSpecs
+}
+
 func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
 	ctx.Data["PageIsNotebook"] = true
 	jobName := form.JobName
@@ -205,7 +212,6 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
 	jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
 	uuid := form.Attachment
 	description := form.Description
-	flavor := form.Flavor
 	imageId := form.ImageId
 	repo := ctx.Repo.Repository
 
@@ -241,14 +247,17 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
 		}
 	}
 
-	errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug))
-	if errStr != "" {
+	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
+		JobType: models.JobTypeDebug,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo})
+	if err != nil || spec == nil {
 		notebookNewDataPrepare(ctx)
-		ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form)
+		ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form)
 		return
 	}
-
-	err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId)
+	err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec)
 	if err != nil {
 		log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
 		notebookNewDataPrepare(ctx)
@@ -728,14 +737,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
 	}
 	ctx.Data["engine_versions"] = versionInfos.Version
 
-	var flavorInfos modelarts.Flavor
-	if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
-		ctx.ServerError("json.Unmarshal failed:", err)
-		return err
-	}
-	ctx.Data["flavor_infos"] = flavorInfos.Info
-
-	setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
+	prepareCloudbrainTwoTrainSpecs(ctx)
 
 	ctx.Data["params"] = ""
 	ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -753,6 +755,16 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
 	return nil
 }
 
+func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) {
+	noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
+		JobType: models.JobTypeTrain,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo,
+	})
+	ctx.Data["Specs"] = noteBookSpecs
+}
+
 func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
 	modelarts.InitSpecialPool()
 
@@ -835,13 +847,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts
 	}
 	ctx.Data["engine_versions"] = versionInfos.Version
 
-	var flavorInfos modelarts.Flavor
-	if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
-		ctx.ServerError("json.Unmarshal failed:", err)
-		return err
-	}
-	ctx.Data["flavor_infos"] = flavorInfos.Info
-	setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
+	prepareCloudbrainTwoTrainSpecs(ctx)
 
 	configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
 	if err != nil {
@@ -1020,13 +1026,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai
 	}
 	ctx.Data["engine_versions"] = versionInfos.Version
 
-	var flavorInfos modelarts.Flavor
-	if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
-		ctx.ServerError("json.Unmarshal failed:", err)
-		return err
-	}
-	ctx.Data["flavor_infos"] = flavorInfos.Info
-	setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
+	prepareCloudbrainTwoTrainSpecs(ctx)
 
 	var Parameters modelarts.Parameters
 	if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
@@ -1079,7 +1079,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
 	workServerNumber := form.WorkServerNumber
 	engineID := form.EngineID
 	bootFile := strings.TrimSpace(form.BootFile)
-	flavorCode := form.Flavor
 	params := form.Params
 	poolID := form.PoolID
 	//isSaveParam := form.IsSaveParam
@@ -1117,10 +1116,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
 		return
 	}
 
-	errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
-	if errStr != "" {
+	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
+		JobType: models.JobTypeTrain,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo})
+	if err != nil || spec == nil {
 		trainJobErrorNewDataPrepare(ctx, form)
-		ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
+		ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form)
 		return
 	}
 	//Determine whether the task name of the task in the project is duplicated
@@ -1283,7 +1286,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
 		BootFileUrl: codeObsPath + bootFile,
 		BootFile: bootFile,
 		TrainUrl: outputObsPath,
-		FlavorCode: flavorCode,
 		WorkServerNumber: workServerNumber,
 		EngineID: int64(engineID),
 		LogUrl: logObsPath,
@@ -1299,6 +1301,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
 		VersionCount: VersionCount,
 		TotalVersionCount: modelarts.TotalVersionCount,
 		DatasetName: datasetNames,
+		Spec: spec,
 	}
 	userCommand, userImageUrl := getUserCommand(engineID, req)
 	req.UserCommand = userCommand
@@ -1384,7 +1387,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
 	workServerNumber := form.WorkServerNumber
 	engineID := form.EngineID
 	bootFile := strings.TrimSpace(form.BootFile)
-	flavorCode := form.Flavor
 	params := form.Params
 	poolID := form.PoolID
 	//isSaveParam := form.IsSaveParam
@@ -1414,10 +1416,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
 		return
 	}
 
-	errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
-	if errStr != "" {
+	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
+		JobType: models.JobTypeTrain,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo})
+	if err != nil || spec == nil {
 		versionErrorDataPrepare(ctx, form)
-		ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
+		ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form)
 		return
 	}
 
@@ -1571,7 +1577,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
 		BootFileUrl: codeObsPath + bootFile,
 		BootFile: bootFile,
 		TrainUrl: outputObsPath,
-		FlavorCode: flavorCode,
 		WorkServerNumber: workServerNumber,
 		IsLatestVersion: isLatestVersion,
 		EngineID: int64(engineID),
@@ -1588,6 +1593,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
 		PreVersionName: PreVersionName,
 		TotalVersionCount: latestTask.TotalVersionCount + 1,
 		DatasetName: datasetNames,
+		Spec: spec,
 	}
 	userCommand, userImageUrl := getUserCommand(engineID, req)
 	req.UserCommand = userCommand
@@ -2016,7 +2022,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
 	workServerNumber := form.WorkServerNumber
 	engineID := form.EngineID
 	bootFile := strings.TrimSpace(form.BootFile)
-	flavorCode := form.Flavor
 	params := form.Params
 	poolID := form.PoolID
 	repo := ctx.Repo.Repository
@@ -2078,13 +2083,16 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
 		}
 	}
 
-	errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference))
-	if errStr != "" {
+	spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
+		JobType: models.JobTypeInference,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo})
+	if err != nil || spec == nil {
 		inferenceJobErrorNewDataPrepare(ctx, form)
-		ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
+		ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form)
 		return
 	}
-
 	//todo: del the codeLocalPath
 	_, err = ioutil.ReadDir(codeLocalPath)
 	if err == nil {
@@ -2170,7 +2178,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
 		BootFileUrl: codeObsPath + bootFile,
 		BootFile: bootFile,
 		TrainUrl: trainUrl,
-		FlavorCode: flavorCode,
 		WorkServerNumber: workServerNumber,
 		EngineID: int64(engineID),
 		LogUrl: logObsPath,
@@ -2369,14 +2376,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
 	}
 	ctx.Data["engine_versions"] = versionInfos.Version
 
-	var flavorInfos modelarts.Flavor
-	if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
-		ctx.ServerError("json.Unmarshal failed:", err)
-		return err
-	}
-
-	ctx.Data["flavor_infos"] = flavorInfos.Info
-	setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference))
+	prepareCloudbrainTwoInferenceSpecs(ctx)
 
 	ctx.Data["params"] = ""
 	ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -2407,6 +2407,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
 	return nil
 }
 
+func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) {
+	noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
+		JobType: models.JobTypeInference,
+		ComputeResource: models.NPU,
+		Cluster: models.OpenICluster,
+		AiCenterCode: models.AICenterOfCloudBrainTwo,
+	})
+	ctx.Data["Specs"] = noteBookSpecs
+}
+
 func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
 	ctx.Data["PageIsCloudBrain"] = true
diff --git a/services/cloudbrain/resource/resource_specification.go b/services/cloudbrain/resource/resource_specification.go
index e27d88ef1..bab0d3096 100644
--- a/services/cloudbrain/resource/resource_specification.go
+++ b/services/cloudbrain/resource/resource_specification.go
@@ -2,12 +2,17 @@ package resource
 
 import (
 	"code.gitea.io/gitea/models"
+	"code.gitea.io/gitea/modules/cloudbrain"
 	"code.gitea.io/gitea/modules/grampus"
 	"code.gitea.io/gitea/modules/log"
+	"code.gitea.io/gitea/modules/setting"
 	"code.gitea.io/gitea/routers/response"
 	"code.gitea.io/gitea/services/admin/operate_log"
+	"encoding/json"
+	"errors"
 	"fmt"
 	"strings"
+	"time"
 )
 
 func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error {
@@ -186,7 +191,7 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod
 }
 
 func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) {
-	r, err := models.FindAvailableSpecs(opts)
+	r, err := models.FindSpecs(opts)
 	if err != nil {
 		log.Error("FindAvailableSpecs error.%v", err)
 		return nil, err
@@ -270,3 +275,260 @@ func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) {
 	}
 	return c.ConvertToSpecification(), nil
 }
+
+func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) {
+	var success int64
+	var total int64
+
+	if !scopeAll {
+		if ids == nil || len(ids) == 0 {
+			return 0, 0, nil
+		}
+		total = int64(len(ids))
+		tasks, err := models.GetCloudbrainWithDeletedByIDs(ids)
+		if err != nil {
+			return total, 0, err
+		}
+		for _, task := range tasks {
+			err = RefreshOneHistorySpec(task)
+			if err != nil {
+				log.Error("RefreshOneHistorySpec error.%v", err)
+				continue
+			}
+			success++
+		}
+
+	} else {
+		page := 1
+		pageSize := 100
+		n, err := models.CountNoSpecHistoricTask()
+		if err != nil {
+			log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err)
+			return 0, 0, err
+		}
+		total = n
+		for i := 0; i < 1000; i++ {
+			list, err := models.FindNoSpecHistoricTask(page, pageSize)
+			if err != nil {
+				log.Error("FindNoSpecHistoricTask error.page=%d pageSize=%d e=%v", page, pageSize, err)
+				return total, success, err
+			}
+			if len(list) == 0 {
+				log.Info("RefreshHistorySpec. list is empty")
+				break
+			}
+			for _, task := range list {
+				time.Sleep(1 * time.Second)
+				err = RefreshOneHistorySpec(task)
+				if err != nil {
+					log.Error("RefreshOneHistorySpec error.%v", err)
+					continue
+				}
+				success++
+			}
+			if len(list) < pageSize {
+				log.Info("RefreshHistorySpec. list < pageSize")
+				break
+			}
+		}
+	}
+	return total, success, nil
+
+}
+
+func RefreshOneHistorySpec(task *models.Cloudbrain) error {
+	var spec *models.Specification
+	var err error
+	switch task.Type {
+	case models.TypeCloudBrainOne:
+		spec, err = getCloudbrainOneSpec(task)
+	}
+	if err != nil {
+		log.Error("find spec error,task.ID=%d err=%v", task.ID, err)
+		return err
+	}
+	if spec == nil {
+		log.Error("find spec failed,task.ID=%d", task.ID)
+		return errors.New("find spec failed")
+	}
+	return InsertCloudbrainSpec(task.ID, spec)
+}
+
+func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) {
+	//find from remote
+	result, err := cloudbrain.GetJob(task.JobID)
+	if err != nil {
+		log.Error("getCloudbrainOneSpec error. %v", err)
+		return nil, err
+	}
+	if result != nil {
+		jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
+		memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory)
+		if task.ComputeResource == "CPU/GPU" {
+			task.ComputeResource = models.GPU
+		}
+		var shmMB float32
+		if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 {
+			shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024
+		}
+
+		opt := models.FindSpecsOptions{
+			ComputeResource: task.ComputeResource,
+			Cluster: models.OpenICluster,
+			AiCenterCode: models.AICenterOfCloudBrainOne,
+			QueueCode: task.GpuQueue,
+			AccCardsNum: jobRes.Resource.NvidiaComGpu,
+			UseAccCardsNum: true,
+			CpuCores: jobRes.Resource.CPU,
+			UseCpuCores: true,
+			MemGiB: memSize,
+			UseMemGiB: memSize > 0,
+			ShareMemGiB: shmMB,
+			UseShareMemGiB: shmMB > 0,
+			RequestAll: true,
+		}
+		specs, err := models.FindSpecs(opt)
+		if err != nil {
+			log.Error("getCloudbrainOneSpec from remote error,%v", err)
+			return nil, err
+		}
+		if len(specs) == 1 {
+			return specs[0], nil
+		}
+		if len(specs) == 0 {
+			s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
+			if err != nil {
+				log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
+				return nil, nil
+			}
+			return s, nil
+		}
+		if len(specs) > 1 {
+			log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
+			return nil, nil
+		}
+
+	} else {
+		//find from config
+		var specConfig *models.ResourceSpec
+		hasSpec := false
+		if task.JobType == string(models.JobTypeTrain) {
+			if cloudbrain.TrainResourceSpecs == nil {
+				json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
+			}
+			for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec {
+				if tmp.Id == task.ResourceSpecId {
+					hasSpec = true
+					specConfig = tmp
+					break
+				}
+			}
+		} else if task.JobType == string(models.JobTypeInference) {
+			if cloudbrain.InferenceResourceSpecs == nil {
+				json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
+			}
+			for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
+				if tmp.Id == task.ResourceSpecId {
+					hasSpec = true
+					specConfig = tmp
+					break
+				}
+			}
+		} else {
+			if cloudbrain.ResourceSpecs == nil {
+				json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
+			}
+			for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec {
+				if tmp.Id == task.ResourceSpecId {
+					hasSpec = true
+					specConfig = tmp
+					break
+
+				}
+			}
+		}
+		if !hasSpec && cloudbrain.SpecialPools != nil {
+
+			for _, specialPool := range cloudbrain.SpecialPools.Pools {
+
+				if specialPool.ResourceSpec != nil {
+
+					for _, spec := range specialPool.ResourceSpec {
+						if task.ResourceSpecId == spec.Id {
+							hasSpec = true
+							specConfig = spec
+							break
+						}
+					}
+				}
+			}
+		}
+		if specConfig == nil {
+			log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId)
+			return nil, nil
+		}
+		opt := models.FindSpecsOptions{
+			JobType: models.JobType(task.JobType),
+			ComputeResource: task.ComputeResource,
+			Cluster: models.OpenICluster,
+			AiCenterCode: models.AICenterOfCloudBrainOne,
+			QueueCode: task.GpuQueue,
+			AccCardsNum: specConfig.GpuNum,
+			UseAccCardsNum: true,
+			CpuCores: specConfig.GpuNum,
+			UseCpuCores: true,
+			MemGiB: float32(specConfig.MemMiB) / 1024,
+			UseMemGiB: true,
+			ShareMemGiB: float32(specConfig.ShareMemMiB) / 1024,
+			UseShareMemGiB: true,
+			RequestAll: true,
+		}
+		specs, err := models.FindSpecs(opt)
+		if err != nil {
+			log.Error("getCloudbrainOneSpec from config error,%v", err)
+			return nil, err
+		}
+		if len(specs) > 1 {
+			log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
+			return nil, nil
+		}
+		if len(specs) == 0 {
+			s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
+			if err != nil {
+				log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
+				return nil, nil
+			}
+			return s, nil
+		}
+		return specs[0], nil
+	}
+	return nil, nil
+
+}
+
+func RefreshCloudbrainTwoSpec(task *models.Cloudbrain) error {
+	return nil
+}
+
+func RefreshC2NetSpec(task *models.Cloudbrain) error {
+	return nil
+}
+
+func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) {
+	return models.InitQueueAndSpec(models.ResourceQueue{
+		QueueCode: opt.QueueCode,
+		Cluster: opt.Cluster,
+		AiCenterCode: opt.AiCenterCode,
+		AiCenterName: aiCenterName,
+		ComputeResource: opt.ComputeResource,
+		AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode),
+		Remark: remark,
+	}, models.ResourceSpecification{
+		AccCardsNum: opt.AccCardsNum,
+		CpuCores: opt.CpuCores,
+		MemGiB: opt.MemGiB,
+		GPUMemGiB: opt.GPUMemGiB,
+		ShareMemGiB: opt.ShareMemGiB,
+		Status: models.SpecOffShelf,
+	})
+}