| @@ -2313,3 +2313,9 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) { | |||
| In("id", ids). | |||
| Find(&cloudbrains) | |||
| } | |||
| func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) { | |||
| cloudbrains := make([]*Cloudbrain, 0) | |||
| return cloudbrains, x. | |||
| In("id", ids).Unscoped().Find(&cloudbrains) | |||
| } | |||
| @@ -83,3 +83,26 @@ func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) { | |||
| } | |||
| return r, nil | |||
| } | |||
| func FindNoSpecHistoricTask(page, pageSize int) ([]*Cloudbrain, error) { | |||
| r := make([]*Cloudbrain, 0) | |||
| err := x.Unscoped(). | |||
| Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)"). | |||
| Limit(pageSize, (page-1)*pageSize). | |||
| OrderBy("cloudbrain.id"). | |||
| Find(&r) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return r, nil | |||
| } | |||
| func CountNoSpecHistoricTask() (int64, error) { | |||
| n, err := x.Unscoped(). | |||
| Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)"). | |||
| Count(&Cloudbrain{}) | |||
| if err != nil { | |||
| return 0, err | |||
| } | |||
| return n, nil | |||
| } | |||
| @@ -147,6 +147,21 @@ type FindSpecsOptions struct { | |||
| Cluster string | |||
| AiCenterCode string | |||
| SpecId int64 | |||
| QueueCode string | |||
| SourceSpecId string | |||
| AccCardsNum int | |||
| UseAccCardsNum bool | |||
| AccCardType string | |||
| CpuCores int | |||
| UseCpuCores bool | |||
| MemGiB float32 | |||
| UseMemGiB bool | |||
| GPUMemGiB float32 | |||
| UseGPUMemGiB bool | |||
| ShareMemGiB float32 | |||
| UseShareMemGiB bool | |||
| //if true,find specs no matter used or not used in scene. if false,only find specs used in scene | |||
| RequestAll bool | |||
| } | |||
| type Specification struct { | |||
| @@ -316,9 +331,10 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS | |||
| return sess.Commit() | |||
| } | |||
| func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) { | |||
| //FindSpecs | |||
| func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) { | |||
| var cond = builder.NewCond() | |||
| if opts.JobType != "" { | |||
| if !opts.RequestAll && opts.JobType != "" { | |||
| cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType}) | |||
| } | |||
| if opts.ComputeResource != "" { | |||
| @@ -333,17 +349,108 @@ func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) { | |||
| if opts.SpecId > 0 { | |||
| cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId}) | |||
| } | |||
| cond = cond.And(builder.Or(builder.Eq{"resource_scene.delete_time": 0}, builder.IsNull{"resource_scene.delete_time"})) | |||
| if opts.QueueCode != "" { | |||
| cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode}) | |||
| } | |||
| if opts.SourceSpecId != "" { | |||
| cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId}) | |||
| } | |||
| if opts.UseAccCardsNum { | |||
| cond = cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum}) | |||
| } | |||
| if opts.AccCardType != "" { | |||
| cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType}) | |||
| } | |||
| if opts.UseCpuCores { | |||
| cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores}) | |||
| } | |||
| if opts.UseMemGiB { | |||
| cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB}) | |||
| } | |||
| if opts.UseGPUMemGiB { | |||
| cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB}) | |||
| } | |||
| if opts.UseShareMemGiB { | |||
| cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB}) | |||
| } | |||
| r := make([]*Specification, 0) | |||
| err := x.Where(cond). | |||
| Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id"). | |||
| Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id"). | |||
| Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id"). | |||
| OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc"). | |||
| s := x.Where(cond). | |||
| Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id") | |||
| if !opts.RequestAll { | |||
| s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id"). | |||
| Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id") | |||
| } | |||
| err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc"). | |||
| Unscoped().Find(&r) | |||
| if err != nil { | |||
| return nil, err | |||
| } | |||
| return r, nil | |||
| } | |||
| func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) { | |||
| sess := x.NewSession() | |||
| defer sess.Close() | |||
| sess.Begin() | |||
| param := ResourceQueue{ | |||
| QueueCode: queue.QueueCode, | |||
| Cluster: queue.Cluster, | |||
| AiCenterCode: queue.AiCenterCode, | |||
| ComputeResource: queue.ComputeResource, | |||
| AccCardType: queue.AccCardType, | |||
| } | |||
| _, err := sess.Get(¶m) | |||
| if err != nil { | |||
| sess.Rollback() | |||
| return nil, err | |||
| } | |||
| if param.ID == 0 { | |||
| _, err = sess.InsertOne(&queue) | |||
| if err != nil { | |||
| sess.Rollback() | |||
| return nil, err | |||
| } | |||
| } else { | |||
| queue = param | |||
| } | |||
| spec.QueueId = queue.ID | |||
| _, err = sess.InsertOne(&spec) | |||
| if err != nil { | |||
| sess.Rollback() | |||
| return nil, err | |||
| } | |||
| sess.Commit() | |||
| return &Specification{ | |||
| ID: spec.ID, | |||
| SourceSpecId: spec.SourceSpecId, | |||
| AccCardsNum: spec.AccCardsNum, | |||
| AccCardType: queue.AccCardType, | |||
| CpuCores: spec.CpuCores, | |||
| MemGiB: spec.MemGiB, | |||
| GPUMemGiB: spec.GPUMemGiB, | |||
| ShareMemGiB: spec.ShareMemGiB, | |||
| ComputeResource: queue.ComputeResource, | |||
| UnitPrice: spec.UnitPrice, | |||
| QueueId: queue.ID, | |||
| QueueCode: queue.QueueCode, | |||
| Cluster: queue.Cluster, | |||
| AiCenterCode: queue.AiCenterCode, | |||
| AiCenterName: queue.AiCenterName, | |||
| }, nil | |||
| } | |||
| func GetCloudbrainOneAccCardType(queueCode string) string { | |||
| switch queueCode { | |||
| case "a100": | |||
| return "A100" | |||
| case "openidebug": | |||
| return "T4" | |||
| case "openidgx": | |||
| return "V100" | |||
| } | |||
| return "" | |||
| } | |||
| @@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct { | |||
| Description string `form:"description"` | |||
| Flavor string `form:"flavor" binding:"Required"` | |||
| ImageId string `form:"image_id" binding:"Required"` | |||
| SpecId int64 `form:"spec_id" binding:"Required"` | |||
| } | |||
| func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||
| @@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct { | |||
| VersionName string `form:"version_name" binding:"Required"` | |||
| FlavorName string `form:"flaver_names" binding:"Required"` | |||
| EngineName string `form:"engine_names" binding:"Required"` | |||
| SpecId int64 `form:"spec_id" binding:"Required"` | |||
| } | |||
| type CreateModelArtsInferenceJobForm struct { | |||
| @@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct { | |||
| ModelName string `form:"model_name" binding:"Required"` | |||
| ModelVersion string `form:"model_version" binding:"Required"` | |||
| CkptName string `form:"ckpt_name" binding:"Required"` | |||
| SpecId int64 `form:"spec_id" binding:"Required"` | |||
| } | |||
| func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { | |||
| @@ -84,7 +84,6 @@ type GenerateTrainJobReq struct { | |||
| BootFileUrl string | |||
| DataUrl string | |||
| TrainUrl string | |||
| FlavorCode string | |||
| LogUrl string | |||
| PoolID string | |||
| WorkServerNumber int | |||
| @@ -96,6 +95,7 @@ type GenerateTrainJobReq struct { | |||
| BranchName string | |||
| PreVersionId int64 | |||
| PreVersionName string | |||
| FlavorCode string | |||
| FlavorName string | |||
| VersionCount int | |||
| EngineName string | |||
| @@ -103,6 +103,7 @@ type GenerateTrainJobReq struct { | |||
| UserImageUrl string | |||
| UserCommand string | |||
| DatasetName string | |||
| Spec *models.Specification | |||
| } | |||
| type GenerateInferenceJobReq struct { | |||
| @@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct { | |||
| BootFileUrl string | |||
| DataUrl string | |||
| TrainUrl string | |||
| FlavorCode string | |||
| LogUrl string | |||
| PoolID string | |||
| WorkServerNumber int | |||
| @@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct { | |||
| ModelVersion string | |||
| CkptName string | |||
| ResultUrl string | |||
| Spec *models.Specification | |||
| } | |||
| type VersionInfo struct { | |||
| @@ -256,7 +257,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin | |||
| return nil | |||
| } | |||
| func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error { | |||
| func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error { | |||
| if poolInfos == nil { | |||
| json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) | |||
| } | |||
| @@ -270,7 +271,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| jobResult, err := createNotebook2(models.CreateNotebook2Params{ | |||
| JobName: jobName, | |||
| Description: description, | |||
| Flavor: flavor, | |||
| Flavor: spec.SourceSpecId, | |||
| Duration: autoStopDurationMs, | |||
| ImageID: imageId, | |||
| PoolID: poolInfos.PoolInfo[0].PoolId, | |||
| @@ -292,7 +293,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| RepoID: ctx.Repo.Repository.ID, | |||
| JobID: jobResult.ID, | |||
| JobName: jobName, | |||
| FlavorCode: flavor, | |||
| FlavorCode: spec.SourceSpecId, | |||
| DisplayJobName: displayJobName, | |||
| JobType: string(models.JobTypeDebug), | |||
| Type: models.TypeCloudBrainTwo, | |||
| @@ -302,6 +303,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc | |||
| Description: description, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| Spec: spec, | |||
| }) | |||
| if err != nil { | |||
| @@ -335,7 +337,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| Code: req.Spec.SourceSpecId, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| UserImageUrl: req.UserImageUrl, | |||
| @@ -357,7 +359,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| Code: req.Spec.SourceSpecId, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| @@ -391,7 +393,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| BootFile: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| LogUrl: req.LogUrl, | |||
| FlavorCode: req.FlavorCode, | |||
| FlavorCode: req.Spec.SourceSpecId, | |||
| Description: req.Description, | |||
| WorkServerNumber: req.WorkServerNumber, | |||
| FlavorName: req.FlavorName, | |||
| @@ -400,6 +402,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error | |||
| TotalVersionCount: req.TotalVersionCount, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| Spec: req.Spec, | |||
| }) | |||
| if createErr != nil { | |||
| @@ -451,7 +454,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| Code: req.Spec.SourceSpecId, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| @@ -472,7 +475,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| LogUrl: req.LogUrl, | |||
| PoolID: req.PoolID, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| Code: req.Spec.SourceSpecId, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| PreVersionId: req.PreVersionId, | |||
| @@ -524,7 +527,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| DataUrl: req.DataUrl, | |||
| LogUrl: req.LogUrl, | |||
| PreVersionId: req.PreVersionId, | |||
| FlavorCode: req.FlavorCode, | |||
| FlavorCode: req.Spec.SourceSpecId, | |||
| Description: req.Description, | |||
| WorkServerNumber: req.WorkServerNumber, | |||
| FlavorName: req.FlavorName, | |||
| @@ -533,6 +536,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job | |||
| VersionCount: VersionListCount + 1, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| Spec: req.Spec, | |||
| }) | |||
| if createErr != nil { | |||
| log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) | |||
| @@ -716,7 +720,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e | |||
| PoolID: req.PoolID, | |||
| CreateVersion: true, | |||
| Flavor: models.Flavor{ | |||
| Code: req.FlavorCode, | |||
| Code: req.Spec.SourceSpecId, | |||
| }, | |||
| Parameter: req.Parameters, | |||
| }, | |||
| @@ -753,7 +757,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e | |||
| BootFile: req.BootFile, | |||
| DataUrl: req.DataUrl, | |||
| LogUrl: req.LogUrl, | |||
| FlavorCode: req.FlavorCode, | |||
| FlavorCode: req.Spec.SourceSpecId, | |||
| Description: req.Description, | |||
| WorkServerNumber: req.WorkServerNumber, | |||
| FlavorName: req.FlavorName, | |||
| @@ -769,6 +773,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e | |||
| ResultUrl: req.ResultUrl, | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| Spec: req.Spec, | |||
| }) | |||
| if err != nil { | |||
| @@ -8,6 +8,8 @@ import ( | |||
| "code.gitea.io/gitea/routers/response" | |||
| "code.gitea.io/gitea/services/cloudbrain/resource" | |||
| "net/http" | |||
| "strconv" | |||
| "strings" | |||
| ) | |||
| const ( | |||
| @@ -246,3 +248,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) { | |||
| } | |||
| ctx.JSON(http.StatusOK, response.Success()) | |||
| } | |||
| func RefreshHistorySpec(ctx *context.Context) { | |||
| scope := ctx.Query("scope") | |||
| list := ctx.Query("list") | |||
| var scopeAll = false | |||
| if scope == "all" { | |||
| scopeAll = true | |||
| } | |||
| var ids = make([]int64, 0) | |||
| if list != "" { | |||
| strs := strings.Split(list, "|") | |||
| for _, s := range strs { | |||
| i, err := strconv.ParseInt(s, 10, 64) | |||
| if err != nil { | |||
| ctx.JSON(http.StatusOK, response.ServerError(err.Error())) | |||
| return | |||
| } | |||
| ids = append(ids, i) | |||
| } | |||
| } | |||
| total, success, err := resource.RefreshHistorySpec(scopeAll, ids) | |||
| if err != nil { | |||
| log.Error("RefreshHistorySpec error. %v", err) | |||
| ctx.JSON(http.StatusOK, response.ServerError(err.Error())) | |||
| return | |||
| } | |||
| r := make(map[string]interface{}, 0) | |||
| r["success"] = success | |||
| r["total"] = total | |||
| ctx.JSON(http.StatusOK, response.SuccessWithData(r)) | |||
| } | |||
| @@ -6,6 +6,7 @@ | |||
| package private | |||
| import ( | |||
| "code.gitea.io/gitea/routers/admin" | |||
| "strings" | |||
| "code.gitea.io/gitea/routers/repo" | |||
| @@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) { | |||
| m.Get("/tool/org_stat", OrgStatisticManually) | |||
| m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit) | |||
| m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration) | |||
| m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec) | |||
| }, CheckInternalToken) | |||
| } | |||
| @@ -122,89 +122,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||
| ctx.Data["QueuesDetail"] = queuesDetail | |||
| } | |||
| cloudbrain.InitSpecialPool() | |||
| if gpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) | |||
| } | |||
| ctx.Data["gpu_types"] = gpuInfos.GpuInfo | |||
| if trainGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) | |||
| } | |||
| ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo | |||
| if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" { | |||
| json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos) | |||
| } | |||
| if inferenceGpuInfos != nil { | |||
| ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo | |||
| } | |||
| if benchmarkGpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) | |||
| } | |||
| ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo | |||
| if benchmarkResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs) | |||
| } | |||
| ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec | |||
| if cloudbrain.ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | |||
| } | |||
| ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec | |||
| if cloudbrain.TrainResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) | |||
| } | |||
| ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | |||
| if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" { | |||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
| } | |||
| if cloudbrain.InferenceResourceSpecs != nil { | |||
| ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec | |||
| } | |||
| prepareCloudbrainOneSpecs(ctx) | |||
| if cloudbrain.SpecialPools != nil { | |||
| var debugGpuTypes []*models.GpuInfo | |||
| var trainGpuTypes []*models.GpuInfo | |||
| for _, pool := range cloudbrain.SpecialPools.Pools { | |||
| isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID) | |||
| if isOrgMember { | |||
| for _, jobType := range pool.JobType { | |||
| if jobType == string(models.JobTypeDebug) { | |||
| debugGpuTypes = append(debugGpuTypes, pool.Pool...) | |||
| if pool.ResourceSpec != nil { | |||
| ctx.Data["resource_specs"] = pool.ResourceSpec | |||
| } | |||
| } else if jobType == string(models.JobTypeTrain) { | |||
| trainGpuTypes = append(trainGpuTypes, pool.Pool...) | |||
| if pool.ResourceSpec != nil { | |||
| ctx.Data["train_resource_specs"] = pool.ResourceSpec | |||
| } | |||
| } | |||
| } | |||
| break | |||
| } | |||
| } | |||
| if len(debugGpuTypes) > 0 { | |||
| ctx.Data["gpu_types"] = debugGpuTypes | |||
| } | |||
| if len(trainGpuTypes) > 0 { | |||
| ctx.Data["train_gpu_types"] = trainGpuTypes | |||
| } | |||
| } | |||
| ctx.Data["params"] = "" | |||
| ctx.Data["branchName"] = ctx.Repo.BranchName | |||
| @@ -229,8 +148,6 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) { | |||
| AiCenterCode: models.AICenterOfCloudBrainOne, | |||
| }) | |||
| ctx.Data["debug_specs"] = debugSpecs | |||
| b, _ := json.Marshal(debugSpecs) | |||
| log.Info("%s", string(b)) | |||
| trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeTrain, | |||
| @@ -247,6 +164,14 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) { | |||
| AiCenterCode: models.AICenterOfCloudBrainOne, | |||
| }) | |||
| ctx.Data["inference_specs"] = inferenceSpecs | |||
| benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeBenchmark, | |||
| ComputeResource: models.GPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainOne, | |||
| }) | |||
| ctx.Data["benchmark_specs"] = benchmarkSpecs | |||
| } | |||
| func CloudBrainNew(ctx *context.Context) { | |||
| @@ -348,18 +273,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||
| command = commandTrain | |||
| } | |||
| errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) | |||
| if errStr != "" { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(errStr, tpl, &form) | |||
| return | |||
| } | |||
| if branchName == "" { | |||
| branchName = cloudbrain.DefaultBranchName | |||
| } | |||
| errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) | |||
| errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath) | |||
| if errStr != "" { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form) | |||
| @@ -375,7 +292,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||
| if err != nil || spec == nil { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Illegal resource specification", tpl, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tpl, &form) | |||
| return | |||
| } | |||
| @@ -534,7 +451,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra | |||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||
| if err != nil || spec == nil { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Illegal resource specification", tpl, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tpl, &form) | |||
| return | |||
| } | |||
| req := cloudbrain.GenerateCloudBrainTaskReq{ | |||
| @@ -2447,7 +2364,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo | |||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||
| if err != nil || spec == nil { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Illegal resource specification", tplCloudBrainBenchmarkNew, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form) | |||
| return | |||
| } | |||
| @@ -2587,7 +2504,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) | |||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||
| if err != nil || spec == nil { | |||
| cloudBrainNewDataPrepare(ctx) | |||
| ctx.RenderWithErr("Illegal resource specification", tpl, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tpl, &form) | |||
| return | |||
| } | |||
| req := cloudbrain.GenerateCloudBrainTaskReq{ | |||
| @@ -2,6 +2,7 @@ package repo | |||
| import ( | |||
| "archive/zip" | |||
| "code.gitea.io/gitea/services/cloudbrain/resource" | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| @@ -141,11 +142,7 @@ func notebookNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| ctx.Data["images"] = modelarts.ImageInfos.ImageInfo | |||
| if modelarts.FlavorInfos == nil { | |||
| json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos) | |||
| } | |||
| ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo | |||
| setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug)) | |||
| prepareCloudbrainTwoDebugSpecs(ctx) | |||
| ctx.Data["datasetType"] = models.TypeCloudBrainTwo | |||
| @@ -155,6 +152,16 @@ func notebookNewDataPrepare(ctx *context.Context) error { | |||
| return nil | |||
| } | |||
| func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) { | |||
| noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeDebug, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo, | |||
| }) | |||
| ctx.Data["Specs"] = noteBookSpecs | |||
| } | |||
| func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { | |||
| ctx.Data["PageIsNotebook"] = true | |||
| jobName := form.JobName | |||
| @@ -205,7 +212,6 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm | |||
| jobName := util.ConvertDisplayJobNameToJobName(displayJobName) | |||
| uuid := form.Attachment | |||
| description := form.Description | |||
| flavor := form.Flavor | |||
| imageId := form.ImageId | |||
| repo := ctx.Repo.Repository | |||
| @@ -241,14 +247,17 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm | |||
| } | |||
| } | |||
| errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug)) | |||
| if errStr != "" { | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeDebug, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo}) | |||
| if err != nil || spec == nil { | |||
| notebookNewDataPrepare(ctx) | |||
| ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form) | |||
| return | |||
| } | |||
| err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId) | |||
| err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec) | |||
| if err != nil { | |||
| log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"]) | |||
| notebookNewDataPrepare(ctx) | |||
| @@ -728,14 +737,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| ctx.Data["engine_versions"] = versionInfos.Version | |||
| var flavorInfos modelarts.Flavor | |||
| if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { | |||
| ctx.ServerError("json.Unmarshal failed:", err) | |||
| return err | |||
| } | |||
| ctx.Data["flavor_infos"] = flavorInfos.Info | |||
| setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) | |||
| prepareCloudbrainTwoTrainSpecs(ctx) | |||
| ctx.Data["params"] = "" | |||
| ctx.Data["branchName"] = ctx.Repo.BranchName | |||
| @@ -753,6 +755,16 @@ func trainJobNewDataPrepare(ctx *context.Context) error { | |||
| return nil | |||
| } | |||
| func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) { | |||
| noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeTrain, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo, | |||
| }) | |||
| ctx.Data["Specs"] = noteBookSpecs | |||
| } | |||
| func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { | |||
| modelarts.InitSpecialPool() | |||
| @@ -835,13 +847,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts | |||
| } | |||
| ctx.Data["engine_versions"] = versionInfos.Version | |||
| var flavorInfos modelarts.Flavor | |||
| if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { | |||
| ctx.ServerError("json.Unmarshal failed:", err) | |||
| return err | |||
| } | |||
| ctx.Data["flavor_infos"] = flavorInfos.Info | |||
| setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) | |||
| prepareCloudbrainTwoTrainSpecs(ctx) | |||
| configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) | |||
| if err != nil { | |||
| @@ -1020,13 +1026,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai | |||
| } | |||
| ctx.Data["engine_versions"] = versionInfos.Version | |||
| var flavorInfos modelarts.Flavor | |||
| if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { | |||
| ctx.ServerError("json.Unmarshal failed:", err) | |||
| return err | |||
| } | |||
| ctx.Data["flavor_infos"] = flavorInfos.Info | |||
| setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain)) | |||
| prepareCloudbrainTwoTrainSpecs(ctx) | |||
| var Parameters modelarts.Parameters | |||
| if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { | |||
| @@ -1079,7 +1079,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| workServerNumber := form.WorkServerNumber | |||
| engineID := form.EngineID | |||
| bootFile := strings.TrimSpace(form.BootFile) | |||
| flavorCode := form.Flavor | |||
| params := form.Params | |||
| poolID := form.PoolID | |||
| //isSaveParam := form.IsSaveParam | |||
| @@ -1117,10 +1116,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| return | |||
| } | |||
| errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) | |||
| if errStr != "" { | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeTrain, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo}) | |||
| if err != nil || spec == nil { | |||
| trainJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form) | |||
| return | |||
| } | |||
| //Determine whether the task name of the task in the project is duplicated | |||
| @@ -1283,7 +1286,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| BootFile: bootFile, | |||
| TrainUrl: outputObsPath, | |||
| FlavorCode: flavorCode, | |||
| WorkServerNumber: workServerNumber, | |||
| EngineID: int64(engineID), | |||
| LogUrl: logObsPath, | |||
| @@ -1299,6 +1301,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) | |||
| VersionCount: VersionCount, | |||
| TotalVersionCount: modelarts.TotalVersionCount, | |||
| DatasetName: datasetNames, | |||
| Spec: spec, | |||
| } | |||
| userCommand, userImageUrl := getUserCommand(engineID, req) | |||
| req.UserCommand = userCommand | |||
| @@ -1384,7 +1387,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ | |||
| workServerNumber := form.WorkServerNumber | |||
| engineID := form.EngineID | |||
| bootFile := strings.TrimSpace(form.BootFile) | |||
| flavorCode := form.Flavor | |||
| params := form.Params | |||
| poolID := form.PoolID | |||
| //isSaveParam := form.IsSaveParam | |||
| @@ -1414,10 +1416,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ | |||
| return | |||
| } | |||
| errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) | |||
| if errStr != "" { | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeTrain, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo}) | |||
| if err != nil || spec == nil { | |||
| versionErrorDataPrepare(ctx, form) | |||
| ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form) | |||
| return | |||
| } | |||
| @@ -1571,7 +1577,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| BootFile: bootFile, | |||
| TrainUrl: outputObsPath, | |||
| FlavorCode: flavorCode, | |||
| WorkServerNumber: workServerNumber, | |||
| IsLatestVersion: isLatestVersion, | |||
| EngineID: int64(engineID), | |||
| @@ -1588,6 +1593,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ | |||
| PreVersionName: PreVersionName, | |||
| TotalVersionCount: latestTask.TotalVersionCount + 1, | |||
| DatasetName: datasetNames, | |||
| Spec: spec, | |||
| } | |||
| userCommand, userImageUrl := getUserCommand(engineID, req) | |||
| req.UserCommand = userCommand | |||
| @@ -2016,7 +2022,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference | |||
| workServerNumber := form.WorkServerNumber | |||
| engineID := form.EngineID | |||
| bootFile := strings.TrimSpace(form.BootFile) | |||
| flavorCode := form.Flavor | |||
| params := form.Params | |||
| poolID := form.PoolID | |||
| repo := ctx.Repo.Repository | |||
| @@ -2078,13 +2083,16 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference | |||
| } | |||
| } | |||
| errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) | |||
| if errStr != "" { | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeInference, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo}) | |||
| if err != nil || spec == nil { | |||
| inferenceJobErrorNewDataPrepare(ctx, form) | |||
| ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) | |||
| ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form) | |||
| return | |||
| } | |||
| //todo: del the codeLocalPath | |||
| _, err = ioutil.ReadDir(codeLocalPath) | |||
| if err == nil { | |||
| @@ -2170,7 +2178,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference | |||
| BootFileUrl: codeObsPath + bootFile, | |||
| BootFile: bootFile, | |||
| TrainUrl: trainUrl, | |||
| FlavorCode: flavorCode, | |||
| WorkServerNumber: workServerNumber, | |||
| EngineID: int64(engineID), | |||
| LogUrl: logObsPath, | |||
| @@ -2369,14 +2376,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { | |||
| } | |||
| ctx.Data["engine_versions"] = versionInfos.Version | |||
| var flavorInfos modelarts.Flavor | |||
| if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil { | |||
| ctx.ServerError("json.Unmarshal failed:", err) | |||
| return err | |||
| } | |||
| ctx.Data["flavor_infos"] = flavorInfos.Info | |||
| setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference)) | |||
| prepareCloudbrainTwoInferenceSpecs(ctx) | |||
| ctx.Data["params"] = "" | |||
| ctx.Data["branchName"] = ctx.Repo.BranchName | |||
| @@ -2407,6 +2407,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error { | |||
| return nil | |||
| } | |||
| func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) { | |||
| noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobTypeInference, | |||
| ComputeResource: models.NPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainTwo, | |||
| }) | |||
| ctx.Data["Specs"] = noteBookSpecs | |||
| } | |||
| func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error { | |||
| ctx.Data["PageIsCloudBrain"] = true | |||
| @@ -2,12 +2,17 @@ package resource | |||
| import ( | |||
| "code.gitea.io/gitea/models" | |||
| "code.gitea.io/gitea/modules/cloudbrain" | |||
| "code.gitea.io/gitea/modules/grampus" | |||
| "code.gitea.io/gitea/modules/log" | |||
| "code.gitea.io/gitea/modules/setting" | |||
| "code.gitea.io/gitea/routers/response" | |||
| "code.gitea.io/gitea/services/admin/operate_log" | |||
| "encoding/json" | |||
| "errors" | |||
| "fmt" | |||
| "strings" | |||
| "time" | |||
| ) | |||
| func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error { | |||
| @@ -186,7 +191,7 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod | |||
| } | |||
| func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) { | |||
| r, err := models.FindAvailableSpecs(opts) | |||
| r, err := models.FindSpecs(opts) | |||
| if err != nil { | |||
| log.Error("FindAvailableSpecs error.%v", err) | |||
| return nil, err | |||
| @@ -270,3 +275,260 @@ func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) { | |||
| } | |||
| return c.ConvertToSpecification(), nil | |||
| } | |||
| func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) { | |||
| var success int64 | |||
| var total int64 | |||
| if !scopeAll { | |||
| if ids == nil || len(ids) == 0 { | |||
| return 0, 0, nil | |||
| } | |||
| total = int64(len(ids)) | |||
| tasks, err := models.GetCloudbrainWithDeletedByIDs(ids) | |||
| if err != nil { | |||
| return total, 0, err | |||
| } | |||
| for _, task := range tasks { | |||
| err = RefreshOneHistorySpec(task) | |||
| if err != nil { | |||
| log.Error("RefreshOneHistorySpec error.%v", err) | |||
| continue | |||
| } | |||
| success++ | |||
| } | |||
| } else { | |||
| page := 1 | |||
| pageSize := 100 | |||
| n, err := models.CountNoSpecHistoricTask() | |||
| if err != nil { | |||
| log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err) | |||
| return 0, 0, err | |||
| } | |||
| total = n | |||
| for i := 0; i < 1000; i++ { | |||
| list, err := models.FindNoSpecHistoricTask(page, pageSize) | |||
| if err != nil { | |||
| log.Error("FindNoSpecHistoricTask error.page=%d pageSize=%d e=%v", page, pageSize, err) | |||
| return total, success, err | |||
| } | |||
| if len(list) == 0 { | |||
| log.Info("RefreshHistorySpec. list is empty") | |||
| break | |||
| } | |||
| for _, task := range list { | |||
| time.Sleep(1 * time.Second) | |||
| err = RefreshOneHistorySpec(task) | |||
| if err != nil { | |||
| log.Error("RefreshOneHistorySpec error.%v", err) | |||
| continue | |||
| } | |||
| success++ | |||
| } | |||
| if len(list) < pageSize { | |||
| log.Info("RefreshHistorySpec. list < pageSize") | |||
| break | |||
| } | |||
| } | |||
| } | |||
| return total, success, nil | |||
| } | |||
| func RefreshOneHistorySpec(task *models.Cloudbrain) error { | |||
| var spec *models.Specification | |||
| var err error | |||
| switch task.Type { | |||
| case models.TypeCloudBrainOne: | |||
| spec, err = getCloudbrainOneSpec(task) | |||
| } | |||
| if err != nil { | |||
| log.Error("find spec error,task.ID=%d err=%v", task.ID, err) | |||
| return err | |||
| } | |||
| if spec == nil { | |||
| log.Error("find spec failed,task.ID=%d", task.ID) | |||
| return errors.New("find spec failed") | |||
| } | |||
| return InsertCloudbrainSpec(task.ID, spec) | |||
| } | |||
| func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) { | |||
| //find from remote | |||
| result, err := cloudbrain.GetJob(task.JobID) | |||
| if err != nil { | |||
| log.Error("getCloudbrainOneSpec error. %v", err) | |||
| return nil, err | |||
| } | |||
| if result != nil { | |||
| jobRes, _ := models.ConvertToJobResultPayload(result.Payload) | |||
| memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory) | |||
| if task.ComputeResource == "CPU/GPU" { | |||
| task.ComputeResource = models.GPU | |||
| } | |||
| var shmMB float32 | |||
| if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 { | |||
| shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024 | |||
| } | |||
| opt := models.FindSpecsOptions{ | |||
| ComputeResource: task.ComputeResource, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainOne, | |||
| QueueCode: task.GpuQueue, | |||
| AccCardsNum: jobRes.Resource.NvidiaComGpu, | |||
| UseAccCardsNum: true, | |||
| CpuCores: jobRes.Resource.CPU, | |||
| UseCpuCores: true, | |||
| MemGiB: memSize, | |||
| UseMemGiB: memSize > 0, | |||
| ShareMemGiB: shmMB, | |||
| UseShareMemGiB: shmMB > 0, | |||
| RequestAll: true, | |||
| } | |||
| specs, err := models.FindSpecs(opt) | |||
| if err != nil { | |||
| log.Error("getCloudbrainOneSpec from remote error,%v", err) | |||
| return nil, err | |||
| } | |||
| if len(specs) == 1 { | |||
| return specs[0], nil | |||
| } | |||
| if len(specs) == 0 { | |||
| s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") | |||
| if err != nil { | |||
| log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) | |||
| return nil, nil | |||
| } | |||
| return s, nil | |||
| } | |||
| if len(specs) > 1 { | |||
| log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) | |||
| return nil, nil | |||
| } | |||
| } else { | |||
| //find from config | |||
| var specConfig *models.ResourceSpec | |||
| hasSpec := false | |||
| if task.JobType == string(models.JobTypeTrain) { | |||
| if cloudbrain.TrainResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) | |||
| } | |||
| for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec { | |||
| if tmp.Id == task.ResourceSpecId { | |||
| hasSpec = true | |||
| specConfig = tmp | |||
| break | |||
| } | |||
| } | |||
| } else if task.JobType == string(models.JobTypeInference) { | |||
| if cloudbrain.InferenceResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs) | |||
| } | |||
| for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec { | |||
| if tmp.Id == task.ResourceSpecId { | |||
| hasSpec = true | |||
| specConfig = tmp | |||
| break | |||
| } | |||
| } | |||
| } else { | |||
| if cloudbrain.ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) | |||
| } | |||
| for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec { | |||
| if tmp.Id == task.ResourceSpecId { | |||
| hasSpec = true | |||
| specConfig = tmp | |||
| break | |||
| } | |||
| } | |||
| } | |||
| if !hasSpec && cloudbrain.SpecialPools != nil { | |||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||
| if specialPool.ResourceSpec != nil { | |||
| for _, spec := range specialPool.ResourceSpec { | |||
| if task.ResourceSpecId == spec.Id { | |||
| hasSpec = true | |||
| specConfig = spec | |||
| break | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if specConfig == nil { | |||
| log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId) | |||
| return nil, nil | |||
| } | |||
| opt := models.FindSpecsOptions{ | |||
| JobType: models.JobType(task.JobType), | |||
| ComputeResource: task.ComputeResource, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainOne, | |||
| QueueCode: task.GpuQueue, | |||
| AccCardsNum: specConfig.GpuNum, | |||
| UseAccCardsNum: true, | |||
| CpuCores: specConfig.GpuNum, | |||
| UseCpuCores: true, | |||
| MemGiB: float32(specConfig.MemMiB) / 1024, | |||
| UseMemGiB: true, | |||
| ShareMemGiB: float32(specConfig.ShareMemMiB) / 1024, | |||
| UseShareMemGiB: true, | |||
| RequestAll: true, | |||
| } | |||
| specs, err := models.FindSpecs(opt) | |||
| if err != nil { | |||
| log.Error("getCloudbrainOneSpec from config error,%v", err) | |||
| return nil, err | |||
| } | |||
| if len(specs) > 1 { | |||
| log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt) | |||
| return nil, nil | |||
| } | |||
| if len(specs) == 0 { | |||
| s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加") | |||
| if err != nil { | |||
| log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err) | |||
| return nil, nil | |||
| } | |||
| return s, nil | |||
| } | |||
| return specs[0], nil | |||
| } | |||
| return nil, nil | |||
| } | |||
| func RefreshCloudbrainTwoSpec(task *models.Cloudbrain) error { | |||
| return nil | |||
| } | |||
| func RefreshC2NetSpec(task *models.Cloudbrain) error { | |||
| return nil | |||
| } | |||
| func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) { | |||
| return models.InitQueueAndSpec(models.ResourceQueue{ | |||
| QueueCode: opt.QueueCode, | |||
| Cluster: opt.Cluster, | |||
| AiCenterCode: opt.AiCenterCode, | |||
| AiCenterName: aiCenterName, | |||
| ComputeResource: opt.ComputeResource, | |||
| AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode), | |||
| Remark: remark, | |||
| }, models.ResourceSpecification{ | |||
| AccCardsNum: opt.AccCardsNum, | |||
| CpuCores: opt.CpuCores, | |||
| MemGiB: opt.MemGiB, | |||
| GPUMemGiB: opt.GPUMemGiB, | |||
| ShareMemGiB: opt.ShareMemGiB, | |||
| Status: models.SpecOffShelf, | |||
| }) | |||
| } | |||