Browse Source

#2701

update
tags/v1.22.9.1^2^2
chenyifan01 3 years ago
parent
commit
2898c40fcc
10 changed files with 547 additions and 176 deletions
  1. +6
    -0
      models/cloudbrain.go
  2. +23
    -0
      models/cloudbrain_spec.go
  3. +116
    -9
      models/resource_specification.go
  4. +3
    -0
      modules/auth/modelarts.go
  5. +18
    -13
      modules/modelarts/modelarts.go
  6. +36
    -0
      routers/admin/resources.go
  7. +2
    -0
      routers/private/internal.go
  8. +13
    -96
      routers/repo/cloudbrain.go
  9. +67
    -57
      routers/repo/modelarts.go
  10. +263
    -1
      services/cloudbrain/resource/resource_specification.go

+ 6
- 0
models/cloudbrain.go View File

@@ -2313,3 +2313,9 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
In("id", ids). In("id", ids).
Find(&cloudbrains) Find(&cloudbrains)
} }

func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0)
return cloudbrains, x.
In("id", ids).Unscoped().Find(&cloudbrains)
}

+ 23
- 0
models/cloudbrain_spec.go View File

@@ -83,3 +83,26 @@ func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) {
} }
return r, nil return r, nil
} }

func FindNoSpecHistoricTask(page, pageSize int) ([]*Cloudbrain, error) {
r := make([]*Cloudbrain, 0)
err := x.Unscoped().
Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
Limit(pageSize, (page-1)*pageSize).
OrderBy("cloudbrain.id").
Find(&r)
if err != nil {
return nil, err
}
return r, nil
}

func CountNoSpecHistoricTask() (int64, error) {
n, err := x.Unscoped().
Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
Count(&Cloudbrain{})
if err != nil {
return 0, err
}
return n, nil
}

+ 116
- 9
models/resource_specification.go View File

@@ -147,6 +147,21 @@ type FindSpecsOptions struct {
Cluster string Cluster string
AiCenterCode string AiCenterCode string
SpecId int64 SpecId int64
QueueCode string
SourceSpecId string
AccCardsNum int
UseAccCardsNum bool
AccCardType string
CpuCores int
UseCpuCores bool
MemGiB float32
UseMemGiB bool
GPUMemGiB float32
UseGPUMemGiB bool
ShareMemGiB float32
UseShareMemGiB bool
//if true,find specs no matter used or not used in scene. if false,only find specs used in scene
RequestAll bool
} }


type Specification struct { type Specification struct {
@@ -316,9 +331,10 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS
return sess.Commit() return sess.Commit()
} }


func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
//FindSpecs
func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) {
var cond = builder.NewCond() var cond = builder.NewCond()
if opts.JobType != "" {
if !opts.RequestAll && opts.JobType != "" {
cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType}) cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType})
} }
if opts.ComputeResource != "" { if opts.ComputeResource != "" {
@@ -333,17 +349,108 @@ func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
if opts.SpecId > 0 { if opts.SpecId > 0 {
cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId}) cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId})
} }
cond = cond.And(builder.Or(builder.Eq{"resource_scene.delete_time": 0}, builder.IsNull{"resource_scene.delete_time"}))

if opts.QueueCode != "" {
cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode})
}
if opts.SourceSpecId != "" {
cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId})
}
if opts.UseAccCardsNum {
cond = cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum})
}
if opts.AccCardType != "" {
cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType})
}
if opts.UseCpuCores {
cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores})
}
if opts.UseMemGiB {
cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB})
}
if opts.UseGPUMemGiB {
cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB})
}
if opts.UseShareMemGiB {
cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB})
}
r := make([]*Specification, 0) r := make([]*Specification, 0)
err := x.Where(cond).
Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id").
Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id").
OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
s := x.Where(cond).
Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id")

if !opts.RequestAll {
s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id")
}
err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
Unscoped().Find(&r) Unscoped().Find(&r)
if err != nil { if err != nil {
return nil, err return nil, err
} }
return r, nil return r, nil
} }

func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) {
sess := x.NewSession()
defer sess.Close()

sess.Begin()
param := ResourceQueue{
QueueCode: queue.QueueCode,
Cluster: queue.Cluster,
AiCenterCode: queue.AiCenterCode,
ComputeResource: queue.ComputeResource,
AccCardType: queue.AccCardType,
}
_, err := sess.Get(&param)
if err != nil {
sess.Rollback()
return nil, err
}
if param.ID == 0 {
_, err = sess.InsertOne(&queue)
if err != nil {
sess.Rollback()
return nil, err
}
} else {
queue = param
}

spec.QueueId = queue.ID
_, err = sess.InsertOne(&spec)
if err != nil {
sess.Rollback()
return nil, err
}
sess.Commit()
return &Specification{
ID: spec.ID,
SourceSpecId: spec.SourceSpecId,
AccCardsNum: spec.AccCardsNum,
AccCardType: queue.AccCardType,
CpuCores: spec.CpuCores,
MemGiB: spec.MemGiB,
GPUMemGiB: spec.GPUMemGiB,
ShareMemGiB: spec.ShareMemGiB,
ComputeResource: queue.ComputeResource,
UnitPrice: spec.UnitPrice,
QueueId: queue.ID,
QueueCode: queue.QueueCode,
Cluster: queue.Cluster,
AiCenterCode: queue.AiCenterCode,
AiCenterName: queue.AiCenterName,
}, nil
}

func GetCloudbrainOneAccCardType(queueCode string) string {
switch queueCode {
case "a100":
return "A100"
case "openidebug":
return "T4"
case "openidgx":
return "V100"

}
return ""
}

+ 3
- 0
modules/auth/modelarts.go View File

@@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct {
Description string `form:"description"` Description string `form:"description"`
Flavor string `form:"flavor" binding:"Required"` Flavor string `form:"flavor" binding:"Required"`
ImageId string `form:"image_id" binding:"Required"` ImageId string `form:"image_id" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
} }


func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
@@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct {
VersionName string `form:"version_name" binding:"Required"` VersionName string `form:"version_name" binding:"Required"`
FlavorName string `form:"flaver_names" binding:"Required"` FlavorName string `form:"flaver_names" binding:"Required"`
EngineName string `form:"engine_names" binding:"Required"` EngineName string `form:"engine_names" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
} }


type CreateModelArtsInferenceJobForm struct { type CreateModelArtsInferenceJobForm struct {
@@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct {
ModelName string `form:"model_name" binding:"Required"` ModelName string `form:"model_name" binding:"Required"`
ModelVersion string `form:"model_version" binding:"Required"` ModelVersion string `form:"model_version" binding:"Required"`
CkptName string `form:"ckpt_name" binding:"Required"` CkptName string `form:"ckpt_name" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
} }


func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors { func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {


+ 18
- 13
modules/modelarts/modelarts.go View File

@@ -84,7 +84,6 @@ type GenerateTrainJobReq struct {
BootFileUrl string BootFileUrl string
DataUrl string DataUrl string
TrainUrl string TrainUrl string
FlavorCode string
LogUrl string LogUrl string
PoolID string PoolID string
WorkServerNumber int WorkServerNumber int
@@ -96,6 +95,7 @@ type GenerateTrainJobReq struct {
BranchName string BranchName string
PreVersionId int64 PreVersionId int64
PreVersionName string PreVersionName string
FlavorCode string
FlavorName string FlavorName string
VersionCount int VersionCount int
EngineName string EngineName string
@@ -103,6 +103,7 @@ type GenerateTrainJobReq struct {
UserImageUrl string UserImageUrl string
UserCommand string UserCommand string
DatasetName string DatasetName string
Spec *models.Specification
} }


type GenerateInferenceJobReq struct { type GenerateInferenceJobReq struct {
@@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct {
BootFileUrl string BootFileUrl string
DataUrl string DataUrl string
TrainUrl string TrainUrl string
FlavorCode string
LogUrl string LogUrl string
PoolID string PoolID string
WorkServerNumber int WorkServerNumber int
@@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct {
ModelVersion string ModelVersion string
CkptName string CkptName string
ResultUrl string ResultUrl string
Spec *models.Specification
} }


type VersionInfo struct { type VersionInfo struct {
@@ -256,7 +257,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin
return nil return nil
} }


func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
if poolInfos == nil { if poolInfos == nil {
json.Unmarshal([]byte(setting.PoolInfos), &poolInfos) json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
} }
@@ -270,7 +271,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
jobResult, err := createNotebook2(models.CreateNotebook2Params{ jobResult, err := createNotebook2(models.CreateNotebook2Params{
JobName: jobName, JobName: jobName,
Description: description, Description: description,
Flavor: flavor,
Flavor: spec.SourceSpecId,
Duration: autoStopDurationMs, Duration: autoStopDurationMs,
ImageID: imageId, ImageID: imageId,
PoolID: poolInfos.PoolInfo[0].PoolId, PoolID: poolInfos.PoolInfo[0].PoolId,
@@ -292,7 +293,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
RepoID: ctx.Repo.Repository.ID, RepoID: ctx.Repo.Repository.ID,
JobID: jobResult.ID, JobID: jobResult.ID,
JobName: jobName, JobName: jobName,
FlavorCode: flavor,
FlavorCode: spec.SourceSpecId,
DisplayJobName: displayJobName, DisplayJobName: displayJobName,
JobType: string(models.JobTypeDebug), JobType: string(models.JobTypeDebug),
Type: models.TypeCloudBrainTwo, Type: models.TypeCloudBrainTwo,
@@ -302,6 +303,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
Description: description, Description: description,
CreatedUnix: createTime, CreatedUnix: createTime,
UpdatedUnix: createTime, UpdatedUnix: createTime,
Spec: spec,
}) })


if err != nil { if err != nil {
@@ -335,7 +337,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
PoolID: req.PoolID, PoolID: req.PoolID,
CreateVersion: true, CreateVersion: true,
Flavor: models.Flavor{ Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
}, },
Parameter: req.Parameters, Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl, UserImageUrl: req.UserImageUrl,
@@ -357,7 +359,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
PoolID: req.PoolID, PoolID: req.PoolID,
CreateVersion: true, CreateVersion: true,
Flavor: models.Flavor{ Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
}, },
Parameter: req.Parameters, Parameter: req.Parameters,
}, },
@@ -391,7 +393,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
BootFile: req.BootFile, BootFile: req.BootFile,
DataUrl: req.DataUrl, DataUrl: req.DataUrl,
LogUrl: req.LogUrl, LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description, Description: req.Description,
WorkServerNumber: req.WorkServerNumber, WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName, FlavorName: req.FlavorName,
@@ -400,6 +402,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
TotalVersionCount: req.TotalVersionCount, TotalVersionCount: req.TotalVersionCount,
CreatedUnix: createTime, CreatedUnix: createTime,
UpdatedUnix: createTime, UpdatedUnix: createTime,
Spec: req.Spec,
}) })


if createErr != nil { if createErr != nil {
@@ -451,7 +454,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
LogUrl: req.LogUrl, LogUrl: req.LogUrl,
PoolID: req.PoolID, PoolID: req.PoolID,
Flavor: models.Flavor{ Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
}, },
Parameter: req.Parameters, Parameter: req.Parameters,
PreVersionId: req.PreVersionId, PreVersionId: req.PreVersionId,
@@ -472,7 +475,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
LogUrl: req.LogUrl, LogUrl: req.LogUrl,
PoolID: req.PoolID, PoolID: req.PoolID,
Flavor: models.Flavor{ Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
}, },
Parameter: req.Parameters, Parameter: req.Parameters,
PreVersionId: req.PreVersionId, PreVersionId: req.PreVersionId,
@@ -524,7 +527,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
DataUrl: req.DataUrl, DataUrl: req.DataUrl,
LogUrl: req.LogUrl, LogUrl: req.LogUrl,
PreVersionId: req.PreVersionId, PreVersionId: req.PreVersionId,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description, Description: req.Description,
WorkServerNumber: req.WorkServerNumber, WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName, FlavorName: req.FlavorName,
@@ -533,6 +536,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
VersionCount: VersionListCount + 1, VersionCount: VersionListCount + 1,
CreatedUnix: createTime, CreatedUnix: createTime,
UpdatedUnix: createTime, UpdatedUnix: createTime,
Spec: req.Spec,
}) })
if createErr != nil { if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error()) log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
@@ -716,7 +720,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
PoolID: req.PoolID, PoolID: req.PoolID,
CreateVersion: true, CreateVersion: true,
Flavor: models.Flavor{ Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
}, },
Parameter: req.Parameters, Parameter: req.Parameters,
}, },
@@ -753,7 +757,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
BootFile: req.BootFile, BootFile: req.BootFile,
DataUrl: req.DataUrl, DataUrl: req.DataUrl,
LogUrl: req.LogUrl, LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description, Description: req.Description,
WorkServerNumber: req.WorkServerNumber, WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName, FlavorName: req.FlavorName,
@@ -769,6 +773,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
ResultUrl: req.ResultUrl, ResultUrl: req.ResultUrl,
CreatedUnix: createTime, CreatedUnix: createTime,
UpdatedUnix: createTime, UpdatedUnix: createTime,
Spec: req.Spec,
}) })


if err != nil { if err != nil {


+ 36
- 0
routers/admin/resources.go View File

@@ -8,6 +8,8 @@ import (
"code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/cloudbrain/resource" "code.gitea.io/gitea/services/cloudbrain/resource"
"net/http" "net/http"
"strconv"
"strings"
) )


const ( const (
@@ -246,3 +248,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) {
} }
ctx.JSON(http.StatusOK, response.Success()) ctx.JSON(http.StatusOK, response.Success())
} }

func RefreshHistorySpec(ctx *context.Context) {
scope := ctx.Query("scope")
list := ctx.Query("list")

var scopeAll = false
if scope == "all" {
scopeAll = true
}
var ids = make([]int64, 0)
if list != "" {
strs := strings.Split(list, "|")
for _, s := range strs {
i, err := strconv.ParseInt(s, 10, 64)
if err != nil {
ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
return
}
ids = append(ids, i)
}

}

total, success, err := resource.RefreshHistorySpec(scopeAll, ids)
if err != nil {
log.Error("RefreshHistorySpec error. %v", err)
ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
return
}
r := make(map[string]interface{}, 0)
r["success"] = success
r["total"] = total
ctx.JSON(http.StatusOK, response.SuccessWithData(r))
}

+ 2
- 0
routers/private/internal.go View File

@@ -6,6 +6,7 @@
package private package private


import ( import (
"code.gitea.io/gitea/routers/admin"
"strings" "strings"


"code.gitea.io/gitea/routers/repo" "code.gitea.io/gitea/routers/repo"
@@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/tool/org_stat", OrgStatisticManually) m.Get("/tool/org_stat", OrgStatisticManually)
m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit) m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit)
m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration) m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration)
m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec)


}, CheckInternalToken) }, CheckInternalToken)
} }

+ 13
- 96
routers/repo/cloudbrain.go View File

@@ -122,89 +122,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
ctx.Data["QueuesDetail"] = queuesDetail ctx.Data["QueuesDetail"] = queuesDetail
} }


cloudbrain.InitSpecialPool()

if gpuInfos == nil {
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
}
ctx.Data["gpu_types"] = gpuInfos.GpuInfo

if trainGpuInfos == nil {
json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos)
}
ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo

if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" {
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos)
}
if inferenceGpuInfos != nil {
ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo
}

if benchmarkGpuInfos == nil {
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
}
ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo

if benchmarkResourceSpecs == nil {
json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs)
}
ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec

if cloudbrain.ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
}
ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec

if cloudbrain.TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec

if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
}
if cloudbrain.InferenceResourceSpecs != nil {
ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec
}

prepareCloudbrainOneSpecs(ctx) prepareCloudbrainOneSpecs(ctx)


if cloudbrain.SpecialPools != nil {
var debugGpuTypes []*models.GpuInfo
var trainGpuTypes []*models.GpuInfo

for _, pool := range cloudbrain.SpecialPools.Pools {
isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID)
if isOrgMember {
for _, jobType := range pool.JobType {
if jobType == string(models.JobTypeDebug) {
debugGpuTypes = append(debugGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["resource_specs"] = pool.ResourceSpec
}
} else if jobType == string(models.JobTypeTrain) {
trainGpuTypes = append(trainGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["train_resource_specs"] = pool.ResourceSpec
}
}
}
break
}

}

if len(debugGpuTypes) > 0 {
ctx.Data["gpu_types"] = debugGpuTypes
}

if len(trainGpuTypes) > 0 {
ctx.Data["train_gpu_types"] = trainGpuTypes
}

}

ctx.Data["params"] = "" ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName ctx.Data["branchName"] = ctx.Repo.BranchName


@@ -229,8 +148,6 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
AiCenterCode: models.AICenterOfCloudBrainOne, AiCenterCode: models.AICenterOfCloudBrainOne,
}) })
ctx.Data["debug_specs"] = debugSpecs ctx.Data["debug_specs"] = debugSpecs
b, _ := json.Marshal(debugSpecs)
log.Info("%s", string(b))


trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{ trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeTrain, JobType: models.JobTypeTrain,
@@ -247,6 +164,14 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
AiCenterCode: models.AICenterOfCloudBrainOne, AiCenterCode: models.AICenterOfCloudBrainOne,
}) })
ctx.Data["inference_specs"] = inferenceSpecs ctx.Data["inference_specs"] = inferenceSpecs

benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne,
})
ctx.Data["benchmark_specs"] = benchmarkSpecs
} }


func CloudBrainNew(ctx *context.Context) { func CloudBrainNew(ctx *context.Context) {
@@ -348,18 +273,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
command = commandTrain command = commandTrain
} }


errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)

if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(errStr, tpl, &form)
return
}

if branchName == "" { if branchName == "" {
branchName = cloudbrain.DefaultBranchName branchName = cloudbrain.DefaultBranchName
} }
errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
if errStr != "" { if errStr != "" {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form) ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form)
@@ -375,7 +292,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
AiCenterCode: models.AICenterOfCloudBrainOne}) AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil { if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return return
} }


@@ -534,7 +451,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
AiCenterCode: models.AICenterOfCloudBrainOne}) AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil { if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return return
} }
req := cloudbrain.GenerateCloudBrainTaskReq{ req := cloudbrain.GenerateCloudBrainTaskReq{
@@ -2447,7 +2364,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
AiCenterCode: models.AICenterOfCloudBrainOne}) AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil { if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tplCloudBrainBenchmarkNew, &form)
ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form)
return return
} }


@@ -2587,7 +2504,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
AiCenterCode: models.AICenterOfCloudBrainOne}) AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil { if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx) cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return return
} }
req := cloudbrain.GenerateCloudBrainTaskReq{ req := cloudbrain.GenerateCloudBrainTaskReq{


+ 67
- 57
routers/repo/modelarts.go View File

@@ -2,6 +2,7 @@ package repo


import ( import (
"archive/zip" "archive/zip"
"code.gitea.io/gitea/services/cloudbrain/resource"
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
@@ -141,11 +142,7 @@ func notebookNewDataPrepare(ctx *context.Context) error {
} }
ctx.Data["images"] = modelarts.ImageInfos.ImageInfo ctx.Data["images"] = modelarts.ImageInfos.ImageInfo


if modelarts.FlavorInfos == nil {
json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
}
ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug))
prepareCloudbrainTwoDebugSpecs(ctx)


ctx.Data["datasetType"] = models.TypeCloudBrainTwo ctx.Data["datasetType"] = models.TypeCloudBrainTwo


@@ -155,6 +152,16 @@ func notebookNewDataPrepare(ctx *context.Context) error {
return nil return nil
} }


func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) {
noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeDebug,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo,
})
ctx.Data["Specs"] = noteBookSpecs
}

func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) { func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
ctx.Data["PageIsNotebook"] = true ctx.Data["PageIsNotebook"] = true
jobName := form.JobName jobName := form.JobName
@@ -205,7 +212,6 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
jobName := util.ConvertDisplayJobNameToJobName(displayJobName) jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment uuid := form.Attachment
description := form.Description description := form.Description
flavor := form.Flavor
imageId := form.ImageId imageId := form.ImageId
repo := ctx.Repo.Repository repo := ctx.Repo.Repository


@@ -241,14 +247,17 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
} }
} }


errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeDebug,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
notebookNewDataPrepare(ctx) notebookNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form)
return return
} }

err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId)
err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec)
if err != nil { if err != nil {
log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"]) log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
notebookNewDataPrepare(ctx) notebookNewDataPrepare(ctx)
@@ -728,14 +737,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
} }
ctx.Data["engine_versions"] = versionInfos.Version ctx.Data["engine_versions"] = versionInfos.Version


var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info

setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)


ctx.Data["params"] = "" ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -753,6 +755,16 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
return nil return nil
} }


func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) {
noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo,
})
ctx.Data["Specs"] = noteBookSpecs
}

func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) { func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
modelarts.InitSpecialPool() modelarts.InitSpecialPool()


@@ -835,13 +847,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts
} }
ctx.Data["engine_versions"] = versionInfos.Version ctx.Data["engine_versions"] = versionInfos.Version


var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)


configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom) configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil { if err != nil {
@@ -1020,13 +1026,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai
} }
ctx.Data["engine_versions"] = versionInfos.Version ctx.Data["engine_versions"] = versionInfos.Version


var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)


var Parameters modelarts.Parameters var Parameters modelarts.Parameters
if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil { if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
@@ -1079,7 +1079,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
workServerNumber := form.WorkServerNumber workServerNumber := form.WorkServerNumber
engineID := form.EngineID engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile) bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params params := form.Params
poolID := form.PoolID poolID := form.PoolID
//isSaveParam := form.IsSaveParam //isSaveParam := form.IsSaveParam
@@ -1117,10 +1116,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return return
} }


errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
trainJobErrorNewDataPrepare(ctx, form) trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form)
return return
} }
//Determine whether the task name of the task in the project is duplicated //Determine whether the task name of the task in the project is duplicated
@@ -1283,7 +1286,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
BootFileUrl: codeObsPath + bootFile, BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile, BootFile: bootFile,
TrainUrl: outputObsPath, TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber, WorkServerNumber: workServerNumber,
EngineID: int64(engineID), EngineID: int64(engineID),
LogUrl: logObsPath, LogUrl: logObsPath,
@@ -1299,6 +1301,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
VersionCount: VersionCount, VersionCount: VersionCount,
TotalVersionCount: modelarts.TotalVersionCount, TotalVersionCount: modelarts.TotalVersionCount,
DatasetName: datasetNames, DatasetName: datasetNames,
Spec: spec,
} }
userCommand, userImageUrl := getUserCommand(engineID, req) userCommand, userImageUrl := getUserCommand(engineID, req)
req.UserCommand = userCommand req.UserCommand = userCommand
@@ -1384,7 +1387,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
workServerNumber := form.WorkServerNumber workServerNumber := form.WorkServerNumber
engineID := form.EngineID engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile) bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params params := form.Params
poolID := form.PoolID poolID := form.PoolID
//isSaveParam := form.IsSaveParam //isSaveParam := form.IsSaveParam
@@ -1414,10 +1416,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
return return
} }


errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
versionErrorDataPrepare(ctx, form) versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form)
return return
} }


@@ -1571,7 +1577,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
BootFileUrl: codeObsPath + bootFile, BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile, BootFile: bootFile,
TrainUrl: outputObsPath, TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber, WorkServerNumber: workServerNumber,
IsLatestVersion: isLatestVersion, IsLatestVersion: isLatestVersion,
EngineID: int64(engineID), EngineID: int64(engineID),
@@ -1588,6 +1593,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
PreVersionName: PreVersionName, PreVersionName: PreVersionName,
TotalVersionCount: latestTask.TotalVersionCount + 1, TotalVersionCount: latestTask.TotalVersionCount + 1,
DatasetName: datasetNames, DatasetName: datasetNames,
Spec: spec,
} }
userCommand, userImageUrl := getUserCommand(engineID, req) userCommand, userImageUrl := getUserCommand(engineID, req)
req.UserCommand = userCommand req.UserCommand = userCommand
@@ -2016,7 +2022,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
workServerNumber := form.WorkServerNumber workServerNumber := form.WorkServerNumber
engineID := form.EngineID engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile) bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params params := form.Params
poolID := form.PoolID poolID := form.PoolID
repo := ctx.Repo.Repository repo := ctx.Repo.Repository
@@ -2078,13 +2083,16 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
} }
} }


errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeInference,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
inferenceJobErrorNewDataPrepare(ctx, form) inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form)
return return
} }

//todo: del the codeLocalPath //todo: del the codeLocalPath
_, err = ioutil.ReadDir(codeLocalPath) _, err = ioutil.ReadDir(codeLocalPath)
if err == nil { if err == nil {
@@ -2170,7 +2178,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
BootFileUrl: codeObsPath + bootFile, BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile, BootFile: bootFile,
TrainUrl: trainUrl, TrainUrl: trainUrl,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber, WorkServerNumber: workServerNumber,
EngineID: int64(engineID), EngineID: int64(engineID),
LogUrl: logObsPath, LogUrl: logObsPath,
@@ -2369,14 +2376,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
} }
ctx.Data["engine_versions"] = versionInfos.Version ctx.Data["engine_versions"] = versionInfos.Version


var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}

ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference))
prepareCloudbrainTwoInferenceSpecs(ctx)


ctx.Data["params"] = "" ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -2407,6 +2407,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
return nil return nil
} }


func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) {
noteBookSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeInference,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo,
})
ctx.Data["Specs"] = noteBookSpecs
}

func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error { func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
ctx.Data["PageIsCloudBrain"] = true ctx.Data["PageIsCloudBrain"] = true




+ 263
- 1
services/cloudbrain/resource/resource_specification.go View File

@@ -2,12 +2,17 @@ package resource


import ( import (
"code.gitea.io/gitea/models" "code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/grampus" "code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response" "code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/admin/operate_log" "code.gitea.io/gitea/services/admin/operate_log"
"encoding/json"
"errors"
"fmt" "fmt"
"strings" "strings"
"time"
) )


func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error { func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error {
@@ -186,7 +191,7 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod
} }


func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) { func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) {
r, err := models.FindAvailableSpecs(opts)
r, err := models.FindSpecs(opts)
if err != nil { if err != nil {
log.Error("FindAvailableSpecs error.%v", err) log.Error("FindAvailableSpecs error.%v", err)
return nil, err return nil, err
@@ -270,3 +275,260 @@ func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) {
} }
return c.ConvertToSpecification(), nil return c.ConvertToSpecification(), nil
} }

func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) {
var success int64
var total int64

if !scopeAll {
if ids == nil || len(ids) == 0 {
return 0, 0, nil
}
total = int64(len(ids))
tasks, err := models.GetCloudbrainWithDeletedByIDs(ids)
if err != nil {
return total, 0, err
}
for _, task := range tasks {
err = RefreshOneHistorySpec(task)
if err != nil {
log.Error("RefreshOneHistorySpec error.%v", err)
continue
}
success++
}

} else {
page := 1
pageSize := 100
n, err := models.CountNoSpecHistoricTask()
if err != nil {
log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err)
return 0, 0, err
}
total = n
for i := 0; i < 1000; i++ {
list, err := models.FindNoSpecHistoricTask(page, pageSize)
if err != nil {
log.Error("FindNoSpecHistoricTask error.page=%d pageSize=%d e=%v", page, pageSize, err)
return total, success, err
}
if len(list) == 0 {
log.Info("RefreshHistorySpec. list is empty")
break
}
for _, task := range list {
time.Sleep(1 * time.Second)
err = RefreshOneHistorySpec(task)
if err != nil {
log.Error("RefreshOneHistorySpec error.%v", err)
continue
}
success++
}
if len(list) < pageSize {
log.Info("RefreshHistorySpec. list < pageSize")
break
}
}
}
return total, success, nil

}

func RefreshOneHistorySpec(task *models.Cloudbrain) error {
var spec *models.Specification
var err error
switch task.Type {
case models.TypeCloudBrainOne:
spec, err = getCloudbrainOneSpec(task)
}
if err != nil {
log.Error("find spec error,task.ID=%d err=%v", task.ID, err)
return err
}
if spec == nil {
log.Error("find spec failed,task.ID=%d", task.ID)
return errors.New("find spec failed")
}
return InsertCloudbrainSpec(task.ID, spec)
}

func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) {
//find from remote
result, err := cloudbrain.GetJob(task.JobID)
if err != nil {
log.Error("getCloudbrainOneSpec error. %v", err)
return nil, err
}
if result != nil {
jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory)
if task.ComputeResource == "CPU/GPU" {
task.ComputeResource = models.GPU
}
var shmMB float32
if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 {
shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024
}

opt := models.FindSpecsOptions{
ComputeResource: task.ComputeResource,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne,
QueueCode: task.GpuQueue,
AccCardsNum: jobRes.Resource.NvidiaComGpu,
UseAccCardsNum: true,
CpuCores: jobRes.Resource.CPU,
UseCpuCores: true,
MemGiB: memSize,
UseMemGiB: memSize > 0,
ShareMemGiB: shmMB,
UseShareMemGiB: shmMB > 0,
RequestAll: true,
}
specs, err := models.FindSpecs(opt)
if err != nil {
log.Error("getCloudbrainOneSpec from remote error,%v", err)
return nil, err
}
if len(specs) == 1 {
return specs[0], nil
}
if len(specs) == 0 {
s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
if err != nil {
log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
return nil, nil
}
return s, nil
}
if len(specs) > 1 {
log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
return nil, nil
}

} else {
//find from config
var specConfig *models.ResourceSpec
hasSpec := false
if task.JobType == string(models.JobTypeTrain) {
if cloudbrain.TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
specConfig = tmp
break
}
}
} else if task.JobType == string(models.JobTypeInference) {
if cloudbrain.InferenceResourceSpecs == nil {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
}
for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
specConfig = tmp
break
}
}
} else {
if cloudbrain.ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
}
for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec {
if tmp.Id == task.ResourceSpecId {
hasSpec = true
specConfig = tmp
break

}
}
}
if !hasSpec && cloudbrain.SpecialPools != nil {

for _, specialPool := range cloudbrain.SpecialPools.Pools {

if specialPool.ResourceSpec != nil {

for _, spec := range specialPool.ResourceSpec {
if task.ResourceSpecId == spec.Id {
hasSpec = true
specConfig = spec
break
}
}
}
}
}
if specConfig == nil {
log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId)
return nil, nil
}
opt := models.FindSpecsOptions{
JobType: models.JobType(task.JobType),
ComputeResource: task.ComputeResource,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne,
QueueCode: task.GpuQueue,
AccCardsNum: specConfig.GpuNum,
UseAccCardsNum: true,
CpuCores: specConfig.GpuNum,
UseCpuCores: true,
MemGiB: float32(specConfig.MemMiB) / 1024,
UseMemGiB: true,
ShareMemGiB: float32(specConfig.ShareMemMiB) / 1024,
UseShareMemGiB: true,
RequestAll: true,
}
specs, err := models.FindSpecs(opt)
if err != nil {
log.Error("getCloudbrainOneSpec from config error,%v", err)
return nil, err
}
if len(specs) > 1 {
log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
return nil, nil
}
if len(specs) == 0 {
s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
if err != nil {
log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
return nil, nil
}
return s, nil
}
return specs[0], nil
}
return nil, nil

}

func RefreshCloudbrainTwoSpec(task *models.Cloudbrain) error {
return nil
}

func RefreshC2NetSpec(task *models.Cloudbrain) error {
return nil
}

func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) {
return models.InitQueueAndSpec(models.ResourceQueue{
QueueCode: opt.QueueCode,
Cluster: opt.Cluster,
AiCenterCode: opt.AiCenterCode,
AiCenterName: aiCenterName,
ComputeResource: opt.ComputeResource,
AccCardType: models.GetCloudbrainOneAccCardType(opt.QueueCode),
Remark: remark,
}, models.ResourceSpecification{
AccCardsNum: opt.AccCardsNum,
CpuCores: opt.CpuCores,
MemGiB: opt.MemGiB,
GPUMemGiB: opt.GPUMemGiB,
ShareMemGiB: opt.ShareMemGiB,
Status: models.SpecOffShelf,
})
}

Loading…
Cancel
Save