Browse Source

#2701

update
tags/v1.22.9.1^2^2
chenyifan01 3 years ago
parent
commit
2898c40fcc
10 changed files with 547 additions and 176 deletions
  1. +6
    -0
      models/cloudbrain.go
  2. +23
    -0
      models/cloudbrain_spec.go
  3. +116
    -9
      models/resource_specification.go
  4. +3
    -0
      modules/auth/modelarts.go
  5. +18
    -13
      modules/modelarts/modelarts.go
  6. +36
    -0
      routers/admin/resources.go
  7. +2
    -0
      routers/private/internal.go
  8. +13
    -96
      routers/repo/cloudbrain.go
  9. +67
    -57
      routers/repo/modelarts.go
  10. +263
    -1
      services/cloudbrain/resource/resource_specification.go

+ 6
- 0
models/cloudbrain.go View File

@@ -2313,3 +2313,9 @@ func GetCloudbrainByIDs(ids []int64) ([]*Cloudbrain, error) {
In("id", ids).
Find(&cloudbrains)
}

// GetCloudbrainWithDeletedByIDs loads the cloudbrain tasks matching the given
// ids, including soft-deleted rows (Unscoped bypasses the deleted filter).
func GetCloudbrainWithDeletedByIDs(ids []int64) ([]*Cloudbrain, error) {
	tasks := make([]*Cloudbrain, 0)
	err := x.Unscoped().In("id", ids).Find(&tasks)
	return tasks, err
}

+ 23
- 0
models/cloudbrain_spec.go View File

@@ -83,3 +83,26 @@ func GetCloudbrainSpecByID(cloudbrainId int64) (*CloudbrainSpec, error) {
}
return r, nil
}

// FindNoSpecHistoricTask returns one page (1-based) of historic cloudbrain
// tasks, including soft-deleted ones, that have no cloudbrain_spec record,
// ordered by task id.
func FindNoSpecHistoricTask(page, pageSize int) ([]*Cloudbrain, error) {
	tasks := make([]*Cloudbrain, 0)
	if err := x.Unscoped().
		Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
		OrderBy("cloudbrain.id").
		Limit(pageSize, (page-1)*pageSize).
		Find(&tasks); err != nil {
		return nil, err
	}
	return tasks, nil
}

// CountNoSpecHistoricTask counts historic cloudbrain tasks, including
// soft-deleted ones, that have no cloudbrain_spec record.
func CountNoSpecHistoricTask() (int64, error) {
	count, err := x.Unscoped().
		Where(" 1=1 and not exists (select 1 from cloudbrain_spec where cloudbrain.id = cloudbrain_spec.cloudbrain_id)").
		Count(&Cloudbrain{})
	if err == nil {
		return count, nil
	}
	return 0, err
}

+ 116
- 9
models/resource_specification.go View File

@@ -147,6 +147,21 @@ type FindSpecsOptions struct {
Cluster string
AiCenterCode string
SpecId int64
QueueCode string
SourceSpecId string
AccCardsNum int
UseAccCardsNum bool
AccCardType string
CpuCores int
UseCpuCores bool
MemGiB float32
UseMemGiB bool
GPUMemGiB float32
UseGPUMemGiB bool
ShareMemGiB float32
UseShareMemGiB bool
//if true,find specs no matter used or not used in scene. if false,only find specs used in scene
RequestAll bool
}

type Specification struct {
@@ -316,9 +331,10 @@ func SyncGrampusSpecs(updateList []ResourceSpecification, insertList []ResourceS
return sess.Commit()
}

func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
//FindSpecs
func FindSpecs(opts FindSpecsOptions) ([]*Specification, error) {
var cond = builder.NewCond()
if opts.JobType != "" {
if !opts.RequestAll && opts.JobType != "" {
cond = cond.And(builder.Eq{"resource_scene.job_type": opts.JobType})
}
if opts.ComputeResource != "" {
@@ -333,17 +349,108 @@ func FindAvailableSpecs(opts FindSpecsOptions) ([]*Specification, error) {
if opts.SpecId > 0 {
cond = cond.And(builder.Eq{"resource_specification.id": opts.SpecId})
}
cond = cond.And(builder.Or(builder.Eq{"resource_scene.delete_time": 0}, builder.IsNull{"resource_scene.delete_time"}))

if opts.QueueCode != "" {
cond = cond.And(builder.Eq{"resource_queue.queue_code": opts.QueueCode})
}
if opts.SourceSpecId != "" {
cond = cond.And(builder.Eq{"resource_specification.source_spec_id": opts.SourceSpecId})
}
if opts.UseAccCardsNum {
cond = cond.And(builder.Eq{"resource_specification.acc_cards_num": opts.AccCardsNum})
}
if opts.AccCardType != "" {
cond = cond.And(builder.Eq{"resource_queue.acc_card_type": opts.AccCardType})
}
if opts.UseCpuCores {
cond = cond.And(builder.Eq{"resource_specification.cpu_cores": opts.CpuCores})
}
if opts.UseMemGiB {
cond = cond.And(builder.Eq{"resource_specification.mem_gi_b": opts.MemGiB})
}
if opts.UseGPUMemGiB {
cond = cond.And(builder.Eq{"resource_specification.gpu_mem_gi_b": opts.GPUMemGiB})
}
if opts.UseShareMemGiB {
cond = cond.And(builder.Eq{"resource_specification.share_mem_gi_b": opts.ShareMemGiB})
}
r := make([]*Specification, 0)
err := x.Where(cond).
Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id").
Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id").
OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
s := x.Where(cond).
Join("INNER", "resource_queue", "resource_queue.id = resource_specification.queue_id")

if !opts.RequestAll {
s = s.Join("INNER", "resource_scene_spec", "resource_scene_spec.spec_id = resource_specification.id").
Join("INNER", "resource_scene", "resource_scene_spec.scene_id = resource_scene.id")
}
err := s.OrderBy("resource_queue.compute_resource asc,resource_queue.acc_card_type asc,resource_specification.acc_cards_num asc").
Unscoped().Find(&r)
if err != nil {
return nil, err
}
return r, nil
}

// InitQueueAndSpec ensures the given resource queue exists (creating it when
// missing) and inserts the given specification under that queue, all in one
// transaction. On success it returns the combined queue+spec view.
func InitQueueAndSpec(queue ResourceQueue, spec ResourceSpecification) (*Specification, error) {
	sess := x.NewSession()
	defer sess.Close()

	// Fix: the Begin error was previously ignored.
	if err := sess.Begin(); err != nil {
		return nil, err
	}
	// Look up an existing queue with the same identity fields.
	param := ResourceQueue{
		QueueCode:       queue.QueueCode,
		Cluster:         queue.Cluster,
		AiCenterCode:    queue.AiCenterCode,
		ComputeResource: queue.ComputeResource,
		AccCardType:     queue.AccCardType,
	}
	_, err := sess.Get(&param)
	if err != nil {
		sess.Rollback()
		return nil, err
	}
	if param.ID == 0 {
		// No such queue yet: create it (queue.ID is filled in by the insert).
		if _, err = sess.InsertOne(&queue); err != nil {
			sess.Rollback()
			return nil, err
		}
	} else {
		// Reuse the existing queue record.
		queue = param
	}

	spec.QueueId = queue.ID
	if _, err = sess.InsertOne(&spec); err != nil {
		sess.Rollback()
		return nil, err
	}
	// Fix: the Commit error was previously ignored, so a failed commit was
	// reported as success even though nothing was persisted.
	if err = sess.Commit(); err != nil {
		return nil, err
	}
	return &Specification{
		ID:              spec.ID,
		SourceSpecId:    spec.SourceSpecId,
		AccCardsNum:     spec.AccCardsNum,
		AccCardType:     queue.AccCardType,
		CpuCores:        spec.CpuCores,
		MemGiB:          spec.MemGiB,
		GPUMemGiB:       spec.GPUMemGiB,
		ShareMemGiB:     spec.ShareMemGiB,
		ComputeResource: queue.ComputeResource,
		UnitPrice:       spec.UnitPrice,
		QueueId:         queue.ID,
		QueueCode:       queue.QueueCode,
		Cluster:         queue.Cluster,
		AiCenterCode:    queue.AiCenterCode,
		AiCenterName:    queue.AiCenterName,
	}, nil
}

// GetCloudbrainOneAccCardType maps a cloudbrain-one queue code to the
// accelerator card type backing that queue. Unknown queue codes yield "".
func GetCloudbrainOneAccCardType(queueCode string) string {
	cardTypeByQueue := map[string]string{
		"a100":       "A100",
		"openidebug": "T4",
		"openidgx":   "V100",
	}
	if cardType, ok := cardTypeByQueue[queueCode]; ok {
		return cardType
	}
	return ""
}

+ 3
- 0
modules/auth/modelarts.go View File

@@ -22,6 +22,7 @@ type CreateModelArtsNotebookForm struct {
Description string `form:"description"`
Flavor string `form:"flavor" binding:"Required"`
ImageId string `form:"image_id" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
}

func (f *CreateModelArtsNotebookForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {
@@ -46,6 +47,7 @@ type CreateModelArtsTrainJobForm struct {
VersionName string `form:"version_name" binding:"Required"`
FlavorName string `form:"flaver_names" binding:"Required"`
EngineName string `form:"engine_names" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
}

type CreateModelArtsInferenceJobForm struct {
@@ -71,6 +73,7 @@ type CreateModelArtsInferenceJobForm struct {
ModelName string `form:"model_name" binding:"Required"`
ModelVersion string `form:"model_version" binding:"Required"`
CkptName string `form:"ckpt_name" binding:"Required"`
SpecId int64 `form:"spec_id" binding:"Required"`
}

func (f *CreateModelArtsTrainJobForm) Validate(ctx *macaron.Context, errs binding.Errors) binding.Errors {


+ 18
- 13
modules/modelarts/modelarts.go View File

@@ -84,7 +84,6 @@ type GenerateTrainJobReq struct {
BootFileUrl string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
@@ -96,6 +95,7 @@ type GenerateTrainJobReq struct {
BranchName string
PreVersionId int64
PreVersionName string
FlavorCode string
FlavorName string
VersionCount int
EngineName string
@@ -103,6 +103,7 @@ type GenerateTrainJobReq struct {
UserImageUrl string
UserCommand string
DatasetName string
Spec *models.Specification
}

type GenerateInferenceJobReq struct {
@@ -115,7 +116,6 @@ type GenerateInferenceJobReq struct {
BootFileUrl string
DataUrl string
TrainUrl string
FlavorCode string
LogUrl string
PoolID string
WorkServerNumber int
@@ -134,6 +134,7 @@ type GenerateInferenceJobReq struct {
ModelVersion string
CkptName string
ResultUrl string
Spec *models.Specification
}

type VersionInfo struct {
@@ -256,7 +257,7 @@ func GenerateTask(ctx *context.Context, jobName, uuid, description, flavor strin
return nil
}

func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, flavor, imageId string) error {
func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, description, imageId string, spec *models.Specification) error {
if poolInfos == nil {
json.Unmarshal([]byte(setting.PoolInfos), &poolInfos)
}
@@ -270,7 +271,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
jobResult, err := createNotebook2(models.CreateNotebook2Params{
JobName: jobName,
Description: description,
Flavor: flavor,
Flavor: spec.SourceSpecId,
Duration: autoStopDurationMs,
ImageID: imageId,
PoolID: poolInfos.PoolInfo[0].PoolId,
@@ -292,7 +293,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
RepoID: ctx.Repo.Repository.ID,
JobID: jobResult.ID,
JobName: jobName,
FlavorCode: flavor,
FlavorCode: spec.SourceSpecId,
DisplayJobName: displayJobName,
JobType: string(models.JobTypeDebug),
Type: models.TypeCloudBrainTwo,
@@ -302,6 +303,7 @@ func GenerateNotebook2(ctx *context.Context, displayJobName, jobName, uuid, desc
Description: description,
CreatedUnix: createTime,
UpdatedUnix: createTime,
Spec: spec,
})

if err != nil {
@@ -335,7 +337,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
UserImageUrl: req.UserImageUrl,
@@ -357,7 +359,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
},
@@ -391,7 +393,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
@@ -400,6 +402,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (err error
TotalVersionCount: req.TotalVersionCount,
CreatedUnix: createTime,
UpdatedUnix: createTime,
Spec: req.Spec,
})

if createErr != nil {
@@ -451,7 +454,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
@@ -472,7 +475,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
LogUrl: req.LogUrl,
PoolID: req.PoolID,
Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
PreVersionId: req.PreVersionId,
@@ -524,7 +527,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
PreVersionId: req.PreVersionId,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
@@ -533,6 +536,7 @@ func GenerateTrainJobVersion(ctx *context.Context, req *GenerateTrainJobReq, job
VersionCount: VersionListCount + 1,
CreatedUnix: createTime,
UpdatedUnix: createTime,
Spec: req.Spec,
})
if createErr != nil {
log.Error("CreateCloudbrain(%s) failed:%v", req.JobName, createErr.Error())
@@ -716,7 +720,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
PoolID: req.PoolID,
CreateVersion: true,
Flavor: models.Flavor{
Code: req.FlavorCode,
Code: req.Spec.SourceSpecId,
},
Parameter: req.Parameters,
},
@@ -753,7 +757,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
BootFile: req.BootFile,
DataUrl: req.DataUrl,
LogUrl: req.LogUrl,
FlavorCode: req.FlavorCode,
FlavorCode: req.Spec.SourceSpecId,
Description: req.Description,
WorkServerNumber: req.WorkServerNumber,
FlavorName: req.FlavorName,
@@ -769,6 +773,7 @@ func GenerateInferenceJob(ctx *context.Context, req *GenerateInferenceJobReq) (e
ResultUrl: req.ResultUrl,
CreatedUnix: createTime,
UpdatedUnix: createTime,
Spec: req.Spec,
})

if err != nil {


+ 36
- 0
routers/admin/resources.go View File

@@ -8,6 +8,8 @@ import (
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/cloudbrain/resource"
"net/http"
"strconv"
"strings"
)

const (
@@ -246,3 +248,37 @@ func UpdateResourceScene(ctx *context.Context, req models.ResourceSceneReq) {
}
ctx.JSON(http.StatusOK, response.Success())
}

// RefreshHistorySpec rebuilds spec records for historic cloudbrain tasks.
// Query parameters:
//   scope=all        process every task missing a spec record
//   list=id1|id2|... process only the listed task ids
// Responds with {"total": n, "success": n} on success.
func RefreshHistorySpec(ctx *context.Context) {
	scopeAll := ctx.Query("scope") == "all"

	ids := make([]int64, 0)
	if list := ctx.Query("list"); list != "" {
		for _, s := range strings.Split(list, "|") {
			id, err := strconv.ParseInt(s, 10, 64)
			if err != nil {
				ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
				return
			}
			ids = append(ids, id)
		}
	}

	total, success, err := resource.RefreshHistorySpec(scopeAll, ids)
	if err != nil {
		log.Error("RefreshHistorySpec error. %v", err)
		ctx.JSON(http.StatusOK, response.ServerError(err.Error()))
		return
	}
	result := map[string]interface{}{
		"success": success,
		"total":   total,
	}
	ctx.JSON(http.StatusOK, response.SuccessWithData(result))
}

+ 2
- 0
routers/private/internal.go View File

@@ -6,6 +6,7 @@
package private

import (
"code.gitea.io/gitea/routers/admin"
"strings"

"code.gitea.io/gitea/routers/repo"
@@ -51,6 +52,7 @@ func RegisterRoutes(m *macaron.Macaron) {
m.Get("/tool/org_stat", OrgStatisticManually)
m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit)
m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration)
m.Post("/resources/specification/handle_historical_task", admin.RefreshHistorySpec)

}, CheckInternalToken)
}

+ 13
- 96
routers/repo/cloudbrain.go View File

@@ -122,89 +122,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
ctx.Data["QueuesDetail"] = queuesDetail
}

cloudbrain.InitSpecialPool()

if gpuInfos == nil {
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
}
ctx.Data["gpu_types"] = gpuInfos.GpuInfo

if trainGpuInfos == nil {
json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos)
}
ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo

if inferenceGpuInfos == nil && setting.InferenceGpuTypes != "" {
json.Unmarshal([]byte(setting.InferenceGpuTypes), &inferenceGpuInfos)
}
if inferenceGpuInfos != nil {
ctx.Data["inference_gpu_types"] = inferenceGpuInfos.GpuInfo
}

if benchmarkGpuInfos == nil {
json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos)
}
ctx.Data["benchmark_gpu_types"] = benchmarkGpuInfos.GpuInfo

if benchmarkResourceSpecs == nil {
json.Unmarshal([]byte(setting.BenchmarkResourceSpecs), &benchmarkResourceSpecs)
}
ctx.Data["benchmark_resource_specs"] = benchmarkResourceSpecs.ResourceSpec

if cloudbrain.ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
}
ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec

if cloudbrain.TrainResourceSpecs == nil {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec

if cloudbrain.InferenceResourceSpecs == nil && setting.InferenceResourceSpecs != "" {
json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
}
if cloudbrain.InferenceResourceSpecs != nil {
ctx.Data["inference_resource_specs"] = cloudbrain.InferenceResourceSpecs.ResourceSpec
}

prepareCloudbrainOneSpecs(ctx)

if cloudbrain.SpecialPools != nil {
var debugGpuTypes []*models.GpuInfo
var trainGpuTypes []*models.GpuInfo

for _, pool := range cloudbrain.SpecialPools.Pools {
isOrgMember, _ := models.IsOrganizationMemberByOrgName(pool.Org, ctx.User.ID)
if isOrgMember {
for _, jobType := range pool.JobType {
if jobType == string(models.JobTypeDebug) {
debugGpuTypes = append(debugGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["resource_specs"] = pool.ResourceSpec
}
} else if jobType == string(models.JobTypeTrain) {
trainGpuTypes = append(trainGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["train_resource_specs"] = pool.ResourceSpec
}
}
}
break
}

}

if len(debugGpuTypes) > 0 {
ctx.Data["gpu_types"] = debugGpuTypes
}

if len(trainGpuTypes) > 0 {
ctx.Data["train_gpu_types"] = trainGpuTypes
}

}

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName

@@ -229,8 +148,6 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
AiCenterCode: models.AICenterOfCloudBrainOne,
})
ctx.Data["debug_specs"] = debugSpecs
b, _ := json.Marshal(debugSpecs)
log.Info("%s", string(b))

trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
@@ -247,6 +164,14 @@ func prepareCloudbrainOneSpecs(ctx *context.Context) {
AiCenterCode: models.AICenterOfCloudBrainOne,
})
ctx.Data["inference_specs"] = inferenceSpecs

benchmarkSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
JobType: models.JobTypeBenchmark,
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne,
})
ctx.Data["benchmark_specs"] = benchmarkSpecs
}

func CloudBrainNew(ctx *context.Context) {
@@ -348,18 +273,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
command = commandTrain
}

errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)

if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(errStr, tpl, &form)
return
}

if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
errStr = loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
errStr := loadCodeAndMakeModelPath(repo, codePath, branchName, jobName, cloudbrain.ModelMountPath)
if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tpl, &form)
@@ -375,7 +292,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}

@@ -534,7 +451,7 @@ func CloudBrainInferenceJobCreate(ctx *context.Context, form auth.CreateCloudBra
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
req := cloudbrain.GenerateCloudBrainTaskReq{
@@ -2447,7 +2364,7 @@ func BenchMarkAlgorithmCreate(ctx *context.Context, form auth.CreateCloudBrainFo
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tplCloudBrainBenchmarkNew, &form)
ctx.RenderWithErr("Resource specification not available", tplCloudBrainBenchmarkNew, &form)
return
}

@@ -2587,7 +2504,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm)
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr("Illegal resource specification", tpl, &form)
ctx.RenderWithErr("Resource specification not available", tpl, &form)
return
}
req := cloudbrain.GenerateCloudBrainTaskReq{


+ 67
- 57
routers/repo/modelarts.go View File

@@ -2,6 +2,7 @@ package repo

import (
"archive/zip"
"code.gitea.io/gitea/services/cloudbrain/resource"
"encoding/json"
"errors"
"fmt"
@@ -141,11 +142,7 @@ func notebookNewDataPrepare(ctx *context.Context) error {
}
ctx.Data["images"] = modelarts.ImageInfos.ImageInfo

if modelarts.FlavorInfos == nil {
json.Unmarshal([]byte(setting.FlavorInfos), &modelarts.FlavorInfos)
}
ctx.Data["flavors"] = modelarts.FlavorInfos.FlavorInfo
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeDebug))
prepareCloudbrainTwoDebugSpecs(ctx)

ctx.Data["datasetType"] = models.TypeCloudBrainTwo

@@ -155,6 +152,16 @@ func notebookNewDataPrepare(ctx *context.Context) error {
return nil
}

// prepareCloudbrainTwoDebugSpecs loads the NPU debug-job specifications
// available to the current user on the cloudbrain-two cluster and exposes
// them to the template as "Specs".
func prepareCloudbrainTwoDebugSpecs(ctx *context.Context) {
	specs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
		JobType:         models.JobTypeDebug,
		ComputeResource: models.NPU,
		Cluster:         models.OpenICluster,
		AiCenterCode:    models.AICenterOfCloudBrainTwo,
	})
	ctx.Data["Specs"] = specs
}

func NotebookCreate(ctx *context.Context, form auth.CreateModelArtsNotebookForm) {
ctx.Data["PageIsNotebook"] = true
jobName := form.JobName
@@ -205,7 +212,6 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
jobName := util.ConvertDisplayJobNameToJobName(displayJobName)
uuid := form.Attachment
description := form.Description
flavor := form.Flavor
imageId := form.ImageId
repo := ctx.Repo.Repository

@@ -241,14 +247,17 @@ func Notebook2Create(ctx *context.Context, form auth.CreateModelArtsNotebookForm
}
}

errStr := checkModelArtsSpecialPool(ctx, flavor, string(models.JobTypeDebug))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeDebug,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
notebookNewDataPrepare(ctx)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsNotebookNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsNotebookNew, &form)
return
}

err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, flavor, imageId)
err = modelarts.GenerateNotebook2(ctx, displayJobName, jobName, uuid, description, imageId, spec)
if err != nil {
log.Error("GenerateNotebook2 failed, %v", err, ctx.Data["MsgID"])
notebookNewDataPrepare(ctx)
@@ -728,14 +737,7 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info

setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -753,6 +755,16 @@ func trainJobNewDataPrepare(ctx *context.Context) error {
return nil
}

// prepareCloudbrainTwoTrainSpecs loads the NPU train-job specifications
// available to the current user on the cloudbrain-two cluster and exposes
// them to the template as "Specs".
func prepareCloudbrainTwoTrainSpecs(ctx *context.Context) {
	// Renamed from the copy-pasted "noteBookSpecs": these are train specs.
	trainSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
		JobType:         models.JobTypeTrain,
		ComputeResource: models.NPU,
		Cluster:         models.OpenICluster,
		AiCenterCode:    models.AICenterOfCloudBrainTwo,
	})
	ctx.Data["Specs"] = trainSpecs
}

func setSpecBySpecialPoolConfig(ctx *context.Context, jobType string) {
modelarts.InitSpecialPool()

@@ -835,13 +847,7 @@ func trainJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArts
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)

configList, err := getConfigList(modelarts.PerPage, 1, modelarts.SortByCreateTime, "desc", "", modelarts.ConfigTypeCustom)
if err != nil {
@@ -1020,13 +1026,7 @@ func versionErrorDataPrepare(ctx *context.Context, form auth.CreateModelArtsTrai
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}
ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeTrain))
prepareCloudbrainTwoTrainSpecs(ctx)

var Parameters modelarts.Parameters
if err = json.Unmarshal([]byte(form.Params), &Parameters); err != nil {
@@ -1079,7 +1079,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
workServerNumber := form.WorkServerNumber
engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
//isSaveParam := form.IsSaveParam
@@ -1117,10 +1116,14 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
trainJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobNew, &form)
return
}
//Determine whether the task name of the task in the project is duplicated
@@ -1283,7 +1286,6 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
@@ -1299,6 +1301,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm)
VersionCount: VersionCount,
TotalVersionCount: modelarts.TotalVersionCount,
DatasetName: datasetNames,
Spec: spec,
}
userCommand, userImageUrl := getUserCommand(engineID, req)
req.UserCommand = userCommand
@@ -1384,7 +1387,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
workServerNumber := form.WorkServerNumber
engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
//isSaveParam := form.IsSaveParam
@@ -1414,10 +1416,14 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
return
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeTrain,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
versionErrorDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsTrainJobVersionNew, &form)
return
}

@@ -1571,7 +1577,6 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile,
TrainUrl: outputObsPath,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
IsLatestVersion: isLatestVersion,
EngineID: int64(engineID),
@@ -1588,6 +1593,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ
PreVersionName: PreVersionName,
TotalVersionCount: latestTask.TotalVersionCount + 1,
DatasetName: datasetNames,
Spec: spec,
}
userCommand, userImageUrl := getUserCommand(engineID, req)
req.UserCommand = userCommand
@@ -2016,7 +2022,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
workServerNumber := form.WorkServerNumber
engineID := form.EngineID
bootFile := strings.TrimSpace(form.BootFile)
flavorCode := form.Flavor
params := form.Params
poolID := form.PoolID
repo := ctx.Repo.Repository
@@ -2078,13 +2083,16 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
}
}

errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference))
if errStr != "" {
spec, err := resource.GetAndCheckSpec(ctx.User.ID, form.SpecId, models.FindSpecsOptions{
JobType: models.JobTypeInference,
ComputeResource: models.NPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainTwo})
if err != nil || spec == nil {
inferenceJobErrorNewDataPrepare(ctx, form)
ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)
ctx.RenderWithErr("Resource specification not available", tplModelArtsInferenceJobNew, &form)
return
}

//todo: del the codeLocalPath
_, err = ioutil.ReadDir(codeLocalPath)
if err == nil {
@@ -2170,7 +2178,6 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference
BootFileUrl: codeObsPath + bootFile,
BootFile: bootFile,
TrainUrl: trainUrl,
FlavorCode: flavorCode,
WorkServerNumber: workServerNumber,
EngineID: int64(engineID),
LogUrl: logObsPath,
@@ -2369,14 +2376,7 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
}
ctx.Data["engine_versions"] = versionInfos.Version

var flavorInfos modelarts.Flavor
if err = json.Unmarshal([]byte(setting.TrainJobFLAVORINFOS), &flavorInfos); err != nil {
ctx.ServerError("json.Unmarshal failed:", err)
return err
}

ctx.Data["flavor_infos"] = flavorInfos.Info
setSpecBySpecialPoolConfig(ctx, string(models.JobTypeInference))
prepareCloudbrainTwoInferenceSpecs(ctx)

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName
@@ -2407,6 +2407,16 @@ func inferenceJobNewDataPrepare(ctx *context.Context) error {
return nil
}

// prepareCloudbrainTwoInferenceSpecs loads the NPU inference-job
// specifications available to the current user on the cloudbrain-two cluster
// and exposes them to the template as "Specs".
func prepareCloudbrainTwoInferenceSpecs(ctx *context.Context) {
	// Renamed from the copy-pasted "noteBookSpecs": these are inference specs.
	inferenceSpecs, _ := resource.FindAvailableSpecs(ctx.User.ID, models.FindSpecsOptions{
		JobType:         models.JobTypeInference,
		ComputeResource: models.NPU,
		Cluster:         models.OpenICluster,
		AiCenterCode:    models.AICenterOfCloudBrainTwo,
	})
	ctx.Data["Specs"] = inferenceSpecs
}

func inferenceJobErrorNewDataPrepare(ctx *context.Context, form auth.CreateModelArtsInferenceJobForm) error {
ctx.Data["PageIsCloudBrain"] = true



+ 263
- 1
services/cloudbrain/resource/resource_specification.go View File

@@ -2,12 +2,17 @@ package resource

import (
"code.gitea.io/gitea/models"
"code.gitea.io/gitea/modules/cloudbrain"
"code.gitea.io/gitea/modules/grampus"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/routers/response"
"code.gitea.io/gitea/services/admin/operate_log"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
)

func AddResourceSpecification(doerId int64, req models.ResourceSpecificationReq) error {
@@ -186,7 +191,7 @@ func AddSpecOperateLog(doerId int64, operateType string, newValue, oldValue *mod
}

func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.Specification, error) {
r, err := models.FindAvailableSpecs(opts)
r, err := models.FindSpecs(opts)
if err != nil {
log.Error("FindAvailableSpecs error.%v", err)
return nil, err
@@ -270,3 +275,260 @@ func GetCloudbrainSpec(cloudbrainId int64) (*models.Specification, error) {
}
return c.ConvertToSpecification(), nil
}

// RefreshHistorySpec backfills cloudbrain_spec records for historic tasks
// that predate the specification table. When scopeAll is true every task
// without a spec record is processed and ids is ignored; otherwise only the
// listed task ids are processed. Returns (total candidates, successes, error).
func RefreshHistorySpec(scopeAll bool, ids []int64) (int64, int64, error) {
	var success int64
	var total int64

	if !scopeAll {
		// Explicit id-list mode: nothing to do when no ids were supplied.
		if ids == nil || len(ids) == 0 {
			return 0, 0, nil
		}
		total = int64(len(ids))
		// Includes soft-deleted tasks so history stays complete.
		tasks, err := models.GetCloudbrainWithDeletedByIDs(ids)
		if err != nil {
			return total, 0, err
		}
		for _, task := range tasks {
			err = RefreshOneHistorySpec(task)
			if err != nil {
				// Best-effort: log and continue with the remaining tasks.
				log.Error("RefreshOneHistorySpec error.%v", err)
				continue
			}
			success++
		}

	} else {
		page := 1
		pageSize := 100
		n, err := models.CountNoSpecHistoricTask()
		if err != nil {
			log.Error("FindNoSpecHistoricTask CountNoSpecHistoricTask error. e=%v", err)
			return 0, 0, err
		}
		total = n
		// Hard cap of 1000 iterations (at most 100k tasks) guards against an
		// endless loop.
		// NOTE(review): page is never incremented. Presumably intentional:
		// each successful refresh removes the task from the "no spec" result
		// set, so requesting page 1 again yields the next batch — but tasks
		// that keep failing are re-fetched and retried until one of the break
		// conditions fires. TODO confirm.
		for i := 0; i < 1000; i++ {
			list, err := models.FindNoSpecHistoricTask(page, pageSize)
			if err != nil {
				log.Error("FindNoSpecHistoricTask error.page=%d pageSize=%d e=%v", page, pageSize, err)
				return total, success, err
			}
			if len(list) == 0 {
				log.Info("RefreshHistorySpec. list is empty")
				break
			}
			for _, task := range list {
				// Per-task pause, presumably to rate-limit the remote spec
				// lookups — TODO confirm.
				time.Sleep(1 * time.Second)
				err = RefreshOneHistorySpec(task)
				if err != nil {
					log.Error("RefreshOneHistorySpec error.%v", err)
					continue
				}
				success++
			}
			// A short page means we have drained the result set.
			if len(list) < pageSize {
				log.Info("RefreshHistorySpec. list < pageSize")
				break
			}
		}
	}
	return total, success, nil

}

// RefreshOneHistorySpec resolves the resource specification used by a single
// historic task and persists it as a cloudbrain_spec snapshot.
// Only cloudbrain-one tasks are currently resolvable; every other task type
// falls through with a nil spec and yields a "find spec failed" error.
func RefreshOneHistorySpec(task *models.Cloudbrain) error {
	var (
		spec *models.Specification
		err  error
	)
	if task.Type == models.TypeCloudBrainOne {
		spec, err = getCloudbrainOneSpec(task)
	}
	if err != nil {
		log.Error("find spec error,task.ID=%d err=%v", task.ID, err)
		return err
	}
	if spec == nil {
		log.Error("find spec failed,task.ID=%d", task.ID)
		return errors.New("find spec failed")
	}
	return InsertCloudbrainSpec(task.ID, spec)
}

// getCloudbrainOneSpec resolves the resource specification a historic
// cloudbrain-one task ran with. It first asks the cloudbrain-one service for
// the live job record; if the job is no longer known remotely it falls back
// to the resource-spec tables bundled in the app configuration. A matching
// Specification row is then looked up, and auto-created (off-shelf) when
// missing. Returns (nil, nil) when the spec cannot be determined
// unambiguously.
func getCloudbrainOneSpec(task *models.Cloudbrain) (*models.Specification, error) {
	//find from remote
	result, err := cloudbrain.GetJob(task.JobID)
	if err != nil {
		log.Error("getCloudbrainOneSpec error. %v", err)
		return nil, err
	}
	if result != nil {
		// The job still exists remotely: rebuild the spec from the
		// actually-allocated resources in the job result payload.
		jobRes, _ := models.ConvertToJobResultPayload(result.Payload)
		memSize, _ := models.ParseMemSizeFromGrampus(jobRes.Resource.Memory)
		// Historic records stored "CPU/GPU"; normalize to the GPU constant.
		if task.ComputeResource == "CPU/GPU" {
			task.ComputeResource = models.GPU
		}
		// NOTE: despite the name, shmMB holds the /1024-converted value
		// (GiB, assuming ShmMB is MiB — TODO confirm the source unit).
		var shmMB float32
		if jobRes.Config.TaskRoles != nil && len(jobRes.Config.TaskRoles) > 0 {
			shmMB = float32(jobRes.Config.TaskRoles[0].ShmMB) / 1024
		}

		opt := models.FindSpecsOptions{
			ComputeResource: task.ComputeResource,
			Cluster:         models.OpenICluster,
			AiCenterCode:    models.AICenterOfCloudBrainOne,
			QueueCode:       task.GpuQueue,
			AccCardsNum:     jobRes.Resource.NvidiaComGpu,
			UseAccCardsNum:  true,
			CpuCores:        jobRes.Resource.CPU,
			UseCpuCores:     true,
			MemGiB:          memSize,
			UseMemGiB:       memSize > 0,
			ShareMemGiB:     shmMB,
			UseShareMemGiB:  shmMB > 0,
			RequestAll:      true,
		}
		specs, err := models.FindSpecs(opt)
		if err != nil {
			log.Error("getCloudbrainOneSpec from remote error,%v", err)
			return nil, err
		}
		if len(specs) == 1 {
			return specs[0], nil
		}
		if len(specs) == 0 {
			// No matching spec on record: create the queue/spec pair
			// (off-shelf) so the historic task can reference it.
			s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
			if err != nil {
				log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
				return nil, nil
			}
			return s, nil
		}
		if len(specs) > 1 {
			// Ambiguous match: refuse to guess which spec was used.
			log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
			return nil, nil
		}

	} else {
		//find from config
		// The job is gone remotely: look up the configured resource spec
		// by the ResourceSpecId stored on the task. The spec list to search
		// depends on the job type (train / inference / other).
		var specConfig *models.ResourceSpec
		hasSpec := false
		if task.JobType == string(models.JobTypeTrain) {
			// Lazily parse the config JSON on first use.
			if cloudbrain.TrainResourceSpecs == nil {
				json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
			}
			for _, tmp := range cloudbrain.TrainResourceSpecs.ResourceSpec {
				if tmp.Id == task.ResourceSpecId {
					hasSpec = true
					specConfig = tmp
					break
				}
			}
		} else if task.JobType == string(models.JobTypeInference) {
			if cloudbrain.InferenceResourceSpecs == nil {
				json.Unmarshal([]byte(setting.InferenceResourceSpecs), &cloudbrain.InferenceResourceSpecs)
			}
			for _, tmp := range cloudbrain.InferenceResourceSpecs.ResourceSpec {
				if tmp.Id == task.ResourceSpecId {
					hasSpec = true
					specConfig = tmp
					break
				}
			}
		} else {
			if cloudbrain.ResourceSpecs == nil {
				json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs)
			}
			for _, tmp := range cloudbrain.ResourceSpecs.ResourceSpec {
				if tmp.Id == task.ResourceSpecId {
					hasSpec = true
					specConfig = tmp
					break

				}
			}
		}
		// Special (dedicated) pools carry their own spec lists; search them
		// when the standard config did not contain the id.
		if !hasSpec && cloudbrain.SpecialPools != nil {

			for _, specialPool := range cloudbrain.SpecialPools.Pools {

				if specialPool.ResourceSpec != nil {

					for _, spec := range specialPool.ResourceSpec {
						if task.ResourceSpecId == spec.Id {
							hasSpec = true
							specConfig = spec
							break
						}
					}
				}
			}
		}
		if specConfig == nil {
			log.Error("getCloudbrainOneSpec from config failed,task.ResourceSpecId=%d", task.ResourceSpecId)
			return nil, nil
		}
		opt := models.FindSpecsOptions{
			JobType:         models.JobType(task.JobType),
			ComputeResource: task.ComputeResource,
			Cluster:         models.OpenICluster,
			AiCenterCode:    models.AICenterOfCloudBrainOne,
			QueueCode:       task.GpuQueue,
			AccCardsNum:     specConfig.GpuNum,
			UseAccCardsNum:  true,
			// NOTE(review): CpuCores is filled from GpuNum — this looks like
			// a copy-paste slip (the remote branch above uses the CPU count
			// here); the CPU-core field of ResourceSpec was probably
			// intended. TODO confirm against models.ResourceSpec.
			CpuCores:        specConfig.GpuNum,
			UseCpuCores:     true,
			MemGiB:          float32(specConfig.MemMiB) / 1024,
			UseMemGiB:       true,
			ShareMemGiB:     float32(specConfig.ShareMemMiB) / 1024,
			UseShareMemGiB:  true,
			RequestAll:      true,
		}
		specs, err := models.FindSpecs(opt)
		if err != nil {
			log.Error("getCloudbrainOneSpec from config error,%v", err)
			return nil, err
		}
		if len(specs) > 1 {
			log.Error("Too many results matched.size=%d opt=%+v", len(specs), opt)
			return nil, nil
		}
		if len(specs) == 0 {
			// Auto-create the queue/spec pair (off-shelf) when missing.
			s, err := InitQueueAndSpec(opt, "云脑一", "处理历史云脑任务时自动添加")
			if err != nil {
				log.Error("getCloudbrainOneSpec InitQueueAndSpec error.err=%v", err)
				return nil, nil
			}
			return s, nil
		}
		return specs[0], nil
	}
	// Unreachable in practice (both branches return), kept for the compiler.
	return nil, nil

}

// RefreshCloudbrainTwoSpec refreshes the spec snapshot for a cloudbrain-two
// task.
// TODO: not yet implemented; currently a no-op that always returns nil.
func RefreshCloudbrainTwoSpec(task *models.Cloudbrain) error {
	return nil
}

// RefreshC2NetSpec refreshes the spec snapshot for a C2Net (grampus) task.
// TODO: not yet implemented; currently a no-op that always returns nil.
func RefreshC2NetSpec(task *models.Cloudbrain) error {
	return nil
}

// InitQueueAndSpec creates the resource queue described by opt together with
// a matching resource specification (created off-shelf) and returns the
// resulting Specification. aiCenterName and remark annotate the new queue.
func InitQueueAndSpec(opt models.FindSpecsOptions, aiCenterName string, remark string) (*models.Specification, error) {
	queue := models.ResourceQueue{
		QueueCode:       opt.QueueCode,
		Cluster:         opt.Cluster,
		AiCenterCode:    opt.AiCenterCode,
		AiCenterName:    aiCenterName,
		ComputeResource: opt.ComputeResource,
		AccCardType:     models.GetCloudbrainOneAccCardType(opt.QueueCode),
		Remark:          remark,
	}
	spec := models.ResourceSpecification{
		AccCardsNum: opt.AccCardsNum,
		CpuCores:    opt.CpuCores,
		MemGiB:      opt.MemGiB,
		GPUMemGiB:   opt.GPUMemGiB,
		ShareMemGiB: opt.ShareMemGiB,
		// New specs start off-shelf; an admin must publish them explicitly.
		Status: models.SpecOffShelf,
	}
	return models.InitQueueAndSpec(queue, spec)
}

Loading…
Cancel
Save