Browse Source

Merge remote-tracking branch 'origin/V20220718' into point-v2

# Conflicts:
#	routers/repo/cloudbrain.go
tags/v1.22.9.2^2
chenyifan01 3 years ago
parent
commit
518fcc5406
6 changed files with 180 additions and 46 deletions
  1. +6
    -5
      models/cloudbrain.go
  2. +57
    -0
      modules/cloudbrain/cloudbrain.go
  3. +3
    -0
      modules/setting/setting.go
  4. +20
    -4
      routers/api/v1/repo/cloudbrain_dashboard.go
  5. +94
    -1
      routers/repo/cloudbrain.go
  6. +0
    -36
      templates/repo/modelarts/trainjob/version_new.tmpl

+ 6
- 5
models/cloudbrain.go View File

@@ -611,11 +611,12 @@ type SpecialPools struct {
Pools []*SpecialPool `json:"pools"`
}
type SpecialPool struct {
Org string `json:"org"`
Type string `json:"type"`
IsExclusive bool `json:"isExclusive"`
Pool []*GpuInfo `json:"pool"`
JobType []string `json:"jobType"`
Org string `json:"org"`
Type string `json:"type"`
IsExclusive bool `json:"isExclusive"`
Pool []*GpuInfo `json:"pool"`
JobType []string `json:"jobType"`
ResourceSpec []*ResourceSpec `json:"resourceSpecs"`
}

type ImageInfosModelArts struct {


+ 57
- 0
modules/cloudbrain/cloudbrain.go View File

@@ -42,6 +42,7 @@ const (
// Package-level holders for configuration decoded from JSON settings.
var (
// ResourceSpecs is one of the spec lists searched by GenerateTask when
// resolving a request's ResourceSpecId.
ResourceSpecs *models.ResourceSpecs
// TrainResourceSpecs is the other spec list searched by GenerateTask.
// NOTE(review): presumably the list used for train jobs — confirm against
// the branch condition in GenerateTask.
TrainResourceSpecs *models.ResourceSpecs
// SpecialPools is filled by InitSpecialPool from setting.SpecialPools and
// stays nil while that setting is empty.
SpecialPools *models.SpecialPools
)

type GenerateCloudBrainTaskReq struct {
@@ -222,6 +223,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
for _, spec := range TrainResourceSpecs.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}
} else {
@@ -231,10 +233,29 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error {
for _, spec := range ResourceSpecs.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}

}
//如果没有匹配到spec信息,尝试从专属资源池获取
if resourceSpec == nil && SpecialPools != nil {
for _, specialPool := range SpecialPools.Pools {
if resourceSpec != nil {
break
}
if specialPool.ResourceSpec != nil {
if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) {
for _, spec := range specialPool.ResourceSpec {
if req.ResourceSpecId == spec.Id {
resourceSpec = spec
break
}
}
}
}
}
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"])
@@ -538,3 +559,39 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e

return nil
}

// InitSpecialPool fills the package-level SpecialPools by decoding the JSON
// text held in setting.SpecialPools. It is a no-op when SpecialPools is
// already non-nil or when the setting is the empty string.
func InitSpecialPool() {
	if SpecialPools != nil {
		return
	}
	if setting.SpecialPools == "" {
		return
	}
	raw := []byte(setting.SpecialPools)
	// NOTE(review): the decode error is discarded here, so a malformed
	// setting is not reported by this function.
	_ = json.Unmarshal(raw, &SpecialPools)
}

// IsResourceSpecInSpecialPool reports whether resourceSpecId is accepted by a
// pool whose resource spec list is resourceSpecs.
//
// It returns true when resourceSpecs is nil or empty. Otherwise it returns
// true only if some entry's Id equals resourceSpecId. Nil entries are skipped
// instead of being dereferenced.
func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool {
	// len of a nil slice is 0, so no separate nil comparison is needed.
	if len(resourceSpecs) == 0 {
		return true
	}
	for _, spec := range resourceSpecs {
		if spec != nil && spec.Id == resourceSpecId {
			return true
		}
	}
	return false
}

// IsQueueInSpecialtPool reports whether any entry of pool has a Queue field
// equal to queue. A nil or empty pool yields false.
func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool {
	for i := 0; i < len(pool); i++ {
		if pool[i].Queue != queue {
			continue
		}
		return true
	}
	return false
}

// IsElementExist reports whether str equals at least one element of s.
// A nil or empty s yields false.
func IsElementExist(s []string, str string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] != str {
			continue
		}
		return true
	}
	return false
}

+ 3
- 0
modules/setting/setting.go View File

@@ -463,6 +463,7 @@ var (
CBCodePathPrefix string
JobType string
GpuTypes string
SpecialPools string
DebugServerHost string
ResourceSpecs string
MaxDuration int64
@@ -1321,6 +1322,8 @@ func NewContext() {
MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400)
TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("")
TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("")
SpecialPools = sec.Key("SPECIAL_POOL").MustString("")
MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5)

sec = Cfg.Section("benchmark")


+ 20
- 4
routers/api/v1/repo/cloudbrain_dashboard.go View File

@@ -752,10 +752,26 @@ func GetCloudbrainsDetailData(ctx *context.Context) {
taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias
}
if ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) {
WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
if ciTasks[i].Cloudbrain.DeletedAt != nilTime {
WaitTimeInt := ciTasks[i].Cloudbrain.UpdatedUnix.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
} else {
if ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 {
WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
} else {
WaitTimeInt := ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}
}
}
} else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 {
WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()


+ 94
- 1
routers/repo/cloudbrain.go View File

@@ -17,6 +17,8 @@ import (
"time"
"unicode/utf8"

"code.gitea.io/gitea/modules/grampus"

"code.gitea.io/gitea/modules/timeutil"
"github.com/unknwon/i18n"

@@ -150,6 +152,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {

ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType

cloudbrain.InitSpecialPool()

if gpuInfos == nil {
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
}
@@ -179,6 +183,45 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error {
json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs)
}
ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec

if cloudbrain.SpecialPools != nil {
var debugGpuTypes []*models.GpuInfo
var trainGpuTypes []*models.GpuInfo

for _, pool := range cloudbrain.SpecialPools.Pools {
org, _ := models.GetOrgByName(pool.Org)
if org != nil {
isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID)
if isOrgMember {
for _, jobType := range pool.JobType {
if jobType == string(models.JobTypeDebug) {
debugGpuTypes = append(debugGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["resource_specs"] = pool.ResourceSpec
}
} else if jobType == string(models.JobTypeTrain) {
trainGpuTypes = append(trainGpuTypes, pool.Pool...)
if pool.ResourceSpec != nil {
ctx.Data["train_resource_specs"] = pool.ResourceSpec
}
}
}
break
}
}

}

if len(debugGpuTypes) > 0 {
ctx.Data["gpu_types"] = debugGpuTypes
}

if len(trainGpuTypes) > 0 {
ctx.Data["train_gpu_types"] = trainGpuTypes
}

}

ctx.Data["params"] = ""
ctx.Data["branchName"] = ctx.Repo.BranchName

@@ -225,6 +268,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
return
}

if jobType == string(models.JobTypeTrain) {
tpl = tplCloudBrainTrainJobNew
}

tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName)
if err == nil {
if len(tasks) != 0 {
@@ -290,6 +337,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
command = commandTrain
}

errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId)

if errStr != "" {
cloudBrainNewDataPrepare(ctx)
ctx.RenderWithErr(errStr, tpl, &form)
return
}

if branchName == "" {
branchName = cloudbrain.DefaultBranchName
}
@@ -342,6 +397,42 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) {
}
}

// checkCloudBrainSpecialPool checks whether the parameters submitted by the
// user are compatible with the configured special resource pools.
//
// It returns an empty string when the check passes and a translated error
// message otherwise. The check passes when no special pools are configured,
// when no pool matches the given jobType, queue and resourceSpecId, or when
// the user is a member of the organization of a matching pool.
func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string {
	if cloudbrain.SpecialPools == nil {
		// No special pools configured: nothing to verify.
		return ""
	}

	poolMatched := false
	userIsMember := false

	for _, p := range cloudbrain.SpecialPools.Pools {
		if !cloudbrain.IsElementExist(p.JobType, jobType) {
			continue
		}
		if !cloudbrain.IsQueueInSpecialtPool(p.Pool, queue) {
			continue
		}
		if !cloudbrain.IsResourceSpecInSpecialPool(p.ResourceSpec, resourceSpecId) {
			continue
		}

		poolMatched = true
		org, _ := models.GetOrgByName(p.Org)
		if org == nil {
			continue
		}
		userIsMember, _ = models.IsOrganizationMember(org.ID, ctx.User.ID)
		if userIsMember {
			// The parameters match this pool and the user belongs to its
			// organization: the check passes.
			break
		}
	}

	// A pool matched but the user is not in the corresponding organization.
	// The UI already filters the selectable options, so requests coming from
	// the UI are not expected to reach this branch.
	if poolMatched && !userIsMember {
		return ctx.Tr("repo.grampus.no_operate_right")
	}

	// Either no pool matched, or a pool matched and the user is a member.
	return ""
}

func CloudBrainRestart(ctx *context.Context) {
var ID = ctx.Params(":id")
var resultCode = "0"
@@ -588,7 +679,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo
if task.TrainJobDuration == "" {
if task.Duration == 0 {
var duration int64
if task.Status == string(models.JobRunning) {
if task.Status == string(models.JobWaiting) {
duration = 0
} else if task.Status == string(models.JobRunning) {
duration = time.Now().Unix() - int64(task.CreatedUnix)
} else {
duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix)


+ 0
- 36
templates/repo/modelarts/trainjob/version_new.tmpl View File

@@ -446,24 +446,6 @@
]

},
work_server_number: {
identifier : 'work_server_number',
rules: [
{
type : 'integer[1..25]',
prompt : '计算节点需要在1-25之间,请您键入正确的值'
}
]
},
run_para_list:{
identifier : 'run_para_list',
rules: [
{
type: 'maxLength[255]',
prompt : '所有字符最长不超过255个字符。'
}
]
},
},
})

@@ -512,24 +494,6 @@
]

},
work_server_number: {
identifier : 'work_server_number',
rules: [
{
type : 'integer[1..25]',
prompt : '计算节点需要在1-25之间,请您键入正确的值'
}
]
},
run_para_list:{
identifier : 'run_para_list',
rules: [
{
type: 'maxLength[255]',
prompt : '所有字符最长不超过255个字符。'
}
]
},
},
onSuccess: function(){
// $('.ui.page.dimmer').dimmer('show')


Loading…
Cancel
Save