Browse Source

#2701

update
tags/v1.22.9.1^2^2
chenyifan01 3 years ago
parent
commit
0e4d8844b8
3 changed files with 29 additions and 48 deletions
  1. +8
    -1
      models/cloudbrain.go
  2. +6
    -23
      modules/cloudbrain/cloudbrain.go
  3. +15
    -24
      routers/repo/cloudbrain.go

+ 8
- 1
models/cloudbrain.go View File

@@ -1947,11 +1947,18 @@ func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) {
return err return err
} }


if _, err = sess.NoAutoTime().Insert(new); err != nil {
if _, err = sess.NoAutoTime().InsertOne(new); err != nil {
sess.Rollback() sess.Rollback()
return err return err
} }


if new.Spec != nil {
if _, err = sess.Insert(NewCloudBrainSpec(new.ID, *new.Spec)); err != nil {
sess.Rollback()
return err
}
}

if err = sess.Commit(); err != nil { if err = sess.Commit(); err != nil {
return err return err
} }


+ 6
- 23
modules/cloudbrain/cloudbrain.go View File

@@ -409,25 +409,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy
func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error {
jobName := task.JobName jobName := task.JobName


var resourceSpec *models.ResourceSpec
if ResourceSpecs == nil {
json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs)
}
for _, spec := range ResourceSpecs.ResourceSpec {
if task.ResourceSpecId == spec.Id {
resourceSpec = spec
}
}

//如果没有匹配到spec信息,尝试从专属资源池获取
if resourceSpec == nil && SpecialPools != nil {
resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId)
}

if resourceSpec == nil {
log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"])
return errors.New("no such resourceSpec")
}
spec := task.Spec
var datasetInfos map[string]models.DatasetInfo var datasetInfos map[string]models.DatasetInfo
if task.Uuid != "" { if task.Uuid != "" {
var err error var err error
@@ -509,10 +491,10 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
TaskNumber: 1, TaskNumber: 1,
MinSucceededTaskCount: 1, MinSucceededTaskCount: 1,
MinFailedTaskCount: 1, MinFailedTaskCount: 1,
CPUNumber: resourceSpec.CpuNum,
GPUNumber: resourceSpec.GpuNum,
MemoryMB: resourceSpec.MemMiB,
ShmMB: resourceSpec.ShareMemMiB,
CPUNumber: spec.CpuCores,
GPUNumber: spec.AccCardsNum,
MemoryMB: int(spec.MemGiB * 1024),
ShmMB: int(spec.ShareMemGiB * 1024),
Command: GetCloudbrainDebugCommand(), //Command, Command: GetCloudbrainDebugCommand(), //Command,
NeedIBDevice: false, NeedIBDevice: false,
IsMainRole: false, IsMainRole: false,
@@ -550,6 +532,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e
CreatedUnix: createTime, CreatedUnix: createTime,
UpdatedUnix: createTime, UpdatedUnix: createTime,
BranchName: task.BranchName, BranchName: task.BranchName,
Spec: spec,
} }


err = models.RestartCloudbrain(task, newTask) err = models.RestartCloudbrain(task, newTask)


+ 15
- 24
routers/repo/cloudbrain.go View File

@@ -560,34 +560,25 @@ func CloudBrainRestart(ctx *context.Context) {
break break
} }


var hasSameResource bool
if gpuInfos == nil {
json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos)
}
for _, resourceType := range gpuInfos.GpuInfo {
if resourceType.Queue == task.GpuQueue {
hasSameResource = true
break
}
}
if !hasSameResource && cloudbrain.SpecialPools != nil {

for _, specialPool := range cloudbrain.SpecialPools.Pools {
cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug))
for _, pool := range specialPool.Pool {
if pool.Queue == task.GpuQueue {
hasSameResource = true
}
}
}
specOld, err := resource.GetCloudbrainSpec(task.ID)
if err != nil {
log.Error("CloudBrainRestart GetCloudbrainSpec error.task.id = %d", task.ID)
resultCode = "-1"
errorMsg = "Resource specification not support any more"
break
} }

if !hasSameResource {
log.Error("has no same resource, can not restart", ctx.Data["MsgID"])
spec, err := resource.GetAndCheckSpec(ctx.User.ID, specOld.ID, models.FindSpecsOptions{
JobType: models.JobType(task.JobType),
ComputeResource: models.GPU,
Cluster: models.OpenICluster,
AiCenterCode: models.AICenterOfCloudBrainOne})
if err != nil || spec == nil {
log.Error("CloudBrainRestart GetAndCheckSpec error.task.id = %d", task.ID)
resultCode = "-1" resultCode = "-1"
errorMsg = "the job's version is too old and can not be restarted"
errorMsg = "Resource specification not support any more"
break break
} }
task.Spec = spec


count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug))
if err != nil { if err != nil {


Loading…
Cancel
Save