| @@ -1947,11 +1947,18 @@ func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { | |||
| return err | |||
| } | |||
| if _, err = sess.NoAutoTime().Insert(new); err != nil { | |||
| if _, err = sess.NoAutoTime().InsertOne(new); err != nil { | |||
| sess.Rollback() | |||
| return err | |||
| } | |||
| if new.Spec != nil { | |||
| if _, err = sess.Insert(NewCloudBrainSpec(new.ID, *new.Spec)); err != nil { | |||
| sess.Rollback() | |||
| return err | |||
| } | |||
| } | |||
| if err = sess.Commit(); err != nil { | |||
| return err | |||
| } | |||
| @@ -409,25 +409,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy | |||
| func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { | |||
| jobName := task.JobName | |||
| var resourceSpec *models.ResourceSpec | |||
| if ResourceSpecs == nil { | |||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||
| } | |||
| for _, spec := range ResourceSpecs.ResourceSpec { | |||
| if task.ResourceSpecId == spec.Id { | |||
| resourceSpec = spec | |||
| } | |||
| } | |||
| // If no matching spec was found, try to get one from the special resource pools | |||
| if resourceSpec == nil && SpecialPools != nil { | |||
| resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||
| } | |||
| if resourceSpec == nil { | |||
| log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | |||
| return errors.New("no such resourceSpec") | |||
| } | |||
| spec := task.Spec | |||
| var datasetInfos map[string]models.DatasetInfo | |||
| if task.Uuid != "" { | |||
| var err error | |||
| @@ -509,10 +491,10 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
| TaskNumber: 1, | |||
| MinSucceededTaskCount: 1, | |||
| MinFailedTaskCount: 1, | |||
| CPUNumber: resourceSpec.CpuNum, | |||
| GPUNumber: resourceSpec.GpuNum, | |||
| MemoryMB: resourceSpec.MemMiB, | |||
| ShmMB: resourceSpec.ShareMemMiB, | |||
| CPUNumber: spec.CpuCores, | |||
| GPUNumber: spec.AccCardsNum, | |||
| MemoryMB: int(spec.MemGiB * 1024), | |||
| ShmMB: int(spec.ShareMemGiB * 1024), | |||
| Command: GetCloudbrainDebugCommand(), //Command, | |||
| NeedIBDevice: false, | |||
| IsMainRole: false, | |||
| @@ -550,6 +532,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||
| CreatedUnix: createTime, | |||
| UpdatedUnix: createTime, | |||
| BranchName: task.BranchName, | |||
| Spec: spec, | |||
| } | |||
| err = models.RestartCloudbrain(task, newTask) | |||
| @@ -560,34 +560,25 @@ func CloudBrainRestart(ctx *context.Context) { | |||
| break | |||
| } | |||
| var hasSameResource bool | |||
| if gpuInfos == nil { | |||
| json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) | |||
| } | |||
| for _, resourceType := range gpuInfos.GpuInfo { | |||
| if resourceType.Queue == task.GpuQueue { | |||
| hasSameResource = true | |||
| break | |||
| } | |||
| } | |||
| if !hasSameResource && cloudbrain.SpecialPools != nil { | |||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||
| cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||
| for _, pool := range specialPool.Pool { | |||
| if pool.Queue == task.GpuQueue { | |||
| hasSameResource = true | |||
| } | |||
| } | |||
| } | |||
| specOld, err := resource.GetCloudbrainSpec(task.ID) | |||
| if err != nil { | |||
| log.Error("CloudBrainRestart GetCloudbrainSpec error.task.id = %d", task.ID) | |||
| resultCode = "-1" | |||
| errorMsg = "Resource specification not support any more" | |||
| break | |||
| } | |||
| if !hasSameResource { | |||
| log.Error("has no same resource, can not restart", ctx.Data["MsgID"]) | |||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, specOld.ID, models.FindSpecsOptions{ | |||
| JobType: models.JobType(task.JobType), | |||
| ComputeResource: models.GPU, | |||
| Cluster: models.OpenICluster, | |||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||
| if err != nil || spec == nil { | |||
| log.Error("CloudBrainRestart GetAndCheckSpec error.task.id = %d", task.ID) | |||
| resultCode = "-1" | |||
| errorMsg = "the job's version is too old and can not be restarted" | |||
| errorMsg = "Resource specification not support any more" | |||
| break | |||
| } | |||
| task.Spec = spec | |||
| count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) | |||
| if err != nil { | |||