| @@ -1947,11 +1947,18 @@ func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { | |||||
| return err | return err | ||||
| } | } | ||||
| if _, err = sess.NoAutoTime().Insert(new); err != nil { | |||||
| if _, err = sess.NoAutoTime().InsertOne(new); err != nil { | |||||
| sess.Rollback() | sess.Rollback() | ||||
| return err | return err | ||||
| } | } | ||||
| if new.Spec != nil { | |||||
| if _, err = sess.Insert(NewCloudBrainSpec(new.ID, *new.Spec)); err != nil { | |||||
| sess.Rollback() | |||||
| return err | |||||
| } | |||||
| } | |||||
| if err = sess.Commit(); err != nil { | if err = sess.Commit(); err != nil { | ||||
| return err | return err | ||||
| } | } | ||||
| @@ -409,25 +409,7 @@ func GetWaitingCloudbrainCount(cloudbrainType int, computeResource string, jobTy | |||||
| func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { | func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) error { | ||||
| jobName := task.JobName | jobName := task.JobName | ||||
| var resourceSpec *models.ResourceSpec | |||||
| if ResourceSpecs == nil { | |||||
| json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) | |||||
| } | |||||
| for _, spec := range ResourceSpecs.ResourceSpec { | |||||
| if task.ResourceSpecId == spec.Id { | |||||
| resourceSpec = spec | |||||
| } | |||||
| } | |||||
| // If no matching spec was found, try to get one from the dedicated resource pool | |||||
| if resourceSpec == nil && SpecialPools != nil { | |||||
| resourceSpec = geMatchResourceSpec(task.JobType, task.GpuQueue, task.ResourceSpecId) | |||||
| } | |||||
| if resourceSpec == nil { | |||||
| log.Error("no such resourceSpecId(%d)", task.ResourceSpecId, ctx.Data["MsgID"]) | |||||
| return errors.New("no such resourceSpec") | |||||
| } | |||||
| spec := task.Spec | |||||
| var datasetInfos map[string]models.DatasetInfo | var datasetInfos map[string]models.DatasetInfo | ||||
| if task.Uuid != "" { | if task.Uuid != "" { | ||||
| var err error | var err error | ||||
| @@ -509,10 +491,10 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| TaskNumber: 1, | TaskNumber: 1, | ||||
| MinSucceededTaskCount: 1, | MinSucceededTaskCount: 1, | ||||
| MinFailedTaskCount: 1, | MinFailedTaskCount: 1, | ||||
| CPUNumber: resourceSpec.CpuNum, | |||||
| GPUNumber: resourceSpec.GpuNum, | |||||
| MemoryMB: resourceSpec.MemMiB, | |||||
| ShmMB: resourceSpec.ShareMemMiB, | |||||
| CPUNumber: spec.CpuCores, | |||||
| GPUNumber: spec.AccCardsNum, | |||||
| MemoryMB: int(spec.MemGiB * 1024), | |||||
| ShmMB: int(spec.ShareMemGiB * 1024), | |||||
| Command: GetCloudbrainDebugCommand(), //Command, | Command: GetCloudbrainDebugCommand(), //Command, | ||||
| NeedIBDevice: false, | NeedIBDevice: false, | ||||
| IsMainRole: false, | IsMainRole: false, | ||||
| @@ -550,6 +532,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| CreatedUnix: createTime, | CreatedUnix: createTime, | ||||
| UpdatedUnix: createTime, | UpdatedUnix: createTime, | ||||
| BranchName: task.BranchName, | BranchName: task.BranchName, | ||||
| Spec: spec, | |||||
| } | } | ||||
| err = models.RestartCloudbrain(task, newTask) | err = models.RestartCloudbrain(task, newTask) | ||||
| @@ -560,34 +560,25 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
| break | break | ||||
| } | } | ||||
| var hasSameResource bool | |||||
| if gpuInfos == nil { | |||||
| json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) | |||||
| } | |||||
| for _, resourceType := range gpuInfos.GpuInfo { | |||||
| if resourceType.Queue == task.GpuQueue { | |||||
| hasSameResource = true | |||||
| break | |||||
| } | |||||
| } | |||||
| if !hasSameResource && cloudbrain.SpecialPools != nil { | |||||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||||
| cloudbrain.IsElementExist(specialPool.JobType, string(models.JobTypeDebug)) | |||||
| for _, pool := range specialPool.Pool { | |||||
| if pool.Queue == task.GpuQueue { | |||||
| hasSameResource = true | |||||
| } | |||||
| } | |||||
| } | |||||
| specOld, err := resource.GetCloudbrainSpec(task.ID) | |||||
| if err != nil { | |||||
| log.Error("CloudBrainRestart GetCloudbrainSpec error.task.id = %d", task.ID) | |||||
| resultCode = "-1" | |||||
| errorMsg = "Resource specification not support any more" | |||||
| break | |||||
| } | } | ||||
| if !hasSameResource { | |||||
| log.Error("has no same resource, can not restart", ctx.Data["MsgID"]) | |||||
| spec, err := resource.GetAndCheckSpec(ctx.User.ID, specOld.ID, models.FindSpecsOptions{ | |||||
| JobType: models.JobType(task.JobType), | |||||
| ComputeResource: models.GPU, | |||||
| Cluster: models.OpenICluster, | |||||
| AiCenterCode: models.AICenterOfCloudBrainOne}) | |||||
| if err != nil || spec == nil { | |||||
| log.Error("CloudBrainRestart GetAndCheckSpec error.task.id = %d", task.ID) | |||||
| resultCode = "-1" | resultCode = "-1" | ||||
| errorMsg = "the job's version is too old and can not be restarted" | |||||
| errorMsg = "Resource specification not support any more" | |||||
| break | break | ||||
| } | } | ||||
| task.Spec = spec | |||||
| count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) | count, err := models.GetCloudbrainCountByUserID(ctx.User.ID, string(models.JobTypeDebug)) | ||||
| if err != nil { | if err != nil { | ||||