Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/1278
Reviewed-by: ychao_1983 <ychao_1983@sina.com>
tags/v1.22.1.1^2
| @@ -1210,3 +1210,28 @@ func GetCloudbrainTrainJobCountByUserID(userID int64) (int, error) { | |||||
| And("job_type = ? and user_id = ? and type = ?", JobTypeTrain, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) | And("job_type = ? and user_id = ? and type = ?", JobTypeTrain, userID, TypeCloudBrainTwo).Count(new(Cloudbrain)) | ||||
| return int(count), err | return int(count), err | ||||
| } | } | ||||
| func RestartCloudbrain(old *Cloudbrain, new *Cloudbrain) (err error) { | |||||
| sess := x.NewSession() | |||||
| defer sess.Close() | |||||
| if err = sess.Begin(); err != nil { | |||||
| return err | |||||
| } | |||||
| if _, err = sess.Delete(old); err != nil { | |||||
| sess.Rollback() | |||||
| return err | |||||
| } | |||||
| if _, err = sess.Insert(new); err != nil { | |||||
| sess.Rollback() | |||||
| return err | |||||
| } | |||||
| if err = sess.Commit(); err != nil { | |||||
| return err | |||||
| } | |||||
| return nil | |||||
| } | |||||
| @@ -82,7 +82,7 @@ func AdminOrOwnerOrJobCreaterRight(ctx *context.Context) { | |||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| job, err := models.GetCloudbrainByJobID(jobID) | job, err := models.GetCloudbrainByJobID(jobID) | ||||
| ctx.Cloudbrain = job | |||||
| if !isAdminOrOwnerOrJobCreater(ctx, job, err) { | if !isAdminOrOwnerOrJobCreater(ctx, job, err) { | ||||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | ||||
| @@ -94,6 +94,7 @@ func AdminOrJobCreaterRight(ctx *context.Context) { | |||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| job, err := models.GetCloudbrainByJobID(jobID) | job, err := models.GetCloudbrainByJobID(jobID) | ||||
| ctx.Cloudbrain = job | |||||
| if !isAdminOrJobCreater(ctx, job, err) { | if !isAdminOrJobCreater(ctx, job, err) { | ||||
| ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | ctx.NotFound(ctx.Req.URL.RequestURI(), nil) | ||||
| @@ -222,7 +223,7 @@ func GenerateTask(ctx *context.Context, jobName, image, command, uuid, codePath, | |||||
| return nil | return nil | ||||
| } | } | ||||
| func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { | |||||
| func RestartTask(ctx *context.Context, task *models.Cloudbrain, newJobID *string) error { | |||||
| dataActualPath := setting.Attachment.Minio.RealPath + | dataActualPath := setting.Attachment.Minio.RealPath + | ||||
| setting.Attachment.Minio.Bucket + "/" + | setting.Attachment.Minio.Bucket + "/" + | ||||
| setting.Attachment.Minio.BasePath + | setting.Attachment.Minio.BasePath + | ||||
| @@ -312,7 +313,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { | |||||
| }, | }, | ||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("CreateJob failed:", err.Error(), ctx.Data["MsgID"]) | |||||
| log.Error("CreateJob failed:%v", err.Error(), ctx.Data["MsgID"]) | |||||
| return err | return err | ||||
| } | } | ||||
| if jobResult.Code != Success { | if jobResult.Code != Success { | ||||
| @@ -321,14 +322,29 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain) error { | |||||
| } | } | ||||
| var jobID = jobResult.Payload["jobId"].(string) | var jobID = jobResult.Payload["jobId"].(string) | ||||
| task.JobID = jobID | |||||
| task.Status = string(models.JobWaiting) | |||||
| err = models.UpdateJob(task) | |||||
| newTask := &models.Cloudbrain{ | |||||
| Status: string(models.JobWaiting), | |||||
| UserID: task.UserID, | |||||
| RepoID: task.RepoID, | |||||
| JobID: jobID, | |||||
| JobName: task.JobName, | |||||
| SubTaskName: task.SubTaskName, | |||||
| JobType: task.JobType, | |||||
| Type: task.Type, | |||||
| Uuid: task.Uuid, | |||||
| Image: task.Image, | |||||
| GpuQueue: task.GpuQueue, | |||||
| ResourceSpecId: task.ResourceSpecId, | |||||
| ComputeResource: task.ComputeResource, | |||||
| } | |||||
| err = models.RestartCloudbrain(task, newTask) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("UpdateJob(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"]) | |||||
| log.Error("RestartCloudbrain(%s) failed:%v", jobName, err.Error(), ctx.Data["MsgID"]) | |||||
| return err | return err | ||||
| } | } | ||||
| *newJobID = jobID | |||||
| return nil | return nil | ||||
| } | } | ||||
| @@ -47,6 +47,7 @@ type Context struct { | |||||
| Repo *Repository | Repo *Repository | ||||
| Org *Organization | Org *Organization | ||||
| Cloudbrain *models.Cloudbrain | |||||
| } | } | ||||
| // IsUserSiteAdmin returns true if current user is a site admin | // IsUserSiteAdmin returns true if current user is a site admin | ||||
| @@ -251,17 +251,10 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| var resultCode = "0" | var resultCode = "0" | ||||
| var errorMsg = "" | var errorMsg = "" | ||||
| var status = "" | |||||
| var status = string(models.JobWaiting) | |||||
| task := ctx.Cloudbrain | |||||
| for { | for { | ||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error(), ctx.Data["MsgID"]) | |||||
| resultCode = "-1" | |||||
| errorMsg = "system error" | |||||
| break | |||||
| } | |||||
| if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) { | if task.Status != string(models.JobStopped) && task.Status != string(models.JobSucceeded) && task.Status != string(models.JobFailed) { | ||||
| log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | log.Error("the job(%s) is not stopped", task.JobName, ctx.Data["MsgID"]) | ||||
| resultCode = "-1" | resultCode = "-1" | ||||
| @@ -298,7 +291,7 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
| } | } | ||||
| } | } | ||||
| err = cloudbrain.RestartTask(ctx, task) | |||||
| err = cloudbrain.RestartTask(ctx, task, &jobID) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"]) | log.Error("RestartTask failed:%v", err.Error(), ctx.Data["MsgID"]) | ||||
| resultCode = "-1" | resultCode = "-1" | ||||
| @@ -306,9 +299,6 @@ func CloudBrainRestart(ctx *context.Context) { | |||||
| break | break | ||||
| } | } | ||||
| status = task.Status | |||||
| jobID = task.JobID | |||||
| break | break | ||||
| } | } | ||||
| @@ -369,46 +359,19 @@ func CloudBrainShow(ctx *context.Context) { | |||||
| } | } | ||||
| func CloudBrainDebug(ctx *context.Context) { | func CloudBrainDebug(ctx *context.Context) { | ||||
| var jobID = ctx.Params(":jobid") | |||||
| if !ctx.IsSigned { | |||||
| log.Error("the user has not signed in") | |||||
| ctx.Error(http.StatusForbidden, "", "the user has not signed in") | |||||
| return | |||||
| } | |||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||||
| return | |||||
| } | |||||
| debugUrl := setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName | |||||
| debugUrl := setting.DebugServerHost + "jpylab_" + ctx.Cloudbrain.JobID + "_" + ctx.Cloudbrain.SubTaskName | |||||
| ctx.Redirect(debugUrl) | ctx.Redirect(debugUrl) | ||||
| } | } | ||||
| func CloudBrainCommitImage(ctx *context.Context, form auth.CommitImageCloudBrainForm) { | func CloudBrainCommitImage(ctx *context.Context, form auth.CommitImageCloudBrainForm) { | ||||
| var jobID = ctx.Params(":jobid") | |||||
| if !ctx.IsSigned { | |||||
| log.Error("the user has not signed in") | |||||
| ctx.Error(http.StatusForbidden, "", "the user has not signed in") | |||||
| return | |||||
| } | |||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| ctx.JSON(200, map[string]string{ | |||||
| "result_code": "-1", | |||||
| "error_msg": "GetCloudbrainByJobID failed", | |||||
| }) | |||||
| return | |||||
| } | |||||
| err = cloudbrain.CommitImage(jobID, models.CommitImageParams{ | |||||
| Ip: task.ContainerIp, | |||||
| TaskContainerId: task.ContainerID, | |||||
| err := cloudbrain.CommitImage(ctx.Cloudbrain.JobID, models.CommitImageParams{ | |||||
| Ip: ctx.Cloudbrain.ContainerIp, | |||||
| TaskContainerId: ctx.Cloudbrain.ContainerID, | |||||
| ImageDescription: form.Description, | ImageDescription: form.Description, | ||||
| ImageTag: form.Tag, | ImageTag: form.Tag, | ||||
| }) | }) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("CommitImage(%s) failed:%v", task.JobName, err.Error(), ctx.Data["msgID"]) | |||||
| log.Error("CommitImage(%s) failed:%v", ctx.Cloudbrain.JobName, err.Error(), ctx.Data["msgID"]) | |||||
| ctx.JSON(200, map[string]string{ | ctx.JSON(200, map[string]string{ | ||||
| "result_code": "-1", | "result_code": "-1", | ||||
| "error_msg": "CommitImage failed", | "error_msg": "CommitImage failed", | ||||
| @@ -428,15 +391,8 @@ func CloudBrainStop(ctx *context.Context) { | |||||
| var errorMsg = "" | var errorMsg = "" | ||||
| var status = "" | var status = "" | ||||
| task := ctx.Cloudbrain | |||||
| for { | for { | ||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | |||||
| resultCode = "-1" | |||||
| errorMsg = "system error" | |||||
| break | |||||
| } | |||||
| if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { | if task.Status == string(models.JobStopped) || task.Status == string(models.JobFailed) { | ||||
| log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) | log.Error("the job(%s) has been stopped", task.JobName, ctx.Data["msgID"]) | ||||
| resultCode = "-1" | resultCode = "-1" | ||||
| @@ -444,7 +400,7 @@ func CloudBrainStop(ctx *context.Context) { | |||||
| break | break | ||||
| } | } | ||||
| err = cloudbrain.StopJob(jobID) | |||||
| err := cloudbrain.StopJob(jobID) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | log.Error("StopJob(%s) failed:%v", task.JobName, err, ctx.Data["msgID"]) | ||||
| resultCode = "-1" | resultCode = "-1" | ||||
| @@ -554,12 +510,7 @@ func logErrorAndUpdateJobStatus(err error, taskInfo *models.Cloudbrain) { | |||||
| } | } | ||||
| func CloudBrainDel(ctx *context.Context) { | func CloudBrainDel(ctx *context.Context) { | ||||
| var jobID = ctx.Params(":jobid") | |||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||||
| return | |||||
| } | |||||
| task := ctx.Cloudbrain | |||||
| if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) { | if task.Status != string(models.JobStopped) && task.Status != string(models.JobFailed) { | ||||
| log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) | log.Error("the job(%s) has not been stopped", task.JobName, ctx.Data["msgID"]) | ||||
| @@ -567,7 +518,7 @@ func CloudBrainDel(ctx *context.Context) { | |||||
| return | return | ||||
| } | } | ||||
| err = models.DeleteJob(task) | |||||
| err := models.DeleteJob(task) | |||||
| if err != nil { | if err != nil { | ||||
| ctx.ServerError("DeleteJob failed", err) | ctx.ServerError("DeleteJob failed", err) | ||||
| return | return | ||||
| @@ -192,11 +192,6 @@ func NotebookShow(ctx *context.Context) { | |||||
| func NotebookDebug(ctx *context.Context) { | func NotebookDebug(ctx *context.Context) { | ||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| _, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||||
| return | |||||
| } | |||||
| result, err := modelarts.GetJob(jobID) | result, err := modelarts.GetJob(jobID) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -325,11 +320,7 @@ func NotebookManage(ctx *context.Context) { | |||||
| func NotebookDel(ctx *context.Context) { | func NotebookDel(ctx *context.Context) { | ||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| ctx.ServerError("GetCloudbrainByJobID failed", err) | |||||
| return | |||||
| } | |||||
| task := ctx.Cloudbrain | |||||
| if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) { | if task.Status != string(models.ModelArtsCreateFailed) && task.Status != string(models.ModelArtsStartFailed) && task.Status != string(models.ModelArtsStopped) { | ||||
| log.Error("the job(%s) has not been stopped", task.JobName) | log.Error("the job(%s) has not been stopped", task.JobName) | ||||
| @@ -337,7 +328,7 @@ func NotebookDel(ctx *context.Context) { | |||||
| return | return | ||||
| } | } | ||||
| _, err = modelarts.DelNotebook(jobID) | |||||
| _, err := modelarts.DelNotebook(jobID) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("DelJob(%s) failed:%v", task.JobName, err.Error()) | log.Error("DelJob(%s) failed:%v", task.JobName, err.Error()) | ||||
| ctx.ServerError("DelJob failed", err) | ctx.ServerError("DelJob failed", err) | ||||
| @@ -1421,14 +1412,9 @@ func TrainJobDel(ctx *context.Context) { | |||||
| func TrainJobStop(ctx *context.Context) { | func TrainJobStop(ctx *context.Context) { | ||||
| var jobID = ctx.Params(":jobid") | var jobID = ctx.Params(":jobid") | ||||
| task, err := models.GetCloudbrainByJobID(jobID) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) | |||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | |||||
| return | |||||
| } | |||||
| task := ctx.Cloudbrain | |||||
| _, err = modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
| _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error()) | log.Error("StopTrainJob(%s) failed:%v", task.JobName, err.Error()) | ||||
| ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobIndex, nil) | ||||