Reviewed-on: https://git.openi.org.cn/OpenI/aiforge/pulls/3165 Reviewed-by: lewis <747342561@qq.com>tags/v1.22.11.2^2
| @@ -16,6 +16,8 @@ import ( | |||||
| "strings" | "strings" | ||||
| "time" | "time" | ||||
| "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask" | |||||
| "code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
| "code.gitea.io/gitea/modules/setting" | "code.gitea.io/gitea/modules/setting" | ||||
| @@ -81,47 +83,22 @@ func GetCloudbrainTask(ctx *context.APIContext) { | |||||
| "JobDuration": job.TrainJobDuration, | "JobDuration": job.TrainJobDuration, | ||||
| }) | }) | ||||
| } else { | } else { | ||||
| jobResult, err := cloudbrain.GetJob(job.JobID) | |||||
| if err != nil { | |||||
| ctx.NotFound(err) | |||||
| log.Error("GetJob failed:", err) | |||||
| return | |||||
| } | |||||
| result, _ := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
| jobAfter, err := cloudbrainTask.SyncCloudBrainOneStatus(job) | |||||
| if err != nil { | if err != nil { | ||||
| ctx.NotFound(err) | ctx.NotFound(err) | ||||
| log.Error("ConvertToJobResultPayload failed:", err) | |||||
| log.Error("Sync cloud brain one status failed:", err) | |||||
| return | return | ||||
| } | } | ||||
| oldStatus := job.Status | |||||
| job.Status = result.JobStatus.State | |||||
| taskRoles := result.TaskRoles | |||||
| taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
| if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
| job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
| job.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
| job.Status = taskRes.TaskStatuses[0].State | |||||
| } | |||||
| if result.JobStatus.State != string(models.JobWaiting) { | |||||
| models.ParseAndSetDurationFromCloudBrainOne(result, job) | |||||
| if oldStatus != job.Status { | |||||
| notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||||
| } | |||||
| err = models.UpdateJob(job) | |||||
| if err != nil { | |||||
| log.Error("UpdateJob failed:", err) | |||||
| } | |||||
| } | |||||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | ctx.JSON(http.StatusOK, map[string]interface{}{ | ||||
| "ID": ID, | "ID": ID, | ||||
| "JobName": result.Config.JobName, | |||||
| "JobStatus": result.JobStatus.State, | |||||
| "SubState": result.JobStatus.SubState, | |||||
| "CreatedTime": time.Unix(result.JobStatus.CreatedTime/1000, 0).Format("2006-01-02 15:04:05"), | |||||
| "CompletedTime": time.Unix(result.JobStatus.CompletedTime/1000, 0).Format("2006-01-02 15:04:05"), | |||||
| "JobDuration": job.TrainJobDuration, | |||||
| "JobName": jobAfter.JobName, | |||||
| "JobStatus": jobAfter.Status, | |||||
| "SubState": "", | |||||
| "CreatedTime": jobAfter.CreatedUnix.Format("2006-01-02 15:04:05"), | |||||
| "CompletedTime": jobAfter.UpdatedUnix.Format("2006-01-02 15:04:05"), | |||||
| "JobDuration": jobAfter.TrainJobDuration, | |||||
| }) | }) | ||||
| } | } | ||||
| } | } | ||||
| @@ -12,6 +12,8 @@ import ( | |||||
| "strconv" | "strconv" | ||||
| "strings" | "strings" | ||||
| "code.gitea.io/gitea/services/cloudbrain/cloudbrainTask" | |||||
| "code.gitea.io/gitea/modules/urfs_client/urchin" | "code.gitea.io/gitea/modules/urfs_client/urchin" | ||||
| "code.gitea.io/gitea/modules/notification" | "code.gitea.io/gitea/modules/notification" | ||||
| @@ -20,7 +22,6 @@ import ( | |||||
| "code.gitea.io/gitea/modules/setting" | "code.gitea.io/gitea/modules/setting" | ||||
| "code.gitea.io/gitea/models" | "code.gitea.io/gitea/models" | ||||
| "code.gitea.io/gitea/modules/cloudbrain" | |||||
| "code.gitea.io/gitea/modules/context" | "code.gitea.io/gitea/modules/context" | ||||
| "code.gitea.io/gitea/modules/log" | "code.gitea.io/gitea/modules/log" | ||||
| "code.gitea.io/gitea/modules/modelarts" | "code.gitea.io/gitea/modules/modelarts" | ||||
| @@ -109,39 +110,11 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { | |||||
| } | } | ||||
| if job.Type == models.TypeCloudBrainOne { | if job.Type == models.TypeCloudBrainOne { | ||||
| jobResult, err := cloudbrain.GetJob(job.JobID) | |||||
| if err != nil { | |||||
| ctx.NotFound(err) | |||||
| log.Error("GetJob failed:", err) | |||||
| return | |||||
| } | |||||
| result, err := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
| job, err = cloudbrainTask.SyncCloudBrainOneStatus(job) | |||||
| if err != nil { | if err != nil { | ||||
| ctx.NotFound(err) | ctx.NotFound(err) | ||||
| log.Error("ConvertToJobResultPayload failed:", err) | |||||
| return | return | ||||
| } | } | ||||
| oldStatus := job.Status | |||||
| job.Status = result.JobStatus.State | |||||
| if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
| taskRoles := result.TaskRoles | |||||
| taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
| job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
| job.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
| job.Status = taskRes.TaskStatuses[0].State | |||||
| } | |||||
| if result.JobStatus.State != string(models.JobWaiting) { | |||||
| models.ParseAndSetDurationFromCloudBrainOne(result, job) | |||||
| if oldStatus != job.Status { | |||||
| notification.NotifyChangeCloudbrainStatus(job, oldStatus) | |||||
| } | |||||
| err = models.UpdateJob(job) | |||||
| if err != nil { | |||||
| log.Error("UpdateJob failed:", err) | |||||
| } | |||||
| } | |||||
| } else if job.Type == models.TypeCloudBrainTwo { | } else if job.Type == models.TypeCloudBrainTwo { | ||||
| err := modelarts.HandleTrainJobInfo(job) | err := modelarts.HandleTrainJobInfo(job) | ||||
| if err != nil { | if err != nil { | ||||
| @@ -1845,59 +1845,37 @@ func SyncCloudbrainStatus() { | |||||
| continue | continue | ||||
| } | } | ||||
| if task.Type == models.TypeCloudBrainOne { | if task.Type == models.TypeCloudBrainOne { | ||||
| result, err := cloudbrain.GetJob(task.JobID) | |||||
| task, err = cloudbrainTask.SyncCloudBrainOneStatus(task) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("GetJob(%s) failed:%v", task.JobName, err) | |||||
| log.Error("Sync cloud brain one (%s) failed:%v", task.JobName, err) | |||||
| continue | continue | ||||
| } | } | ||||
| if result != nil { | |||||
| jobRes, _ := models.ConvertToJobResultPayload(result.Payload) | |||||
| taskRoles := jobRes.TaskRoles | |||||
| taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
| oldStatus := task.Status | |||||
| task.Status = taskRes.TaskStatuses[0].State | |||||
| if task.Status != string(models.JobWaiting) { | |||||
| models.ParseAndSetDurationFromCloudBrainOne(jobRes, task) | |||||
| if task.Status != string(models.JobWaiting) { | |||||
| if task.Duration >= setting.MaxDuration && task.JobType == string(models.JobTypeDebug) { | |||||
| log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) | |||||
| err = cloudbrain.StopJob(task.JobID) | |||||
| if err != nil { | |||||
| log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) | |||||
| continue | |||||
| } | |||||
| oldStatus := task.Status | |||||
| task.Status = string(models.JobStopped) | |||||
| if task.EndTime == 0 { | |||||
| task.EndTime = timeutil.TimeStampNow() | |||||
| } | |||||
| task.ComputeAndSetDuration() | |||||
| if oldStatus != task.Status { | if oldStatus != task.Status { | ||||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | notification.NotifyChangeCloudbrainStatus(task, oldStatus) | ||||
| } | } | ||||
| err = models.UpdateJob(task) | err = models.UpdateJob(task) | ||||
| if err != nil { | if err != nil { | ||||
| log.Error("UpdateJob(%s) failed:%v", task.JobName, err) | |||||
| } | |||||
| var maxDuration int64 | |||||
| if task.JobType == string(models.JobTypeBenchmark) { | |||||
| maxDuration = setting.BenchmarkMaxDuration | |||||
| } else if task.JobType == string(models.JobTypeSnn4imagenet) || task.JobType == string(models.JobTypeBrainScore) { | |||||
| maxDuration = setting.ModelBenchmarkMaxDuration | |||||
| } else { | |||||
| maxDuration = setting.MaxDuration | |||||
| } | |||||
| if task.Duration >= maxDuration && task.JobType != string(models.JobTypeTrain) { | |||||
| log.Info("begin to stop job(%s), because of the duration", task.DisplayJobName) | |||||
| err = cloudbrain.StopJob(task.JobID) | |||||
| if err != nil { | |||||
| log.Error("StopJob(%s) failed:%v", task.DisplayJobName, err) | |||||
| continue | |||||
| } | |||||
| task.Status = string(models.JobStopped) | |||||
| if task.EndTime == 0 { | |||||
| task.EndTime = timeutil.TimeStampNow() | |||||
| } | |||||
| task.ComputeAndSetDuration() | |||||
| if oldStatus != task.Status { | |||||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||||
| } | |||||
| err = models.UpdateJob(task) | |||||
| if err != nil { | |||||
| log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||||
| continue | |||||
| } | |||||
| log.Error("UpdateJob(%s) failed:%v", task.DisplayJobName, err) | |||||
| continue | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| } else if task.Type == models.TypeCloudBrainTwo { | } else if task.Type == models.TypeCloudBrainTwo { | ||||
| if task.JobType == string(models.JobTypeDebug) { | if task.JobType == string(models.JobTypeDebug) { | ||||
| @@ -0,0 +1,83 @@ | |||||
| package cloudbrainTask | |||||
import (
	"net/http"
	"sync"

	"code.gitea.io/gitea/models"
	"code.gitea.io/gitea/modules/cloudbrain"
	"code.gitea.io/gitea/modules/httplib"
	"code.gitea.io/gitea/modules/log"
	"code.gitea.io/gitea/modules/notification"
	"code.gitea.io/gitea/modules/setting"
)
| var noteBookOKMap = make(map[int64]int, 20) | |||||
| // successfulCount is how many HTTP 200 responses a debug task's notebook URL must return (tallied per task ID in noteBookOKMap) before the notebook is treated as ready to open. | |||||
| const successfulCount = 2 | |||||
| func SyncCloudBrainOneStatus(task *models.Cloudbrain) (*models.Cloudbrain, error) { | |||||
| jobResult, err := cloudbrain.GetJob(task.JobID) | |||||
| if err != nil { | |||||
| log.Error("GetJob failed:", err) | |||||
| return task, err | |||||
| } | |||||
| result, err := models.ConvertToJobResultPayload(jobResult.Payload) | |||||
| if err != nil { | |||||
| log.Error("ConvertToJobResultPayload failed:", err) | |||||
| return task, err | |||||
| } | |||||
| oldStatus := task.Status | |||||
| if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { | |||||
| taskRoles := result.TaskRoles | |||||
| taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) | |||||
| task.ContainerIp = taskRes.TaskStatuses[0].ContainerIP | |||||
| task.ContainerID = taskRes.TaskStatuses[0].ContainerID | |||||
| } | |||||
| if (result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobRunning)) || | |||||
| task.Status == string(models.JobRunning) || (result.JobStatus.State == string(models.JobRunning) && isNoteBookReady(task)) { | |||||
| models.ParseAndSetDurationFromCloudBrainOne(result, task) | |||||
| task.Status = result.JobStatus.State | |||||
| if oldStatus != task.Status { | |||||
| notification.NotifyChangeCloudbrainStatus(task, oldStatus) | |||||
| } | |||||
| err = models.UpdateJob(task) | |||||
| if err != nil { | |||||
| log.Error("UpdateJob failed:", err) | |||||
| return task, err | |||||
| } | |||||
| } | |||||
| return task, nil | |||||
| } | |||||
| func isNoteBookReady(task *models.Cloudbrain) bool { | |||||
| if task.JobType != string(models.JobTypeDebug) { | |||||
| return true | |||||
| } | |||||
| noteBookUrl := setting.DebugServerHost + "jpylab_" + task.JobID + "_" + task.SubTaskName | |||||
| r := httplib.Get(noteBookUrl) | |||||
| res, err := r.Response() | |||||
| if err != nil { | |||||
| return false | |||||
| } | |||||
| if res.StatusCode == http.StatusOK { | |||||
| count := noteBookOKMap[task.ID] | |||||
| if count < successfulCount-1 { | |||||
| noteBookOKMap[task.ID] = count + 1 | |||||
| return false | |||||
| } else { | |||||
| delete(noteBookOKMap, task.ID) | |||||
| return true | |||||
| } | |||||
| } | |||||
| return false | |||||
| } | |||||