| @@ -2172,7 +2172,7 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) { | |||
| Find(&cloudbrains) | |||
| } | |||
| func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| func GetGPUStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| cloudbrains := make([]*Cloudbrain, 0, 10) | |||
| endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | |||
| missEndTimeBefore := endTimeBefore - 24*3600 | |||
| @@ -2181,7 +2181,7 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra | |||
| JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, | |||
| ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, | |||
| ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). | |||
| Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore). | |||
| Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore). | |||
| Limit(limit). | |||
| Find(&cloudbrains) | |||
| } | |||
| @@ -2189,14 +2189,14 @@ func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbra | |||
| /** | |||
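| This method accounts for tasks that were debugged again: when a task has been debugged multiple times, the end time of the most recent run is used. | |||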
| */ | |||
| func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| func GetGPUStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| cloudbrains := make([]*Cloudbrain, 0, 10) | |||
| endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | |||
| missEndTimeBefore := endTimeBefore - 24*3600 | |||
| sql := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) | |||
| id, job_name, job_id,status,end_time,updated_unix,cleared | |||
| FROM cloudbrain | |||
| where type=0 and job_type='DEBUG' | |||
| where (type=0 or (type =2 and compute_resource='CPU/GPU')) and job_type='DEBUG' | |||
| ORDER BY job_name, updated_unix DESC) a | |||
| where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false` | |||
| @@ -1063,6 +1063,8 @@ notebook_file_not_exist=Notebook file does not exist. | |||
| notebook_select_wrong=Please select a Notebook(.ipynb) file first. | |||
| notebook_file_no_right=You have no right to access the Notebook(.ipynb) file. | |||
| notebook_repo_conflict=The files in different branches of the same repository can not run together. | |||
| debug_again_fail=Failed to restart the debug task, please try again later. | |||
| debug_again_fail_forever=The task was not scheduled successfully last time and cannot be restarted. | |||
| date=Date | |||
| repo_add=Project Increment | |||
| @@ -1062,6 +1062,8 @@ notebook_file_not_exist=Notebook文件不存在。 | |||
| notebook_select_wrong=请先选择Notebook(.ipynb)文件。 | |||
| notebook_file_no_right=您没有这个Notebook文件的读权限。 | |||
| notebook_repo_conflict=同一个仓库的不同分支文件不能同时运行。 | |||
| debug_again_fail=再次调试失败,请稍后再试。 | |||
| debug_again_fail_forever=这个任务之前没有调度成功,不能再次调试。 | |||
| date=日期 | |||
| repo_add=新增项目 | |||
| @@ -1683,6 +1683,10 @@ func GrampusNotebookRestart(ctx *context.Context) { | |||
| if res.GrampusResult.ErrorCode != 0 || res.NewId == "" { | |||
| log.Error("ManageNotebook2 failed:" + res.GrampusResult.ErrorMsg) | |||
| errorMsg = ctx.Tr("repo.debug_again_fail") | |||
| if res.GrampusResult.ErrorCode == 5005 { | |||
| errorMsg = ctx.Tr("repo.debug_again_fail_forever") | |||
| } | |||
| break | |||
| } | |||
| @@ -14,21 +14,21 @@ import ( | |||
| func ClearCloudbrainResultSpace() { | |||
| log.Info("clear cloudbrain one result space begin.") | |||
| if !setting.ClearStrategy.Enabled{ | |||
| if !setting.ClearStrategy.Enabled { | |||
| return | |||
| } | |||
| tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||
| tasks, err := models.GetGPUStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||
| if err != nil { | |||
| log.Warn("Failed to get cloudbrain, clear result failed.", err) | |||
| return | |||
| } | |||
| debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize) | |||
| debugTasks, err := models.GetGPUStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize) | |||
| if err != nil { | |||
| log.Warn("Failed to get debug cloudbrain.", err) | |||
| } | |||
| tasks=append(tasks,debugTasks...) | |||
| tasks = append(tasks, debugTasks...) | |||
| if err != nil { | |||
| log.Warn("Failed to get cloudbrain, clear result failed.", err) | |||
| @@ -38,7 +38,7 @@ func ClearCloudbrainResultSpace() { | |||
| for _, task := range tasks { | |||
| err := DeleteCloudbrainOneJobStorage(task.JobName) | |||
| if err == nil { | |||
| log.Info("clear job in cloudbrain table:"+task.JobName) | |||
| log.Info("clear job in cloudbrain table:" + task.JobName) | |||
| ids = append(ids, task.ID) | |||
| } | |||
| } | |||
| @@ -69,10 +69,10 @@ func clearMinioHistoryTrashFile() { | |||
| SortModTimeAscend(miniofiles) | |||
| for _, file := range miniofiles { | |||
| if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err==nil && !has { | |||
| has, err := models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err == nil && !has { | |||
| dirPath := setting.CBCodePathPrefix + file.Name() + "/" | |||
| log.Info("clear job in minio trash:" + file.Name()) | |||
| storage.Attachments.DeleteDir(dirPath) | |||
| @@ -90,7 +90,7 @@ func clearMinioHistoryTrashFile() { | |||
| } | |||
| } | |||
| func clearLocalHistoryTrashFile() { | |||
| func clearLocalHistoryTrashFile() { | |||
| files, err := ioutil.ReadDir(setting.JobPath) | |||
| processCount := 0 | |||
| if err != nil { | |||
| @@ -99,11 +99,11 @@ func clearLocalHistoryTrashFile() { | |||
| SortModTimeAscend(files) | |||
| for _, file := range files { | |||
| // Clean up historical trash data older than n days by removing the job directory. | |||
| if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err==nil && !has{ | |||
| if file.Name() != "" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| has, err := models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err == nil && !has { | |||
| os.RemoveAll(setting.JobPath + file.Name()) | |||
| log.Info("clear job in local trash:"+file.Name()) | |||
| log.Info("clear job in local trash:" + file.Name()) | |||
| processCount++ | |||
| } | |||
| if processCount == setting.ClearStrategy.BatchSize { | |||
| @@ -127,7 +127,7 @@ func SortModTimeAscend(files []os.FileInfo) { | |||
| func DeleteCloudbrainOneJobStorage(jobName string) error { | |||
| if jobName==""{ | |||
| if jobName == "" { | |||
| return nil | |||
| } | |||
| //delete local | |||
| @@ -58,8 +58,8 @@ export default async function initCloudrain() { | |||
| const duration = data.JobDuration; | |||
| const aiCenter = data.AiCenter || '--' | |||
| $("#duration-" + ID).text(duration); | |||
| $("#cluster-" + ID).text(aiCenter); | |||
| $("#" + versionname + "-ai_center").text(data.AiCenter); | |||
| data.AiCenter != undefined && $("#cluster-" + ID).text(aiCenter); | |||
| data.AiCenter != undefined && $("#" + versionname + "-ai_center").text(data.AiCenter); | |||
| if (status != status_text) { | |||
| $("#" + ID + "-icon") | |||
| .removeClass() | |||
| @@ -224,7 +224,7 @@ export default async function initCloudrain() { | |||
| data.StartTime !== undefined && data.StartTime > 0 && $("#" + versionname + "-startTime").text(timeFormat(new Date(data.StartTime * 1000))); | |||
| $("#" + versionname + "-duration").text(data.JobDuration); | |||
| $("#" + versionname + "-status").text(data.JobStatus); | |||
| $("#" + versionname + "-ai_center").text(data.AiCenter); | |||
| data.AiCenter != undefined && $("#" + versionname + "-ai_center").text(data.AiCenter); | |||
| if (stopArray.includes(data.JobStatus)) { | |||
| $("#" + versionname + "-stop").addClass("disabled"); | |||