| @@ -1974,6 +1974,12 @@ func GetCloudbrainByID(id string) (*Cloudbrain, error) { | |||
| return getRepoCloudBrain(cb) | |||
| } | |||
| func IsCloudbrainExistByJobName(jobName string)(bool,error){ | |||
| return x.Unscoped().Exist(&Cloudbrain{ | |||
| JobName: jobName, | |||
| }) | |||
| } | |||
| func GetCloudbrainByIDWithDeleted(id string) (*Cloudbrain, error) { | |||
| idInt64, _ := strconv.ParseInt(id, 10, 64) | |||
| cb := &Cloudbrain{ID: idInt64} | |||
| @@ -2119,19 +2125,37 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) { | |||
| Find(&cloudbrains) | |||
| } | |||
| func GetCloudBrainOneStoppedJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| cloudbrains := make([]*Cloudbrain, 0, 10) | |||
| endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | |||
| missEndTimeBefore := endTimeBefore - 24*3600 | |||
| return cloudbrains, x.Cols("id,job_name,job_id"). | |||
| return cloudbrains, x.Unscoped().Cols("id,job_name,job_id"). | |||
| In("status", | |||
| JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted, | |||
| ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed, | |||
| ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed). | |||
| Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0", missEndTimeBefore, endTimeBefore). | |||
| Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore). | |||
| Limit(limit). | |||
| Find(&cloudbrains) | |||
| } | |||
| /** | |||
| 本方法考虑了再次调试的情况,多次调试取最后一次的任务的结束时间 | |||
| */ | |||
| func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) { | |||
| cloudbrains := make([]*Cloudbrain, 0, 10) | |||
| endTimeBefore := time.Now().Unix() - int64(days)*24*3600 | |||
| missEndTimeBefore := endTimeBefore - 24*3600 | |||
| sql:=`SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name) | |||
| id, job_name, job_id,status,end_time,updated_unix,cleared | |||
| FROM cloudbrain | |||
| where type=0 and job_type='DEBUG' | |||
| ORDER BY job_name, updated_unix DESC) a | |||
| where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false` | |||
| return cloudbrains, x.Unscoped().SQL(sql,missEndTimeBefore, endTimeBefore).Limit(limit).Find(&cloudbrains) | |||
| } | |||
| func UpdateCloudBrainRecordsCleared(ids []int64) error { | |||
| pageSize := 150 | |||
| @@ -183,6 +183,17 @@ func GetWaittingTop() ([]*CloudbrainInfo, error) { | |||
| Find(&cloudbrains); err != nil { | |||
| log.Info("find error.") | |||
| } | |||
| var ids []int64 | |||
| for _, task := range cloudbrains { | |||
| ids = append(ids, task.RepoID) | |||
| } | |||
| repositoryMap, err := GetRepositoriesMapByIDs(ids) | |||
| if err == nil { | |||
| for _, task := range cloudbrains { | |||
| task.Repo = repositoryMap[task.RepoID] | |||
| } | |||
| } | |||
| return cloudbrains, nil | |||
| } | |||
| @@ -199,6 +210,16 @@ func GetRunningTop() ([]*CloudbrainInfo, error) { | |||
| Find(&cloudbrains); err != nil { | |||
| log.Info("find error.") | |||
| } | |||
| var ids []int64 | |||
| for _, task := range cloudbrains { | |||
| ids = append(ids, task.RepoID) | |||
| } | |||
| repositoryMap, err := GetRepositoriesMapByIDs(ids) | |||
| if err == nil { | |||
| for _, task := range cloudbrains { | |||
| task.Repo = repositoryMap[task.RepoID] | |||
| } | |||
| } | |||
| return cloudbrains, nil | |||
| } | |||
| @@ -3,6 +3,7 @@ package models | |||
| import ( | |||
| "code.gitea.io/gitea/modules/timeutil" | |||
| "fmt" | |||
| "strings" | |||
| "xorm.io/builder" | |||
| ) | |||
| @@ -197,12 +198,104 @@ type Specification struct { | |||
| AiCenterName string | |||
| IsExclusive bool | |||
| ExclusiveOrg string | |||
| //specs that have the same sourceSpecId, computeResource and cluster as current spec | |||
| RelatedSpecs []*Specification | |||
| } | |||
| func (Specification) TableName() string { | |||
| return "resource_specification" | |||
| } | |||
| func (s *Specification) loadRelatedSpecs() { | |||
| if s.RelatedSpecs != nil { | |||
| return | |||
| } | |||
| defaultSpecs := make([]*Specification, 0) | |||
| if s.SourceSpecId == "" { | |||
| s.RelatedSpecs = defaultSpecs | |||
| return | |||
| } | |||
| r, err := FindSpecs(FindSpecsOptions{ | |||
| ComputeResource: s.ComputeResource, | |||
| Cluster: s.Cluster, | |||
| SourceSpecId: s.SourceSpecId, | |||
| RequestAll: true, | |||
| SpecStatus: SpecOnShelf, | |||
| }) | |||
| if err != nil { | |||
| s.RelatedSpecs = defaultSpecs | |||
| return | |||
| } | |||
| s.RelatedSpecs = r | |||
| } | |||
| func (s *Specification) GetAvailableCenterIds(userIds ...int64) []string { | |||
| s.loadRelatedSpecs() | |||
| if len(s.RelatedSpecs) == 0 { | |||
| return make([]string, 0) | |||
| } | |||
| var uId int64 | |||
| if len(userIds) > 0 { | |||
| uId = userIds[0] | |||
| } | |||
| //filter exclusive specs | |||
| specs := FilterExclusiveSpecs(s.RelatedSpecs, uId) | |||
| centerIds := make([]string, len(specs)) | |||
| for i, v := range specs { | |||
| centerIds[i] = v.AiCenterCode | |||
| } | |||
| return centerIds | |||
| } | |||
| func FilterExclusiveSpecs(r []*Specification, userId int64) []*Specification { | |||
| if userId == 0 { | |||
| return r | |||
| } | |||
| specs := make([]*Specification, 0, len(r)) | |||
| specMap := make(map[int64]string, 0) | |||
| for i := 0; i < len(r); i++ { | |||
| spec := r[i] | |||
| if _, has := specMap[spec.ID]; has { | |||
| continue | |||
| } | |||
| if !spec.IsExclusive { | |||
| specs = append(specs, spec) | |||
| specMap[spec.ID] = "" | |||
| continue | |||
| } | |||
| orgs := strings.Split(spec.ExclusiveOrg, ";") | |||
| for _, org := range orgs { | |||
| isMember, _ := IsOrganizationMemberByOrgName(org, userId) | |||
| if isMember { | |||
| specs = append(specs, spec) | |||
| specMap[spec.ID] = "" | |||
| break | |||
| } | |||
| } | |||
| } | |||
| return specs | |||
| } | |||
| func DistinctSpecs(r []*Specification) []*Specification { | |||
| specs := make([]*Specification, 0, len(r)) | |||
| sourceSpecIdMap := make(map[string]string, 0) | |||
| for i := 0; i < len(r); i++ { | |||
| spec := r[i] | |||
| if spec.SourceSpecId == "" { | |||
| specs = append(specs, spec) | |||
| continue | |||
| } | |||
| if _, has := sourceSpecIdMap[spec.SourceSpecId]; has { | |||
| continue | |||
| } | |||
| specs = append(specs, spec) | |||
| sourceSpecIdMap[spec.SourceSpecId] = "" | |||
| } | |||
| return specs | |||
| } | |||
| func InsertResourceSpecification(r ResourceSpecification) (int64, error) { | |||
| return x.Insert(&r) | |||
| } | |||
| @@ -282,8 +282,6 @@ func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (job | |||
| func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) { | |||
| createTime := timeutil.TimeStampNow() | |||
| centerID, centerName := getCentersParamter(ctx, req) | |||
| var datasetGrampus, modelGrampus []models.GrampusDataset | |||
| var codeGrampus models.GrampusDataset | |||
| if ProcessorTypeNPU == req.ProcessType { | |||
| @@ -315,8 +313,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str | |||
| ResourceSpecId: req.Spec.SourceSpecId, | |||
| ImageId: req.ImageId, | |||
| ImageUrl: req.ImageUrl, | |||
| CenterID: centerID, | |||
| CenterName: centerName, | |||
| CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID), | |||
| ReplicaNum: 1, | |||
| Datasets: datasetGrampus, | |||
| Models: modelGrampus, | |||
| @@ -618,6 +618,7 @@ var ( | |||
| Enabled bool | |||
| ResultSaveDays int | |||
| BatchSize int | |||
| DebugJobSize int | |||
| TrashSaveDays int | |||
| Cron string | |||
| RunAtStart bool | |||
| @@ -1696,6 +1697,7 @@ func getClearStrategy(){ | |||
| ClearStrategy.Enabled=sec.Key("ENABLED").MustBool(false) | |||
| ClearStrategy.ResultSaveDays=sec.Key("RESULT_SAVE_DAYS").MustInt(30) | |||
| ClearStrategy.BatchSize=sec.Key("BATCH_SIZE").MustInt(500) | |||
| ClearStrategy.DebugJobSize=sec.Key("DEBUG_BATCH_SIZE").MustInt(100) | |||
| ClearStrategy.TrashSaveDays=sec.Key("TRASH_SAVE_DAYS").MustInt(90) | |||
| ClearStrategy.Cron=sec.Key("CRON").MustString("* 0,30 2-8 * * ?") | |||
| ClearStrategy.RunAtStart=sec.Key("RUN_AT_START").MustBool(false) | |||
| @@ -968,6 +968,8 @@ func GetWaittingTop(ctx *context.Context) { | |||
| taskDetail.RepoID = ciTasks[i].RepoID | |||
| if ciTasks[i].Repo != nil { | |||
| taskDetail.RepoName = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Name | |||
| } else { | |||
| taskDetail.RepoName = "" | |||
| } | |||
| WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | |||
| taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) | |||
| @@ -975,6 +977,13 @@ func GetWaittingTop(ctx *context.Context) { | |||
| if WaitTimeInt < 0 { | |||
| taskDetail.WaitTime = "00:00:00" | |||
| } | |||
| taskDetail.ID = ciTasks[i].Cloudbrain.ID | |||
| taskDetail.ComputeResource = ciTasks[i].Cloudbrain.ComputeResource | |||
| taskDetail.JobType = ciTasks[i].Cloudbrain.JobType | |||
| taskDetail.JobID = ciTasks[i].Cloudbrain.JobID | |||
| taskDetail.Type = ciTasks[i].Cloudbrain.Type | |||
| tasks = append(tasks, taskDetail) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| @@ -1001,6 +1010,12 @@ func GetRunningTop(ctx *context.Context) { | |||
| taskDetail.RepoName = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Name | |||
| } | |||
| taskDetail.ID = ciTasks[i].Cloudbrain.ID | |||
| taskDetail.ComputeResource = ciTasks[i].Cloudbrain.ComputeResource | |||
| taskDetail.JobType = ciTasks[i].Cloudbrain.JobType | |||
| taskDetail.JobID = ciTasks[i].Cloudbrain.JobID | |||
| taskDetail.Type = ciTasks[i].Cloudbrain.Type | |||
| tasks = append(tasks, taskDetail) | |||
| } | |||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | |||
| @@ -13,11 +13,22 @@ import ( | |||
| ) | |||
| func ClearCloudbrainResultSpace() { | |||
| log.Info("clear cloudbrain one result space begin.") | |||
| if !setting.ClearStrategy.Enabled{ | |||
| return | |||
| } | |||
| tasks, err := models.GetCloudBrainOneStoppedJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||
| tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize) | |||
| if err != nil { | |||
| log.Warn("Failed to get cloudbrain, clear result failed.", err) | |||
| return | |||
| } | |||
| debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize) | |||
| if err != nil { | |||
| log.Warn("Failed to get debug cloudbrain.", err) | |||
| } | |||
| tasks=append(tasks,debugTasks...) | |||
| if err != nil { | |||
| log.Warn("Failed to get cloudbrain, clear result failed.", err) | |||
| @@ -37,11 +48,12 @@ func ClearCloudbrainResultSpace() { | |||
| log.Warn("Failed to set cloudbrain cleared status", err) | |||
| } | |||
| // Once the cloudbrain table has been fully processed, sweep any remaining historical trash data by walking the minio objects. | |||
| if len(tasks) < setting.ClearStrategy.BatchSize { | |||
| if len(tasks) < setting.ClearStrategy.BatchSize+setting.ClearStrategy.DebugJobSize { | |||
| clearLocalHistoryTrashFile() | |||
| clearMinioHistoryTrashFile() | |||
| } | |||
| log.Info("clear cloudbrain one result space end.") | |||
| } | |||
| @@ -57,11 +69,15 @@ func clearMinioHistoryTrashFile() { | |||
| SortModTimeAscend(miniofiles) | |||
| for _, file := range miniofiles { | |||
| if file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| dirPath := setting.CBCodePathPrefix + file.Name() + "/" | |||
| log.Info("clear job in minio trash:"+file.Name()) | |||
| storage.Attachments.DeleteDir(dirPath) | |||
| processCount++ | |||
| if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err==nil && !has { | |||
| dirPath := setting.CBCodePathPrefix + file.Name() + "/" | |||
| log.Info("clear job in minio trash:" + file.Name()) | |||
| storage.Attachments.DeleteDir(dirPath) | |||
| processCount++ | |||
| } | |||
| if processCount == setting.ClearStrategy.BatchSize { | |||
| break | |||
| } | |||
| @@ -83,10 +99,13 @@ func clearLocalHistoryTrashFile() { | |||
| SortModTimeAscend(files) | |||
| for _, file := range files { | |||
| // Clean up historical trash data older than n days by removing the job directory. | |||
| if file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| os.RemoveAll(setting.JobPath + file.Name()) | |||
| log.Info("clear job in local trash:"+file.Name()) | |||
| processCount++ | |||
| if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) { | |||
| has,err:=models.IsCloudbrainExistByJobName(file.Name()) | |||
| if err==nil && !has{ | |||
| os.RemoveAll(setting.JobPath + file.Name()) | |||
| log.Info("clear job in local trash:"+file.Name()) | |||
| processCount++ | |||
| } | |||
| if processCount == setting.ClearStrategy.BatchSize { | |||
| break | |||
| } | |||
| @@ -105,15 +124,12 @@ func SortModTimeAscend(files []os.FileInfo) { | |||
| return files[i].ModTime().Before(files[j].ModTime()) | |||
| }) | |||
| } | |||
| func SortModTimeAscendForMinio(files []storage.FileInfo) { | |||
| sort.Slice(files, func(i, j int) bool { | |||
| timeI, _ := time.Parse("2006-01-02 15:04:05", files[i].ModTime) | |||
| timeJ, _ := time.Parse("2006-01-02 15:04:05", files[i].ModTime) | |||
| return timeI.Before(timeJ) | |||
| }) | |||
| } | |||
| func DeleteCloudbrainOneJobStorage(jobName string) error { | |||
| if jobName==""{ | |||
| return nil | |||
| } | |||
| //delete local | |||
| localJobPath := setting.JobPath + jobName | |||
| err := os.RemoveAll(localJobPath) | |||
| @@ -246,10 +246,10 @@ func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.S | |||
| return nil, err | |||
| } | |||
| //filter exclusive specs | |||
| specs := filterExclusiveSpecs(r, userId) | |||
| specs := models.FilterExclusiveSpecs(r, userId) | |||
| //distinct by sourceSpecId | |||
| specs = distinctSpecs(specs) | |||
| specs = models.DistinctSpecs(specs) | |||
| return specs, err | |||
| } | |||
| @@ -265,50 +265,6 @@ func FindAvailableSpecs4Show(userId int64, opts models.FindSpecsOptions) ([]*api | |||
| return result, nil | |||
| } | |||
| func filterExclusiveSpecs(r []*models.Specification, userId int64) []*models.Specification { | |||
| specs := make([]*models.Specification, 0, len(r)) | |||
| specMap := make(map[int64]string, 0) | |||
| for i := 0; i < len(r); i++ { | |||
| spec := r[i] | |||
| if _, has := specMap[spec.ID]; has { | |||
| continue | |||
| } | |||
| if !spec.IsExclusive { | |||
| specs = append(specs, spec) | |||
| specMap[spec.ID] = "" | |||
| continue | |||
| } | |||
| orgs := strings.Split(spec.ExclusiveOrg, ";") | |||
| for _, org := range orgs { | |||
| isMember, _ := models.IsOrganizationMemberByOrgName(org, userId) | |||
| if isMember { | |||
| specs = append(specs, spec) | |||
| specMap[spec.ID] = "" | |||
| break | |||
| } | |||
| } | |||
| } | |||
| return specs | |||
| } | |||
| func distinctSpecs(r []*models.Specification) []*models.Specification { | |||
| specs := make([]*models.Specification, 0, len(r)) | |||
| sourceSpecIdMap := make(map[string]string, 0) | |||
| for i := 0; i < len(r); i++ { | |||
| spec := r[i] | |||
| if spec.SourceSpecId == "" { | |||
| specs = append(specs, spec) | |||
| continue | |||
| } | |||
| if _, has := sourceSpecIdMap[spec.SourceSpecId]; has { | |||
| continue | |||
| } | |||
| specs = append(specs, spec) | |||
| sourceSpecIdMap[spec.SourceSpecId] = "" | |||
| } | |||
| return specs | |||
| } | |||
| func GetAndCheckSpec(userId int64, specId int64, opts models.FindSpecsOptions) (*models.Specification, error) { | |||
| if specId == 0 { | |||
| return nil, nil | |||