Browse Source

Merge remote-tracking branch 'origin/V20221214' into gcu

tags/v1.22.12.2^2
chenyifan01 2 years ago
parent
commit
31cb5840d3
8 changed files with 195 additions and 71 deletions
  1. +27
    -3
      models/cloudbrain.go
  2. +21
    -0
      models/cloudbrain_static.go
  3. +93
    -0
      models/resource_specification.go
  4. +1
    -4
      modules/grampus/grampus.go
  5. +2
    -0
      modules/setting/setting.go
  6. +15
    -0
      routers/api/v1/repo/cloudbrain_dashboard.go
  7. +34
    -18
      services/cloudbrain/clear.go
  8. +2
    -46
      services/cloudbrain/resource/resource_specification.go

+ 27
- 3
models/cloudbrain.go View File

@@ -1974,6 +1974,12 @@ func GetCloudbrainByID(id string) (*Cloudbrain, error) {
return getRepoCloudBrain(cb)
}

func IsCloudbrainExistByJobName(jobName string)(bool,error){
return x.Unscoped().Exist(&Cloudbrain{
JobName: jobName,
})
}

func GetCloudbrainByIDWithDeleted(id string) (*Cloudbrain, error) {
idInt64, _ := strconv.ParseInt(id, 10, 64)
cb := &Cloudbrain{ID: idInt64}
@@ -2119,19 +2125,37 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) {
Find(&cloudbrains)
}

func GetCloudBrainOneStoppedJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
func GetCloudBrainOneStoppedNotDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
cloudbrains := make([]*Cloudbrain, 0, 10)
endTimeBefore := time.Now().Unix() - int64(days)*24*3600
missEndTimeBefore := endTimeBefore - 24*3600
return cloudbrains, x.Cols("id,job_name,job_id").
return cloudbrains, x.Unscoped().Cols("id,job_name,job_id").
In("status",
JobStopped, JobSucceeded, JobFailed, ModelArtsCreateFailed, ModelArtsStartFailed, ModelArtsUnavailable, ModelArtsResizFailed, ModelArtsDeleted,
ModelArtsStopped, ModelArtsTrainJobCanceled, ModelArtsTrainJobCheckFailed, ModelArtsTrainJobCompleted, ModelArtsTrainJobDeleteFailed, ModelArtsTrainJobDeployServiceFailed,
ModelArtsTrainJobFailed, ModelArtsTrainJobImageFailed, ModelArtsTrainJobKilled, ModelArtsTrainJobLost, ModelArtsTrainJobSubmitFailed, ModelArtsTrainJobSubmitModelFailed).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0", missEndTimeBefore, endTimeBefore).
Where("(((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false and type=0 and job_type != 'DEBUG'", missEndTimeBefore, endTimeBefore).
Limit(limit).
Find(&cloudbrains)
}
// GetCloudBrainOneStoppedDebugJobDaysAgo returns DEBUG tasks of type 0 whose
// results are old enough to be cleared.
//
// A debug task can be started again under the same job name, so only the most
// recently updated record per job_name is considered (PostgreSQL DISTINCT ON
// ordered by updated_unix DESC); its end time decides whether the job is due.
// A job qualifies when that latest record is STOPPED, SUCCEEDED or FAILED, has
// not been cleared yet, and either ended more than `days` days ago or, when no
// end time was recorded, was last updated more than `days`+1 days ago.
//
// At most `limit` rows are returned; a non-positive limit means "no limit".
func GetCloudBrainOneStoppedDebugJobDaysAgo(days int, limit int) ([]*Cloudbrain, error) {
	cloudbrains := make([]*Cloudbrain, 0, 10)
	endTimeBefore := time.Now().Unix() - int64(days)*24*3600
	// Records without an end time get one extra day of grace, judged by updated_unix.
	missEndTimeBefore := endTimeBefore - 24*3600
	query := `SELECT id,job_name,job_id from (SELECT DISTINCT ON (job_name)
id, job_name, job_id,status,end_time,updated_unix,cleared
FROM cloudbrain
where type=0 and job_type='DEBUG'
ORDER BY job_name, updated_unix DESC) a
where status in ('STOPPED','SUCCEEDED','FAILED') and (((end_time is null or end_time=0) and updated_unix<? and updated_unix != 0 ) or (end_time<? and end_time != 0)) and cleared=false`
	args := []interface{}{missEndTimeBefore, endTimeBefore}

	// xorm executes a raw SQL statement as-is, so a chained Limit() is not
	// applied to it; the limit has to be part of the SQL text itself.
	if limit > 0 {
		query += " LIMIT ?"
		args = append(args, limit)
	}

	return cloudbrains, x.Unscoped().SQL(query, args...).Find(&cloudbrains)
}


func UpdateCloudBrainRecordsCleared(ids []int64) error {
pageSize := 150


+ 21
- 0
models/cloudbrain_static.go View File

@@ -183,6 +183,17 @@ func GetWaittingTop() ([]*CloudbrainInfo, error) {
Find(&cloudbrains); err != nil {
log.Info("find error.")
}

var ids []int64
for _, task := range cloudbrains {
ids = append(ids, task.RepoID)
}
repositoryMap, err := GetRepositoriesMapByIDs(ids)
if err == nil {
for _, task := range cloudbrains {
task.Repo = repositoryMap[task.RepoID]
}
}
return cloudbrains, nil
}

@@ -199,6 +210,16 @@ func GetRunningTop() ([]*CloudbrainInfo, error) {
Find(&cloudbrains); err != nil {
log.Info("find error.")
}
var ids []int64
for _, task := range cloudbrains {
ids = append(ids, task.RepoID)
}
repositoryMap, err := GetRepositoriesMapByIDs(ids)
if err == nil {
for _, task := range cloudbrains {
task.Repo = repositoryMap[task.RepoID]
}
}
return cloudbrains, nil
}



+ 93
- 0
models/resource_specification.go View File

@@ -3,6 +3,7 @@ package models
import (
"code.gitea.io/gitea/modules/timeutil"
"fmt"
"strings"
"xorm.io/builder"
)

@@ -197,12 +198,104 @@ type Specification struct {
AiCenterName string
IsExclusive bool
ExclusiveOrg string
//specs that have the same sourceSpecId, computeResource and cluster as current spec
RelatedSpecs []*Specification
}

// TableName returns the name of the database table that Specification maps to.
func (Specification) TableName() string {
	const tableName = "resource_specification"
	return tableName
}

// loadRelatedSpecs populates s.RelatedSpecs on first use.
//
// It looks up the on-shelf specs that share this spec's compute resource,
// cluster and source spec id. When the spec has no source spec id, or the
// lookup fails, RelatedSpecs is set to an empty (non-nil) slice so the lookup
// is not repeated on later calls.
func (s *Specification) loadRelatedSpecs() {
	// A non-nil slice (even an empty one) means the lookup already happened.
	if s.RelatedSpecs != nil {
		return
	}

	related := make([]*Specification, 0)
	if s.SourceSpecId != "" {
		opts := FindSpecsOptions{
			ComputeResource: s.ComputeResource,
			Cluster:         s.Cluster,
			SourceSpecId:    s.SourceSpecId,
			RequestAll:      true,
			SpecStatus:      SpecOnShelf,
		}
		// A failed lookup is treated the same as "no related specs".
		if found, err := FindSpecs(opts); err == nil {
			related = found
		}
	}
	s.RelatedSpecs = related
}
// GetAvailableCenterIds returns the AiCenterCode of every related spec that
// passes FilterExclusiveSpecs for the given user.
//
// userIds is optional; only its first element is used. When it is omitted the
// user id passed to the filter is 0. The result is never nil.
func (s *Specification) GetAvailableCenterIds(userIds ...int64) []string {
	s.loadRelatedSpecs()
	if len(s.RelatedSpecs) == 0 {
		return []string{}
	}

	var userID int64
	if len(userIds) > 0 {
		userID = userIds[0]
	}

	// Drop exclusive specs that this user is not entitled to.
	visible := FilterExclusiveSpecs(s.RelatedSpecs, userID)

	centerIds := make([]string, 0, len(visible))
	for _, spec := range visible {
		centerIds = append(centerIds, spec.AiCenterCode)
	}
	return centerIds
}

// FilterExclusiveSpecs returns the specs from r that the given user may use.
//
// A userId of 0 disables filtering and r is returned unchanged. Otherwise a
// spec is kept when it is not exclusive, or when the user is a member of at
// least one of the organizations listed (separated by ";") in ExclusiveOrg.
// Specs whose ID was already kept are skipped, so each kept ID appears once.
func FilterExclusiveSpecs(r []*Specification, userId int64) []*Specification {
	if userId == 0 {
		return r
	}

	kept := make([]*Specification, 0, len(r))
	keptIDs := make(map[int64]struct{})
	for _, spec := range r {
		if _, dup := keptIDs[spec.ID]; dup {
			continue
		}

		allowed := !spec.IsExclusive
		if !allowed {
			for _, org := range strings.Split(spec.ExclusiveOrg, ";") {
				// Membership lookup errors are treated as "not a member".
				if isMember, _ := IsOrganizationMemberByOrgName(org, userId); isMember {
					allowed = true
					break
				}
			}
		}
		if !allowed {
			continue
		}

		kept = append(kept, spec)
		keptIDs[spec.ID] = struct{}{}
	}
	return kept
}

// DistinctSpecs removes specs that repeat an earlier spec's SourceSpecId.
//
// Input order is preserved and the first spec seen for each non-empty
// SourceSpecId wins. Specs with an empty SourceSpecId are always kept.
func DistinctSpecs(r []*Specification) []*Specification {
	result := make([]*Specification, 0, len(r))
	seenSourceIDs := make(map[string]struct{})
	for _, spec := range r {
		if id := spec.SourceSpecId; id != "" {
			if _, dup := seenSourceIDs[id]; dup {
				continue
			}
			seenSourceIDs[id] = struct{}{}
		}
		result = append(result, spec)
	}
	return result
}

// InsertResourceSpecification inserts r through the package-level engine x
// and returns whatever x.Insert reports (a count and an error).
func InsertResourceSpecification(r ResourceSpecification) (int64, error) {
	affected, err := x.Insert(&r)
	return affected, err
}


+ 1
- 4
modules/grampus/grampus.go View File

@@ -282,8 +282,6 @@ func GenerateNotebookJob(ctx *context.Context, req *GenerateNotebookJobReq) (job
func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId string, err error) {
createTime := timeutil.TimeStampNow()

centerID, centerName := getCentersParamter(ctx, req)

var datasetGrampus, modelGrampus []models.GrampusDataset
var codeGrampus models.GrampusDataset
if ProcessorTypeNPU == req.ProcessType {
@@ -315,8 +313,7 @@ func GenerateTrainJob(ctx *context.Context, req *GenerateTrainJobReq) (jobId str
ResourceSpecId: req.Spec.SourceSpecId,
ImageId: req.ImageId,
ImageUrl: req.ImageUrl,
CenterID: centerID,
CenterName: centerName,
CenterID: req.Spec.GetAvailableCenterIds(ctx.User.ID),
ReplicaNum: 1,
Datasets: datasetGrampus,
Models: modelGrampus,


+ 2
- 0
modules/setting/setting.go View File

@@ -618,6 +618,7 @@ var (
Enabled bool
ResultSaveDays int
BatchSize int
DebugJobSize int
TrashSaveDays int
Cron string
RunAtStart bool
@@ -1696,6 +1697,7 @@ func getClearStrategy(){
ClearStrategy.Enabled=sec.Key("ENABLED").MustBool(false)
ClearStrategy.ResultSaveDays=sec.Key("RESULT_SAVE_DAYS").MustInt(30)
ClearStrategy.BatchSize=sec.Key("BATCH_SIZE").MustInt(500)
ClearStrategy.DebugJobSize=sec.Key("DEBUG_BATCH_SIZE").MustInt(100)
ClearStrategy.TrashSaveDays=sec.Key("TRASH_SAVE_DAYS").MustInt(90)
ClearStrategy.Cron=sec.Key("CRON").MustString("* 0,30 2-8 * * ?")
ClearStrategy.RunAtStart=sec.Key("RUN_AT_START").MustBool(false)


+ 15
- 0
routers/api/v1/repo/cloudbrain_dashboard.go View File

@@ -968,6 +968,8 @@ func GetWaittingTop(ctx *context.Context) {
taskDetail.RepoID = ciTasks[i].RepoID
if ciTasks[i].Repo != nil {
taskDetail.RepoName = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Name
} else {
taskDetail.RepoName = ""
}
WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix()
taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt)
@@ -975,6 +977,13 @@ func GetWaittingTop(ctx *context.Context) {
if WaitTimeInt < 0 {
taskDetail.WaitTime = "00:00:00"
}

taskDetail.ID = ciTasks[i].Cloudbrain.ID
taskDetail.ComputeResource = ciTasks[i].Cloudbrain.ComputeResource
taskDetail.JobType = ciTasks[i].Cloudbrain.JobType
taskDetail.JobID = ciTasks[i].Cloudbrain.JobID
taskDetail.Type = ciTasks[i].Cloudbrain.Type

tasks = append(tasks, taskDetail)
}
ctx.JSON(http.StatusOK, map[string]interface{}{
@@ -1001,6 +1010,12 @@ func GetRunningTop(ctx *context.Context) {
taskDetail.RepoName = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Name
}

taskDetail.ID = ciTasks[i].Cloudbrain.ID
taskDetail.ComputeResource = ciTasks[i].Cloudbrain.ComputeResource
taskDetail.JobType = ciTasks[i].Cloudbrain.JobType
taskDetail.JobID = ciTasks[i].Cloudbrain.JobID
taskDetail.Type = ciTasks[i].Cloudbrain.Type

tasks = append(tasks, taskDetail)
}
ctx.JSON(http.StatusOK, map[string]interface{}{


+ 34
- 18
services/cloudbrain/clear.go View File

@@ -13,11 +13,22 @@ import (
)

func ClearCloudbrainResultSpace() {
log.Info("clear cloudbrain one result space begin.")
if !setting.ClearStrategy.Enabled{
return
}

tasks, err := models.GetCloudBrainOneStoppedJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
tasks, err := models.GetCloudBrainOneStoppedNotDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.BatchSize)
if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err)
return
}
debugTasks, err := models.GetCloudBrainOneStoppedDebugJobDaysAgo(setting.ClearStrategy.ResultSaveDays, setting.ClearStrategy.DebugJobSize)
if err != nil {
log.Warn("Failed to get debug cloudbrain.", err)

}
tasks=append(tasks,debugTasks...)

if err != nil {
log.Warn("Failed to get cloudbrain, clear result failed.", err)
@@ -37,11 +48,12 @@ func ClearCloudbrainResultSpace() {
log.Warn("Failed to set cloudbrain cleared status", err)
}
// Once the cloudbrain table has been fully processed, walk the MinIO objects to clean up any historical trash data that may exist.
if len(tasks) < setting.ClearStrategy.BatchSize {
if len(tasks) < setting.ClearStrategy.BatchSize+setting.ClearStrategy.DebugJobSize {
clearLocalHistoryTrashFile()
clearMinioHistoryTrashFile()

}
log.Info("clear cloudbrain one result space end.")

}

@@ -57,11 +69,15 @@ func clearMinioHistoryTrashFile() {
SortModTimeAscend(miniofiles)
for _, file := range miniofiles {

if file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
dirPath := setting.CBCodePathPrefix + file.Name() + "/"
log.Info("clear job in minio trash:"+file.Name())
storage.Attachments.DeleteDir(dirPath)
processCount++
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {

has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has {
dirPath := setting.CBCodePathPrefix + file.Name() + "/"
log.Info("clear job in minio trash:" + file.Name())
storage.Attachments.DeleteDir(dirPath)
processCount++
}
if processCount == setting.ClearStrategy.BatchSize {
break
}
@@ -83,10 +99,13 @@ func clearLocalHistoryTrashFile() {
SortModTimeAscend(files)
for _, file := range files {
// Clean up historical trash data older than n days by removing the job directory.
if file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
os.RemoveAll(setting.JobPath + file.Name())
log.Info("clear job in local trash:"+file.Name())
processCount++
if file.Name()!="" && file.ModTime().Before(time.Now().AddDate(0, 0, -setting.ClearStrategy.TrashSaveDays)) {
has,err:=models.IsCloudbrainExistByJobName(file.Name())
if err==nil && !has{
os.RemoveAll(setting.JobPath + file.Name())
log.Info("clear job in local trash:"+file.Name())
processCount++
}
if processCount == setting.ClearStrategy.BatchSize {
break
}
@@ -105,15 +124,12 @@ func SortModTimeAscend(files []os.FileInfo) {
return files[i].ModTime().Before(files[j].ModTime())
})
}
// SortModTimeAscendForMinio sorts files in place by modification time, oldest
// first. ModTime is parsed with the layout "2006-01-02 15:04:05"; a value that
// fails to parse is treated as the zero time and therefore sorts first.
func SortModTimeAscendForMinio(files []storage.FileInfo) {
	const layout = "2006-01-02 15:04:05"
	sort.Slice(files, func(i, j int) bool {
		// Parse errors are deliberately ignored: the zero time is used instead.
		timeI, _ := time.Parse(layout, files[i].ModTime)
		// Compare against element j (the original parsed element i twice,
		// which made the comparator always return false).
		timeJ, _ := time.Parse(layout, files[j].ModTime)
		return timeI.Before(timeJ)
	})
}

func DeleteCloudbrainOneJobStorage(jobName string) error {

if jobName==""{
return nil
}
//delete local
localJobPath := setting.JobPath + jobName
err := os.RemoveAll(localJobPath)


+ 2
- 46
services/cloudbrain/resource/resource_specification.go View File

@@ -246,10 +246,10 @@ func FindAvailableSpecs(userId int64, opts models.FindSpecsOptions) ([]*models.S
return nil, err
}
//filter exclusive specs
specs := filterExclusiveSpecs(r, userId)
specs := models.FilterExclusiveSpecs(r, userId)

//distinct by sourceSpecId
specs = distinctSpecs(specs)
specs = models.DistinctSpecs(specs)
return specs, err
}

@@ -265,50 +265,6 @@ func FindAvailableSpecs4Show(userId int64, opts models.FindSpecsOptions) ([]*api
return result, nil
}

// filterExclusiveSpecs returns the specs from r that the given user may use.
//
// A spec is kept when it is not exclusive, or when the user is a member of at
// least one of the organizations listed (separated by ";") in ExclusiveOrg.
// Specs whose ID was already kept are skipped, so each kept ID appears once.
func filterExclusiveSpecs(r []*models.Specification, userId int64) []*models.Specification {
	kept := make([]*models.Specification, 0, len(r))
	keptIDs := make(map[int64]struct{})
	for _, spec := range r {
		if _, dup := keptIDs[spec.ID]; dup {
			continue
		}

		allowed := !spec.IsExclusive
		if !allowed {
			for _, org := range strings.Split(spec.ExclusiveOrg, ";") {
				// Membership lookup errors are treated as "not a member".
				if isMember, _ := models.IsOrganizationMemberByOrgName(org, userId); isMember {
					allowed = true
					break
				}
			}
		}
		if !allowed {
			continue
		}

		kept = append(kept, spec)
		keptIDs[spec.ID] = struct{}{}
	}
	return kept
}

// distinctSpecs removes specs that repeat an earlier spec's SourceSpecId.
//
// Input order is preserved and the first spec seen for each non-empty
// SourceSpecId wins. Specs with an empty SourceSpecId are always kept.
func distinctSpecs(r []*models.Specification) []*models.Specification {
	result := make([]*models.Specification, 0, len(r))
	seenSourceIDs := make(map[string]struct{})
	for _, spec := range r {
		if id := spec.SourceSpecId; id != "" {
			if _, dup := seenSourceIDs[id]; dup {
				continue
			}
			seenSourceIDs[id] = struct{}{}
		}
		result = append(result, spec)
	}
	return result
}

func GetAndCheckSpec(userId int64, specId int64, opts models.FindSpecsOptions) (*models.Specification, error) {
if specId == 0 {
return nil, nil


Loading…
Cancel
Save