你确认删除该任务么?此任务一旦删除不可恢复。
+diff --git a/models/action.go b/models/action.go index 2a9d88399..9b92b4192 100755 --- a/models/action.go +++ b/models/action.go @@ -57,6 +57,7 @@ const ( ActionCreateInferenceTask // 28 ActionCreateBenchMarkTask //29 ActionCreateNewModelTask //30 + ActionCreateGPUTrainTask //31 ) // Action represents user operation type and other information to diff --git a/models/cloudbrain.go b/models/cloudbrain.go index ea6d0338e..8ffbace25 100755 --- a/models/cloudbrain.go +++ b/models/cloudbrain.go @@ -20,9 +20,17 @@ type CloudbrainStatus string type JobType string type ModelArtsJobStatus string +const ( + TypeCloudBrainOne int = iota + TypeCloudBrainTwo + + TypeCloudBrainAll = -1 +) + const ( NPUResource = "NPU" GPUResource = "CPU/GPU" + AllResource = "all" //notebook storage category EVSCategory = "EVS" @@ -87,6 +95,8 @@ const ( ModelArtsTrainJobCheckRunning ModelArtsJobStatus = "CHECK_RUNNING" //审核作业正在运行中 ModelArtsTrainJobCheckRunningCompleted ModelArtsJobStatus = "CHECK_RUNNING_COMPLETED" //审核作业已经完成 ModelArtsTrainJobCheckFailed ModelArtsJobStatus = "CHECK_FAILED" //审核作业失败 + + DURATION_STR_ZERO = "00:00:00" ) type Cloudbrain struct { @@ -174,7 +184,7 @@ func (task *Cloudbrain) ComputeAndSetDuration() { func ConvertDurationToStr(duration int64) string { if duration == 0 { - return "00:00:00" + return DURATION_STR_ZERO } return util.AddZero(duration/3600) + ":" + util.AddZero(duration%3600/60) + ":" + util.AddZero(duration%60) } @@ -1323,6 +1333,7 @@ func CloudbrainsVersionList(opts *CloudbrainsOptions) ([]*CloudbrainInfo, int, e } func CreateCloudbrain(cloudbrain *Cloudbrain) (err error) { + cloudbrain.TrainJobDuration = DURATION_STR_ZERO if _, err = x.Insert(cloudbrain); err != nil { return err } @@ -1467,6 +1478,15 @@ func GetCloudBrainUnStoppedJob() ([]*Cloudbrain, error) { Find(&cloudbrains) } +func GetStoppedJobWithNoDurationJob() ([]*Cloudbrain, error) { + cloudbrains := make([]*Cloudbrain, 0) + return cloudbrains, x. 
+ In("status", ModelArtsTrainJobCompleted, ModelArtsTrainJobFailed, ModelArtsTrainJobKilled, ModelArtsStopped, JobStopped, JobFailed, JobSucceeded). + Where("train_job_duration is null or train_job_duration = '' "). + Limit(100). + Find(&cloudbrains) +} + func GetCloudbrainCountByUserID(userID int64, jobType string) (int, error) { count, err := x.In("status", JobWaiting, JobRunning).And("job_type = ? and user_id = ? and type = ?", jobType, userID, TypeCloudBrainOne).Count(new(Cloudbrain)) return int(count), err diff --git a/models/file_chunk.go b/models/file_chunk.go index 76c926dc5..0fc3a8879 100755 --- a/models/file_chunk.go +++ b/models/file_chunk.go @@ -13,11 +13,6 @@ const ( FileUploaded ) -const ( - TypeCloudBrainOne int = iota - TypeCloudBrainTwo -) - type FileChunk struct { ID int64 `xorm:"pk autoincr"` UUID string `xorm:"uuid UNIQUE"` diff --git a/modules/auth/cloudbrain.go b/modules/auth/cloudbrain.go index 9949feddc..9d3d6290f 100755 --- a/modules/auth/cloudbrain.go +++ b/modules/auth/cloudbrain.go @@ -20,6 +20,9 @@ type CreateCloudBrainForm struct { ResourceSpecId int `form:"resource_spec_id" binding:"Required"` BenchmarkTypeID int `form:"benchmark_types_id"` BenchmarkChildTypeID int `form:"benchmark_child_types_id"` + BootFile string `form:"boot_file"` + Params string `form:"run_para_list"` + BranchName string `form:"branch_name"` } type CommitImageCloudBrainForm struct { diff --git a/modules/cloudbrain/cloudbrain.go b/modules/cloudbrain/cloudbrain.go index 54ac0c7ac..bd0f11507 100755 --- a/modules/cloudbrain/cloudbrain.go +++ b/modules/cloudbrain/cloudbrain.go @@ -15,14 +15,13 @@ import ( ) const ( - Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple; - service ssh stop; - jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` + Command = `pip3 install jupyterlab==2.2.5 -i 
https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` CodeMountPath = "/code" DataSetMountPath = "/dataset" ModelMountPath = "/model" + LogFile = "log.txt" BenchMarkMountPath = "/benchmark" BenchMarkResourceID = 1 Snn4imagenetMountPath = "/snn4imagenet" @@ -32,10 +31,13 @@ const ( SubTaskName = "task1" Success = "S000" + + DefaultBranchName = "master" ) var ( - ResourceSpecs *models.ResourceSpecs + ResourceSpecs *models.ResourceSpecs + TrainResourceSpecs *models.ResourceSpecs ) func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { @@ -147,7 +149,7 @@ func AdminOrJobCreaterRightForTrain(ctx *context.Context) { } -func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error { +func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid, codePath, modelPath, benchmarkPath, snn4imagenetPath, brainScorePath, jobType, gpuQueue, description, branchName, bootFile, params string, benchmarkTypeID, benchmarkChildTypeID, resourceSpecId int) error { dataActualPath := setting.Attachment.Minio.RealPath + setting.Attachment.Minio.Bucket + "/" + setting.Attachment.Minio.BasePath + @@ -155,13 +157,27 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, uuid var resourceSpec *models.ResourceSpec - if ResourceSpecs == nil { - json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) - } - for _, spec := range ResourceSpecs.ResourceSpec { - if 
resourceSpecId == spec.Id { - resourceSpec = spec + var versionCount int + if jobType == string(models.JobTypeTrain) { + versionCount = 1 + if TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &TrainResourceSpecs) } + for _, spec := range TrainResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } + } + } else { + if ResourceSpecs == nil { + json.Unmarshal([]byte(setting.ResourceSpecs), &ResourceSpecs) + } + for _, spec := range ResourceSpecs.ResourceSpec { + if resourceSpecId == spec.Id { + resourceSpec = spec + } + } + } if resourceSpec == nil { @@ -169,6 +185,15 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, return errors.New("no such resourceSpec") } + var datasetName string + attach, err := models.GetAttachmentByUUID(uuid) + if err != nil { + //for benchmark, do not return error + log.Error("GetAttachmentByUUID failed:%v", err) + } else { + datasetName = attach.Name + } + jobResult, err := CreateJob(jobName, models.CreateJobParams{ JobName: jobName, RetryCount: 1, @@ -263,6 +288,12 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, BenchmarkTypeID: benchmarkTypeID, BenchmarkChildTypeID: benchmarkChildTypeID, Description: description, + IsLatestVersion: "1", + VersionCount: versionCount, + BranchName: branchName, + BootFile: bootFile, + DatasetName: datasetName, + Parameters: params, }) if err != nil { @@ -278,6 +309,8 @@ func GenerateTask(ctx *context.Context, displayJobName, jobName, image, command, if string(models.JobTypeBenchmark) == jobType { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, models.ActionCreateBenchMarkTask) + } else if string(models.JobTypeTrain) == jobType { + notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, jobID, displayJobName, models.ActionCreateGPUTrainTask) } else { notification.NotifyOtherTask(ctx.User, ctx.Repo.Repository, stringId, displayJobName, 
models.ActionCreateDebugGPUTask) } diff --git a/modules/modelarts/modelarts.go b/modules/modelarts/modelarts.go index e30d0100c..538fcfbd9 100755 --- a/modules/modelarts/modelarts.go +++ b/modules/modelarts/modelarts.go @@ -56,7 +56,6 @@ const ( PerPage = 10 IsLatestVersion = "1" NotLatestVersion = "0" - DebugType = -1 VersionCount = 1 SortByCreateTime = "create_time" diff --git a/modules/setting/setting.go b/modules/setting/setting.go index 7dc8167bd..26f068193 100755 --- a/modules/setting/setting.go +++ b/modules/setting/setting.go @@ -452,16 +452,18 @@ var ( DecompressOBSTaskName string //cloudbrain config - CBAuthUser string - CBAuthPassword string - RestServerHost string - JobPath string - CBCodePathPrefix string - JobType string - GpuTypes string - DebugServerHost string - ResourceSpecs string - MaxDuration int64 + CBAuthUser string + CBAuthPassword string + RestServerHost string + JobPath string + CBCodePathPrefix string + JobType string + GpuTypes string + DebugServerHost string + ResourceSpecs string + MaxDuration int64 + TrainGpuTypes string + TrainResourceSpecs string //benchmark config IsBenchmarkEnabled bool @@ -1286,6 +1288,8 @@ func NewContext() { GpuTypes = sec.Key("GPU_TYPES").MustString("") ResourceSpecs = sec.Key("RESOURCE_SPECS").MustString("") MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) + TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") + TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") sec = Cfg.Section("benchmark") IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index b1a25494a..09bb5015f 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1022,7 +1022,8 @@ modelarts.train_job.parameter_value=Parameter Value modelarts.train_job.resource_setting=resource_setting modelarts.train_job.resource_setting_info=resource_setting_info modelarts.train_job.resource_pool=resource_pool 
-modelarts.train_job.resource_type=resource_type +modelarts.train_job.resource_type=Resource Type +modelarts.train_job.train_dataset=Train Dataset modelarts.train_job.standard=Standard modelarts.train_job.NAS_address=NAS Address modelarts.train_job.NAS_mount_path=NAS Mount Path @@ -2802,10 +2803,11 @@ reject_pull_request = `suggested changes for %s#%[2]s` upload_dataset=`upload dataset %s` task_gpudebugjob=`created CPU/GPU type debugging task%s` task_npudebugjob=`created NPU type debugging task %s` -task_trainjob=`created training task%s` +task_nputrainjob=`created NPU training task%s` task_inferencejob=`created reasoning task %s` task_benchmark=`created profiling task %s` task_createmodel=`created new model %s` +task_gputrainjob=`created CPU/GPU training task%s` [tool] ago = %s ago diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index ece7f7bdf..d26065363 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -2809,10 +2809,11 @@ reject_pull_request=`建议变更 %s#%[2]s` upload_dataset=`上传了数据集文件 %s` task_gpudebugjob=`创建了CPU/GPU类型调试任务 %s` task_npudebugjob=`创建了NPU类型调试任务 %s` -task_trainjob=`创建了训练任务 %s` +task_nputrainjob=`创建了NPU类型训练任务 %s` task_inferencejob=`创建了推理任务 %s` task_benchmark=`创建了评测任务 %s` task_createmodel=`导入了新模型 %s` +task_gputrainjob=`创建了CPU/GPU类型训练任务 %s` [tool] ago=%s前 diff --git a/public/home/home.js b/public/home/home.js old mode 100644 new mode 100755 index 7512a4423..478c70f21 --- a/public/home/home.js +++ b/public/home/home.js @@ -135,7 +135,7 @@ socket.onmessage = function (e) { html += recordPrefix + actionName; html += " " + getRepotext(record) + "" } - else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "30"){ + else if(record.OpType == "24" || record.OpType == "26" || record.OpType == "27" || record.OpType == "28" || record.OpType == "30" || record.OpType == "31"){ html += recordPrefix + actionName; html += " " + 
record.RefName + "" } @@ -175,6 +175,8 @@ function getTaskLink(record){ re = re + "/cloudbrain/benchmark/" + record.Content; }else if(record.OpType == 30){ re = re + "/modelmanage/show_model_info?name=" + record.RefName; + }else if(record.OpType == 31){ + re = re + "/cloudbrain/train-job/" + record.Content; } re = encodeURI(re); return re; @@ -321,10 +323,11 @@ var actionNameZH={ "24":"上传了数据集文件", "25":"创建了CPU/GPU类型调试任务", "26":"创建了NPU类型调试任务", - "27":"创建了训练任务", + "27":"创建了NPU类型训练任务", "28":"创建了推理任务", "29":"创建了评测任务", - "30":"导入了新模型" + "30":"导入了新模型", + "31":"创建了CPU/GPU类型训练任务" }; var actionNameEN={ @@ -346,10 +349,11 @@ var actionNameEN={ "24":" upload dataset ", "25":" created CPU/GPU type debugging task ", "26":" created NPU type debugging task ", - "27":" created training task", + "27":" created NPU type training task", "28":" created reasoning task", "29":" created profiling task", - "30":" created new model" + "30":" created new model", + "31":" created CPU/GPU type training task", }; var repoAndOrgZH={ diff --git a/routers/admin/cloudbrains.go b/routers/admin/cloudbrains.go old mode 100644 new mode 100755 index 6bbd534b9..884ed6b9b --- a/routers/admin/cloudbrains.go +++ b/routers/admin/cloudbrains.go @@ -41,7 +41,7 @@ func CloudBrains(ctx *context.Context) { if page <= 0 { page = 1 } - debugType := modelarts.DebugType + debugType := models.TypeCloudBrainAll if listType == models.GPUResource { debugType = models.TypeCloudBrainOne } else if listType == models.NPUResource { @@ -121,7 +121,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: 1, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: false, IsLatestVersion: modelarts.IsLatestVersion, }) @@ -151,7 +151,7 @@ func DownloadCloudBrains(ctx *context.Context) { Page: page, PageSize: pageSize, }, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, NeedRepoInfo: true, IsLatestVersion: modelarts.IsLatestVersion, }) diff --git a/routers/api/v1/api.go 
b/routers/api/v1/api.go index 306854af3..1868edcb5 100755 --- a/routers/api/v1/api.go +++ b/routers/api/v1/api.go @@ -62,10 +62,10 @@ import ( "net/http" "strings" - "code.gitea.io/gitea/routers/authentication" - "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/auth" + + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" @@ -77,6 +77,7 @@ import ( "code.gitea.io/gitea/routers/api/v1/repo" _ "code.gitea.io/gitea/routers/api/v1/swagger" // for swagger generation "code.gitea.io/gitea/routers/api/v1/user" + "code.gitea.io/gitea/routers/authentication" repo_ext "code.gitea.io/gitea/routers/repo" "gitea.com/macaron/binding" @@ -882,6 +883,13 @@ func RegisterRoutes(m *macaron.Macaron) { m.Group("/cloudbrain", func() { m.Get("/:id", repo.GetCloudbrainTask) m.Get("/:id/log", repo.CloudbrainGetLog) + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", repo.GetModelArtsTrainJobVersion) + m.Get("/model_list", repo.CloudBrainModelList) + m.Post("/stop_version", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo_ext.CloudBrainStop) + }) + }) }, reqRepoReader(models.UnitTypeCloudBrain)) m.Group("/modelarts", func() { m.Group("/notebook", func() { diff --git a/routers/api/v1/repo/cloudbrain.go b/routers/api/v1/repo/cloudbrain.go index b2f529dfb..d31943d42 100755 --- a/routers/api/v1/repo/cloudbrain.go +++ b/routers/api/v1/repo/cloudbrain.go @@ -6,16 +6,19 @@ package repo import ( - "code.gitea.io/gitea/modules/timeutil" + "encoding/json" "net/http" "sort" + "strings" "time" - "code.gitea.io/gitea/modules/log" - "code.gitea.io/gitea/models" "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" + "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/storage" + "code.gitea.io/gitea/modules/timeutil" + routerRepo "code.gitea.io/gitea/routers/repo" ) // cloudbrain get job task by jobid @@ -161,3 +164,55 @@ func 
CloudbrainGetLog(ctx *context.Context) { return } + +func CloudBrainModelList(ctx *context.APIContext) { + var ( + err error + ) + + var jobID = ctx.Params(":jobid") + var versionName = ctx.Query("version_name") + parentDir := ctx.Query("parentDir") + dirArray := strings.Split(parentDir, "/") + + task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) + if err != nil { + log.Error("GetCloudbrainByJobID(%s) failed:%v", task.JobName, err.Error()) + return + } + + //get dirs + dirs, err := routerRepo.GetModelDirs(task.JobName, parentDir) + if err != nil { + log.Error("GetModelDirs failed:%v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("GetModelDirs failed:", err) + return + } + + var fileInfos []storage.FileInfo + err = json.Unmarshal([]byte(dirs), &fileInfos) + if err != nil { + log.Error("json.Unmarshal failed:%v", err.Error(), ctx.Data["msgID"]) + ctx.ServerError("json.Unmarshal failed:", err) + return + } + + for i, fileInfo := range fileInfos { + temp, _ := time.Parse("2006-01-02 15:04:05", fileInfo.ModTime) + fileInfos[i].ModTime = temp.Local().Format("2006-01-02 15:04:05") + } + + sort.Slice(fileInfos, func(i, j int) bool { + return fileInfos[i].ModTime > fileInfos[j].ModTime + }) + + ctx.JSON(http.StatusOK, map[string]interface{}{ + "JobID": jobID, + "VersionName": versionName, + "StatusOK": 0, + "Path": dirArray, + "Dirs": fileInfos, + "task": task, + "PageIsCloudBrain": true, + }) +} diff --git a/routers/api/v1/repo/modelarts.go b/routers/api/v1/repo/modelarts.go index d7d011e07..e24ac95fb 100755 --- a/routers/api/v1/repo/modelarts.go +++ b/routers/api/v1/repo/modelarts.go @@ -6,16 +6,17 @@ package repo import ( - "code.gitea.io/gitea/modules/timeutil" "net/http" "strconv" "strings" "code.gitea.io/gitea/models" + "code.gitea.io/gitea/modules/cloudbrain" "code.gitea.io/gitea/modules/context" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/modelarts" "code.gitea.io/gitea/modules/storage" + 
"code.gitea.io/gitea/modules/timeutil" routerRepo "code.gitea.io/gitea/routers/repo" ) @@ -66,8 +67,8 @@ func GetModelArtsNotebook2(ctx *context.APIContext) { ctx.NotFound(err) return } - if job.StartTime == 0 && result.Lease.CreateTime > 0 { - job.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000) + if job.StartTime == 0 && result.Lease.UpdateTime > 0 { + job.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) } job.Status = result.Status if job.EndTime == 0 && models.IsModelArtsDebugJobTerminal(job.Status) { @@ -133,27 +134,61 @@ func GetModelArtsTrainJobVersion(ctx *context.APIContext) { ctx.NotFound(err) return } - result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) - if err != nil { - ctx.NotFound(err) - return - } - if job.StartTime == 0 && result.StartTime > 0 { - job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) - } - job.Status = modelarts.TransTrainJobStatus(result.IntStatus) - job.Duration = result.Duration / 1000 - job.TrainJobDuration = result.TrainJobDuration - job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) + if job.Type == models.TypeCloudBrainOne { + jobResult, err := cloudbrain.GetJob(job.JobID) + if err != nil { + ctx.NotFound(err) + log.Error("GetJob failed:", err) + return + } + result, err := models.ConvertToJobResultPayload(jobResult.Payload) + if err != nil { + ctx.NotFound(err) + log.Error("ConvertToJobResultPayload failed:", err) + return + } - if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { - job.EndTime = job.StartTime.Add(job.Duration) - } + job.Status = result.JobStatus.State + if result.JobStatus.State != string(models.JobWaiting) && result.JobStatus.State != string(models.JobFailed) { + taskRoles := result.TaskRoles + taskRes, _ := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) - err = models.UpdateTrainJobVersion(job) - if err != nil { - log.Error("UpdateJob failed:", err) + 
job.ContainerIp = taskRes.TaskStatuses[0].ContainerIP + job.ContainerID = taskRes.TaskStatuses[0].ContainerID + job.Status = taskRes.TaskStatuses[0].State + } + + if result.JobStatus.State != string(models.JobWaiting) { + err = models.UpdateJob(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } + } + } else { + result, err := modelarts.GetTrainJob(jobID, strconv.FormatInt(job.VersionID, 10)) + if err != nil { + ctx.NotFound(err) + return + } + + if job.StartTime == 0 && result.StartTime > 0 { + job.StartTime = timeutil.TimeStamp(result.StartTime / 1000) + } + job.Status = modelarts.TransTrainJobStatus(result.IntStatus) + job.Duration = result.Duration / 1000 + job.TrainJobDuration = result.TrainJobDuration + + job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) + + if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { + job.EndTime = job.StartTime.Add(job.Duration) + } + + err = models.UpdateTrainJobVersion(job) + if err != nil { + log.Error("UpdateJob failed:", err) + } } ctx.JSON(http.StatusOK, map[string]interface{}{ @@ -377,9 +412,7 @@ func GetModelArtsInferenceJob(ctx *context.APIContext) { } job.Status = modelarts.TransTrainJobStatus(result.IntStatus) job.Duration = result.Duration / 1000 - job.TrainJobDuration = result.TrainJobDuration - - job.TrainJobDuration = models.ConvertDurationToStr(result.Duration) + job.TrainJobDuration = models.ConvertDurationToStr(job.Duration) if job.EndTime == 0 && models.IsTrainJobTerminal(job.Status) && job.StartTime > 0 { job.EndTime = job.StartTime.Add(job.Duration) diff --git a/routers/private/internal.go b/routers/private/internal.go index 0dd725ca3..d80a706cc 100755 --- a/routers/private/internal.go +++ b/routers/private/internal.go @@ -6,6 +6,7 @@ package private import ( + "code.gitea.io/gitea/routers/repo" "strings" "code.gitea.io/gitea/modules/log" @@ -45,6 +46,7 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/tool/update_all_repo_commit_cnt", 
UpdateAllRepoCommitCnt) m.Post("/tool/repo_stat/:date", RepoStatisticManually) m.Post("/tool/update_repo_visit/:date", UpdateRepoVisit) + m.Post("/task/history_handle/duration", repo.HandleTaskWithNoDuration) }, CheckInternalToken) } diff --git a/routers/repo/cloudbrain.go b/routers/repo/cloudbrain.go index 0905efd54..438fb610e 100755 --- a/routers/repo/cloudbrain.go +++ b/routers/repo/cloudbrain.go @@ -37,6 +37,9 @@ const ( tplCloudBrainBenchmarkIndex base.TplName = "repo/cloudbrain/benchmark/index" tplCloudBrainBenchmarkNew base.TplName = "repo/cloudbrain/benchmark/new" tplCloudBrainBenchmarkShow base.TplName = "repo/cloudbrain/benchmark/show" + + tplCloudBrainTrainJobNew base.TplName = "repo/cloudbrain/trainjob/new" + tplCloudBrainTrainJobShow base.TplName = "repo/cloudbrain/trainjob/show" ) var ( @@ -45,6 +48,7 @@ var ( benchmarkTypes *models.BenchmarkTypes benchmarkGpuInfos *models.GpuInfos benchmarkResourceSpecs *models.ResourceSpecs + trainGpuInfos *models.GpuInfos ) const BENCHMARK_TYPE_CODE = "repo.cloudbrain.benchmark.types" @@ -143,6 +147,11 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { } ctx.Data["gpu_types"] = gpuInfos.GpuInfo + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + ctx.Data["train_gpu_types"] = trainGpuInfos.GpuInfo + if benchmarkGpuInfos == nil { json.Unmarshal([]byte(setting.BenchmarkGpuTypes), &benchmarkGpuInfos) } @@ -157,6 +166,14 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { json.Unmarshal([]byte(setting.ResourceSpecs), &cloudbrain.ResourceSpecs) } ctx.Data["resource_specs"] = cloudbrain.ResourceSpecs.ResourceSpec + + if cloudbrain.TrainResourceSpecs == nil { + json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) + } + ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec + ctx.Data["params"] = "" + ctx.Data["branchName"] = ctx.Repo.BranchName + ctx.Data["snn4imagenet_path"] = 
cloudbrain.Snn4imagenetMountPath ctx.Data["is_snn4imagenet_enabled"] = setting.IsSnn4imagenetEnabled @@ -184,38 +201,52 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { image := form.Image uuid := form.Attachment jobType := form.JobType - command := cloudbrain.Command gpuQueue := form.GpuType codePath := setting.JobPath + jobName + cloudbrain.CodeMountPath resourceSpecId := form.ResourceSpecId + branchName := form.BranchName repo := ctx.Repo.Repository - tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, string(models.JobTypeDebug), displayJobName) + tpl := tplCloudBrainNew + command := cloudbrain.Command + if jobType == string(models.JobTypeTrain) { + tpl = tplCloudBrainTrainJobNew + commandTrain, err := getTrainJobCommand(form) + if err != nil { + log.Error("getTrainJobCommand failed: %v", err) + ctx.RenderWithErr(err.Error(), tpl, &form) + return + } + + command = commandTrain + } + + tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) if err == nil { if len(tasks) != 0 { log.Error("the job name did already exist", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("the job name did already exist", tplCloudBrainNew, &form) + ctx.RenderWithErr("the job name did already exist", tpl, &form) return } } else { if !models.IsErrJobNotExist(err) { log.Error("system error, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } } if !jobNamePattern.MatchString(displayJobName) { - ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tplCloudBrainNew, &form) + ctx.RenderWithErr(ctx.Tr("repo.cloudbrain_jobname_err"), tpl, &form) return } - if jobType != string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) { + if jobType != 
string(models.JobTypeBenchmark) && jobType != string(models.JobTypeDebug) && jobType != string(models.JobTypeSnn4imagenet) && jobType != string(models.JobTypeBrainScore) && jobType != string(models.JobTypeTrain) { log.Error("jobtype error:", jobType, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("jobtype error", tplCloudBrainNew, &form) + ctx.RenderWithErr("jobtype error", tpl, &form) return } @@ -223,18 +254,21 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { if err != nil { log.Error("GetCloudbrainCountByUserID failed:%v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("system error", tplCloudBrainNew, &form) + ctx.RenderWithErr("system error", tpl, &form) return } else { if count >= 1 { log.Error("the user already has running or waiting task", ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr("you have already a running or waiting task, can not create more", tplCloudBrainNew, &form) + ctx.RenderWithErr("you have already a running or waiting task, can not create more", tpl, &form) return } } - downloadCode(repo, codePath) + if branchName == "" { + branchName = cloudbrain.DefaultBranchName + } + downloadCode(repo, codePath, branchName) uploadCodeToMinio(codePath+"/", jobName, cloudbrain.CodeMountPath+"/") modelPath := setting.JobPath + jobName + cloudbrain.ModelMountPath + "/" @@ -268,15 +302,19 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, uuid, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, + storage.GetMinioPath(jobName, 
cloudbrain.BrainScoreMountPath+"/"), jobType, gpuQueue, form.Description, branchName, form.BootFile, form.Params, 0, 0, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) - ctx.RenderWithErr(err.Error(), tplCloudBrainNew, &form) + ctx.RenderWithErr(err.Error(), tpl, &form) return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") + if jobType == string(models.JobTypeTrain) { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=all") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/debugjob?debugListType=all") + } } func CloudBrainRestart(ctx *context.Context) { @@ -342,18 +380,29 @@ func CloudBrainRestart(ctx *context.Context) { } func CloudBrainBenchMarkShow(ctx *context.Context) { - cloudBrainShow(ctx, tplCloudBrainBenchmarkShow) + cloudBrainShow(ctx, tplCloudBrainBenchmarkShow, models.JobTypeBenchmark) } func CloudBrainShow(ctx *context.Context) { - cloudBrainShow(ctx, tplCloudBrainShow) + cloudBrainShow(ctx, tplCloudBrainShow, models.JobTypeDebug) } -func cloudBrainShow(ctx *context.Context, tpName base.TplName) { +func CloudBrainTrainJobShow(ctx *context.Context) { + cloudBrainShow(ctx, tplCloudBrainTrainJobShow, models.JobTypeTrain) +} + +func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.JobType) { ctx.Data["PageIsCloudBrain"] = true - var ID = ctx.Params(":id") debugListType := ctx.Query("debugListType") - task, err := models.GetCloudbrainByID(ID) + + var task *models.Cloudbrain + var err error + if jobType == models.JobTypeTrain { + task, err = models.GetCloudbrainByJobID(ctx.Params(":jobid")) + } else { + task, err = models.GetCloudbrainByID(ctx.Params(":id")) + } + if err != nil { log.Info("error:" + err.Error()) ctx.Data["error"] = err.Error() @@ -368,6 +417,16 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { jobRes.Resource.Memory = strings.ReplaceAll(jobRes.Resource.Memory, "Mi", "MB") spec := "GPU数:" + 
strconv.Itoa(jobRes.Resource.NvidiaComGpu) + ",CPU数:" + strconv.Itoa(jobRes.Resource.CPU) + ",内存(MB):" + jobRes.Resource.Memory ctx.Data["resource_spec"] = spec + if task.JobType == string(models.JobTypeTrain) { + if trainGpuInfos == nil { + json.Unmarshal([]byte(setting.TrainGpuTypes), &trainGpuInfos) + } + for _, resourceType := range trainGpuInfos.GpuInfo { + if resourceType.Queue == jobRes.Config.GpuType { + ctx.Data["resource_type"] = resourceType.Value + } + } + } taskRoles := jobRes.TaskRoles if jobRes.JobStatus.State != string(models.JobFailed) { @@ -419,15 +478,41 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { } } if task.TrainJobDuration == "" { - var duration int64 - if task.Status == string(models.JobRunning) { - duration = time.Now().Unix() - int64(task.CreatedUnix) - } else { - duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) + if task.Duration == 0 { + var duration int64 + if task.Status == string(models.JobRunning) { + duration = time.Now().Unix() - int64(task.CreatedUnix) + } else { + duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) + } + task.Duration = duration } - task.TrainJobDuration = models.ConvertDurationToStr(duration) + task.TrainJobDuration = models.ConvertDurationToStr(task.Duration) } ctx.Data["duration"] = task.TrainJobDuration + + if len(task.Parameters) > 0 { + var parameters models.Parameters + + err := json.Unmarshal([]byte(task.Parameters), &parameters) + if err != nil { + log.Error("Failed to Unmarshal Parameters: %s (%v)", task.Parameters, err) + task.Parameters = "" + } else { + if len(parameters.Parameter) > 0 { + paramTemp := "" + for _, Parameter := range parameters.Parameter { + param := Parameter.Label + " = " + Parameter.Value + "; " + paramTemp = paramTemp + param + } + task.Parameters = paramTemp[:len(paramTemp)-2] + } else { + task.Parameters = "" + } + } + + } + ctx.Data["task"] = task ctx.Data["jobName"] = task.JobName ctx.Data["displayJobName"] = task.DisplayJobName @@ 
-435,6 +520,7 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName) { version_list_task = append(version_list_task, task) ctx.Data["version_list_task"] = version_list_task ctx.Data["debugListType"] = debugListType + ctx.Data["canDownload"] = cloudbrain.CanDeleteJob(ctx, task) ctx.HTML(200, tpName) } @@ -506,11 +592,12 @@ func CloudBrainStop(ctx *context.Context) { break } - ctx.JSON(200, map[string]string{ + ctx.JSON(200, map[string]interface{}{ "result_code": resultCode, "error_msg": errorMsg, "status": status, "id": ID, + "StatusOK": 0, }) } @@ -762,8 +849,8 @@ func GetRate(ctx *context.Context) { } } -func downloadCode(repo *models.Repository, codePath string) error { - if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{}); err != nil { +func downloadCode(repo *models.Repository, codePath, branchName string) error { + if err := git.Clone(repo.RepoPath(), codePath, git.CloneRepoOptions{Branch: branchName}); err != nil { log.Error("Failed to clone repository: %s (%v)", repo.FullName(), err) return err } @@ -1011,8 +1098,8 @@ func SyncCloudbrainStatus() { if result != nil { task.Status = result.Status - if task.StartTime == 0 && result.Lease.CreateTime > 0 { - task.StartTime = timeutil.TimeStamp(result.Lease.CreateTime / 1000) + if task.StartTime == 0 && result.Lease.UpdateTime > 0 { + task.StartTime = timeutil.TimeStamp(result.Lease.UpdateTime / 1000) } if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { task.EndTime = timeutil.TimeStampNow() @@ -1062,6 +1149,156 @@ func SyncCloudbrainStatus() { return } +func HandleTaskWithNoDuration(ctx *context.Context) { + log.Info("HandleTaskWithNoDuration start") + count := 0 + for { + cloudBrains, err := models.GetStoppedJobWithNoDurationJob() + if err != nil { + log.Error("HandleTaskWithNoTrainJobDuration failed:", err.Error()) + break + } + if len(cloudBrains) == 0 { + log.Info("HandleTaskWithNoTrainJobDuration:no task need handle") + break + } + 
handleNoDurationTask(cloudBrains) + count += len(cloudBrains) + if len(cloudBrains) < 100 { + log.Info("HandleTaskWithNoTrainJobDuration:task less than 100") + break + } + } + log.Info("HandleTaskWithNoTrainJobDuration:count=%d", count) + ctx.JSON(200, "success") +} + +func handleNoDurationTask(cloudBrains []*models.Cloudbrain) { + for _, task := range cloudBrains { + log.Info("Handle job ,%+v", task) + if task.Type == models.TypeCloudBrainOne { + result, err := cloudbrain.GetJob(task.JobID) + if err != nil { + log.Error("GetJob(%s) failed:%v", task.JobName, err) + updateDefaultDuration(task) + continue + } + + if result != nil { + if result.Msg != "success" { + updateDefaultDuration(task) + continue + } + jobRes, err := models.ConvertToJobResultPayload(result.Payload) + if err != nil || len(jobRes.TaskRoles) == 0 { + updateDefaultDuration(task) + continue + } + taskRoles := jobRes.TaskRoles + taskRes, err := models.ConvertToTaskPod(taskRoles[cloudbrain.SubTaskName].(map[string]interface{})) + if err != nil || len(taskRes.TaskStatuses) == 0 { + updateDefaultDuration(task) + continue + } + task.Status = taskRes.TaskStatuses[0].State + startTime := taskRes.TaskStatuses[0].StartAt.Unix() + endTime := taskRes.TaskStatuses[0].FinishedAt.Unix() + log.Info("task startTime = %v endTime= %v ,jobId=%d", startTime, endTime, task.ID) + if startTime > 0 { + task.StartTime = timeutil.TimeStamp(startTime) + } else { + task.StartTime = task.CreatedUnix + } + if endTime > 0 { + task.EndTime = timeutil.TimeStamp(endTime) + } else { + task.EndTime = task.UpdatedUnix + } + + if task.EndTime < task.StartTime { + log.Info("endTime[%v] is less than starTime[%v],jobId=%d", task.EndTime, task.StartTime, task.ID) + st := task.StartTime + task.StartTime = task.EndTime + task.EndTime = st + } + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + } + } + } else if task.Type == models.TypeCloudBrainTwo { + if 
task.JobType == string(models.JobTypeDebug) { + //result, err := modelarts.GetJob(task.JobID) + result, err := modelarts.GetNotebook2(task.JobID) + if err != nil { + log.Error("GetJob(%s) failed:%v", task.JobName, err) + task.StartTime = task.CreatedUnix + task.EndTime = task.UpdatedUnix + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + } + continue + } + + if result != nil { + task.Status = result.Status + startTime := result.Lease.CreateTime + duration := result.Lease.Duration / 1000 + if startTime > 0 { + task.StartTime = timeutil.TimeStamp(startTime) + task.EndTime = task.StartTime.Add(duration) + } + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + continue + } + } + } else if task.JobType == string(models.JobTypeTrain) { + result, err := modelarts.GetTrainJob(task.JobID, strconv.FormatInt(task.VersionID, 10)) + if err != nil { + log.Error("GetTrainJob(%s) failed:%v", task.JobName, err) + continue + } + + if result != nil { + startTime := result.StartTime / 1000 + if startTime > 0 { + task.StartTime = timeutil.TimeStamp(startTime) + task.EndTime = task.StartTime.Add(result.Duration / 1000) + } + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + continue + } + } + } else { + log.Error("task.JobType(%s) is error:%s", task.JobName, task.JobType) + } + + } else { + log.Error("task.Type(%s) is error:%d", task.JobName, task.Type) + } + } +} + +func updateDefaultDuration(task *models.Cloudbrain) { + log.Info("updateDefaultDuration: taskId=%d", task.ID) + task.StartTime = task.CreatedUnix + task.EndTime = task.UpdatedUnix + task.ComputeAndSetDuration() + err := models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err) + } +} + func CloudBrainBenchmarkIndex(ctx 
*context.Context) { MustEnableCloudbrain(ctx) repo := ctx.Repo.Repository @@ -1090,13 +1327,16 @@ func CloudBrainBenchmarkIndex(ctx *context.Context) { ciTasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) ciTasks[i].Cloudbrain.ComputeResource = task.ComputeResource if ciTasks[i].TrainJobDuration == "" { - var duration int64 - if task.Status == string(models.JobRunning) { - duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix) - } else { - duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix) + if ciTasks[i].Duration == 0 { + var duration int64 + if task.Status == string(models.JobRunning) { + duration = time.Now().Unix() - int64(task.Cloudbrain.CreatedUnix) + } else { + duration = int64(task.Cloudbrain.UpdatedUnix) - int64(task.Cloudbrain.CreatedUnix) + } + ciTasks[i].Duration = duration } - ciTasks[i].TrainJobDuration = models.ConvertDurationToStr(duration) + ciTasks[i].TrainJobDuration = models.ConvertDurationToStr(ciTasks[i].Duration) } ciTasks[i].BenchmarkTypeName = "" @@ -1315,7 +1555,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF } os.RemoveAll(codePath) - if err := downloadCode(repo, codePath); err != nil { + if err := downloadCode(repo, codePath, cloudbrain.DefaultBranchName); err != nil { log.Error("downloadCode failed, %v", err, ctx.Data["MsgID"]) cloudBrainNewDataPrepare(ctx) ctx.RenderWithErr("system error", tplCloudBrainBenchmarkNew, &form) @@ -1380,7 +1620,7 @@ func CloudBrainBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainF err = cloudbrain.GenerateTask(ctx, displayJobName, jobName, image, command, childInfo.Attachment, storage.GetMinioPath(jobName, cloudbrain.CodeMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.ModelMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.BenchMarkMountPath+"/"), storage.GetMinioPath(jobName, cloudbrain.Snn4imagenetMountPath+"/"), - storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), 
string(models.JobTypeBenchmark), gpuQueue, form.Description, + storage.GetMinioPath(jobName, cloudbrain.BrainScoreMountPath+"/"), string(models.JobTypeBenchmark), gpuQueue, form.Description, cloudbrain.DefaultBranchName, "", "", benchmarkTypeID, benchmarkChildTypeID, resourceSpecId) if err != nil { cloudBrainNewDataPrepare(ctx) @@ -1406,10 +1646,66 @@ func BenchmarkDel(ctx *context.Context) { } } +func CloudBrainTrainJobNew(ctx *context.Context) { + err := cloudBrainNewDataPrepare(ctx) + if err != nil { + ctx.ServerError("get new train-job info failed", err) + return + } + ctx.HTML(http.StatusOK, tplCloudBrainTrainJobNew) +} + +func getTrainJobCommand(form auth.CreateCloudBrainForm) (string, error) { + var command string + bootFile := form.BootFile + params := form.Params + + if !strings.HasSuffix(bootFile, ".py") { + log.Error("bootFile(%s) format error", bootFile) + return command, errors.New("bootFile format error") + } + + var parameters models.Parameters + var param string + if len(params) != 0 { + err := json.Unmarshal([]byte(params), ¶meters) + if err != nil { + log.Error("Failed to Unmarshal params: %s (%v)", params, err) + return command, err + } + + for _, parameter := range parameters.Parameter { + param += " --" + parameter.Label + "=" + parameter.Value + } + } + + command += "python /code/" + bootFile + param + " > " + cloudbrain.ModelMountPath + "/" + form.DisplayJobName + "-" + cloudbrain.LogFile + + return command, nil +} + +func CloudBrainTrainJobDel(ctx *context.Context) { + var listType = ctx.Query("listType") + if err := deleteCloudbrainJob(ctx); err != nil { + log.Error("deleteCloudbrainJob failed: %v", err, ctx.Data["msgID"]) + ctx.ServerError(err.Error(), err) + return + } + + var isAdminPage = ctx.Query("isadminpage") + if ctx.IsUserSiteAdmin() && isAdminPage == "true" { + ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") + } else { + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) 
+ } +} + func GetBenchmarkTypes(ctx *context.Context) *models.BenchmarkTypes { var lang = ctx.Locale.Language() if benchmarkTypesMap[lang] == nil { var val = i18n.Tr(lang, BENCHMARK_TYPE_CODE) + //use config + val = setting.BenchmarkTypes var tempType *models.BenchmarkTypes if err := json.Unmarshal([]byte(val), &tempType); err != nil { log.Error("json.Unmarshal BenchmarkTypes(%s) failed:%v", val, err, ctx.Data["MsgID"]) diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 7d4c203bc..5a2e4691e 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -47,20 +47,26 @@ const ( ) func DebugJobIndex(ctx *context.Context) { - debugListType := ctx.Query("debugListType") - ctx.Data["ListType"] = debugListType + listType := ctx.Query("debugListType") + ctx.Data["ListType"] = listType MustEnableCloudbrain(ctx) repo := ctx.Repo.Repository page := ctx.QueryInt("page") if page <= 0 { page = 1 } - debugType := modelarts.DebugType + typeCloudBrain := models.TypeCloudBrainAll jobTypeNot := false - if debugListType == models.GPUResource { - debugType = models.TypeCloudBrainOne - } else if debugListType == models.NPUResource { - debugType = models.TypeCloudBrainTwo + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } else { + log.Error("listType(%s) error", listType) + ctx.ServerError("listType error", errors.New("listType error")) + return } var jobTypes []string @@ -71,7 +77,7 @@ func DebugJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: debugType, + Type: typeCloudBrain, JobTypeNot: jobTypeNot, JobTypes: jobTypes, }) @@ -93,7 +99,7 @@ func DebugJobIndex(ctx *context.Context) { ctx.Data["Tasks"] = ciTasks ctx.Data["CanCreate"] = cloudbrain.CanCreateOrDebugJob(ctx) ctx.Data["RepoIsEmpty"] = 
repo.IsEmpty - ctx.Data["debugListType"] = debugListType + ctx.Data["debugListType"] = listType ctx.HTML(200, tplDebugJobIndex) } @@ -410,20 +416,45 @@ func NotebookManage(ctx *context.Context) { break } - task.Status = res.Status - if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { - task.EndTime = timeutil.TimeStampNow() - } - task.ComputeAndSetDuration() - err = models.UpdateJob(task) - if err != nil { - log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) - resultCode = "-1" - errorMsg = "system error" - break - } + status = res.Status + if action == models.ActionStart { + newTask := &models.Cloudbrain{ + Status: status, + UserID: task.UserID, + RepoID: task.RepoID, + JobID: task.JobID, + JobName: task.JobName, + DisplayJobName: task.DisplayJobName, + JobType: task.JobType, + Type: task.Type, + Uuid: task.Uuid, + Image: task.Image, + ComputeResource: task.ComputeResource, + Description: task.Description, + } - status = task.Status + err = models.RestartCloudbrain(task, newTask) + if err != nil { + log.Error("RestartCloudbrain(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + ID = strconv.FormatInt(newTask.ID, 10) + } else { + task.Status = res.Status + if task.EndTime == 0 && models.IsModelArtsDebugJobTerminal(task.Status) { + task.EndTime = timeutil.TimeStampNow() + } + task.ComputeAndSetDuration() + err = models.UpdateJob(task) + if err != nil { + log.Error("UpdateJob(%s) failed:%v", task.JobName, err.Error(), ctx.Data["MsgID"]) + resultCode = "-1" + errorMsg = "system error" + break + } + } break } @@ -480,6 +511,26 @@ func TrainJobIndex(ctx *context.Context) { page = 1 } + listType := ctx.Query("listType") + if len(listType) == 0 { + listType = models.AllResource + } + ctx.Data["ListType"] = listType + + typeCloudBrain := models.TypeCloudBrainAll + if listType == models.GPUResource { + typeCloudBrain = models.TypeCloudBrainOne + } 
else if listType == models.NPUResource { + typeCloudBrain = models.TypeCloudBrainTwo + } else if listType == models.AllResource { + typeCloudBrain = models.TypeCloudBrainAll + } + //else { + // log.Error("listType(%s) error", listType) + // ctx.ServerError("listType error", errors.New("listType error")) + // return + //} + var jobTypes []string jobTypes = append(jobTypes, string(models.JobTypeTrain)) tasks, count, err := models.Cloudbrains(&models.CloudbrainsOptions{ @@ -488,7 +539,7 @@ func TrainJobIndex(ctx *context.Context) { PageSize: setting.UI.IssuePagingNum, }, RepoID: repo.ID, - Type: models.TypeCloudBrainTwo, + Type: typeCloudBrain, JobTypeNot: false, JobTypes: jobTypes, IsLatestVersion: modelarts.IsLatestVersion, @@ -501,11 +552,16 @@ func TrainJobIndex(ctx *context.Context) { for i, task := range tasks { tasks[i].CanDel = cloudbrain.CanDeleteJob(ctx, &task.Cloudbrain) tasks[i].CanModify = cloudbrain.CanModifyJob(ctx, &task.Cloudbrain) - tasks[i].ComputeResource = models.NPUResource + if task.Cloudbrain.Type == models.TypeCloudBrainOne { + tasks[i].ComputeResource = models.GPUResource + } else if task.Cloudbrain.Type == models.TypeCloudBrainTwo { + tasks[i].ComputeResource = models.NPUResource + } } pager := context.NewPagination(int(count), setting.UI.IssuePagingNum, page, 5) pager.SetDefaultParams(ctx) + pager.AddParam(ctx, "listType", "ListType") ctx.Data["Page"] = pager ctx.Data["PageIsCloudBrain"] = true @@ -1555,6 +1611,7 @@ func trainJobGetLog(jobID string) (*models.GetTrainJobLogFileNamesResult, *model func TrainJobDel(ctx *context.Context) { var jobID = ctx.Params(":jobid") + var listType = ctx.Query("listType") repo := ctx.Repo.Repository var jobTypes []string @@ -1596,12 +1653,13 @@ func TrainJobDel(ctx *context.Context) { if ctx.IsUserSiteAdmin() && isAdminPage == "true" { ctx.Redirect(setting.AppSubURL + "/admin" + "/cloudbrains") } else { - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + 
ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) } } func TrainJobStop(ctx *context.Context) { var jobID = ctx.Params(":jobid") + var listType = ctx.Query("listType") task := ctx.Cloudbrain _, err := modelarts.StopTrainJob(jobID, strconv.FormatInt(task.VersionID, 10)) @@ -1611,7 +1669,7 @@ func TrainJobStop(ctx *context.Context) { return } - ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") + ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job?listType=" + listType) } func canUserCreateTrainJob(uid int64) (bool, error) { @@ -2276,7 +2334,7 @@ func SetJobCount(ctx *context.Context) { repoId := ctx.Repo.Repository.ID _, jobCount, err := models.Cloudbrains(&models.CloudbrainsOptions{ RepoID: repoId, - Type: modelarts.DebugType, + Type: models.TypeCloudBrainAll, }) if err != nil { ctx.ServerError("Get job faild:", err) diff --git a/routers/routes/routes.go b/routers/routes/routes.go index 743572b21..4cffcd10b 100755 --- a/routers/routes/routes.go +++ b/routers/routes/routes.go @@ -1038,6 +1038,19 @@ func RegisterRoutes(m *macaron.Macaron) { m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainBenchmarkCreate) m.Get("/get_child_types", repo.GetChildTypes) }) + + m.Group("/train-job", func() { + m.Group("/:jobid", func() { + m.Get("", reqRepoCloudBrainReader, repo.CloudBrainTrainJobShow) + m.Post("/del", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainTrainJobDel) + //m.Get("/models", reqRepoCloudBrainReader, repo.CloudBrainShowModels) + m.Get("/download_model", cloudbrain.AdminOrOwnerOrJobCreaterRightForTrain, repo.CloudBrainDownloadModel) + //m.Get("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, repo.TrainJobNewVersion) + //m.Post("/create_version", reqWechatBind, cloudbrain.AdminOrJobCreaterRightForTrain, bindIgnErr(auth.CreateModelArtsTrainJobForm{}), 
repo.TrainJobCreateVersion) + }) + m.Get("/create", reqWechatBind, reqRepoCloudBrainWriter, repo.CloudBrainTrainJobNew) + m.Post("/create", reqWechatBind, reqRepoCloudBrainWriter, bindIgnErr(auth.CreateCloudBrainForm{}), repo.CloudBrainCreate) + }) }, context.RepoRef()) m.Group("/modelmanage", func() { m.Post("/create_model", reqRepoModelManageWriter, repo.SaveModel) diff --git a/services/socketwrap/clientManager.go b/services/socketwrap/clientManager.go old mode 100644 new mode 100755 index 98b0e0aa9..61f356a66 --- a/services/socketwrap/clientManager.go +++ b/services/socketwrap/clientManager.go @@ -10,7 +10,7 @@ import ( "github.com/elliotchance/orderedmap" ) -var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 25, 26, 27, 28, 29, 30} +var opTypes = []int{1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 22, 23, 25, 26, 27, 28, 29, 30, 31} type ClientsManager struct { Clients *orderedmap.OrderedMap diff --git a/templates/custom/select_dataset.tmpl b/templates/custom/select_dataset.tmpl index 17e4eee42..dc5ca6c9e 100644 --- a/templates/custom/select_dataset.tmpl +++ b/templates/custom/select_dataset.tmpl @@ -3,7 +3,11 @@
| + {{$.i18n.Tr "repo.cloudbrain_task"}} + | +
+
+ {{.DisplayJobName}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.status"}} + | + +
+
+ {{.Status}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.start_time"}} + | + +
+
+ {{TimeSinceUnix1 .CreatedUnix}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.dura_time"}} + | + +
+
+ {{$.duration}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.resource_type"}} + | + +
+
+ {{$.resource_type}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.standard"}} + | + +
+
+ {{$.resource_spec}}
+
+ |
+
| + 镜像 + | + +
+
+ {{.Image}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.code_version"}} + | + +
+
+ {{.BranchName}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.start_file"}} + | + +
+
+ {{.BootFile}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.train_dataset"}} + | + +
+
+ {{.DatasetName}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.run_parameter"}} + | + +
+
+ {{.Parameters}}
+
+ |
+
| + {{$.i18n.Tr "repo.modelarts.train_job.description"}} + | + +
+
+ {{.Description}}
+
+ |
+
你确认删除该任务么?此任务一旦删除不可恢复。
+