| @@ -570,11 +570,12 @@ type SpecialPools struct { | |||||
| Pools []*SpecialPool `json:"pools"` | Pools []*SpecialPool `json:"pools"` | ||||
| } | } | ||||
| type SpecialPool struct { | type SpecialPool struct { | ||||
| Org string `json:"org"` | |||||
| Type string `json:"type"` | |||||
| IsExclusive bool `json:"isExclusive"` | |||||
| Pool []*GpuInfo `json:"pool"` | |||||
| JobType []string `json:"jobType"` | |||||
| Org string `json:"org"` | |||||
| Type string `json:"type"` | |||||
| IsExclusive bool `json:"isExclusive"` | |||||
| Pool []*GpuInfo `json:"pool"` | |||||
| JobType []string `json:"jobType"` | |||||
| ResourceSpec []*ResourceSpec `json:"resourceSpecs"` | |||||
| } | } | ||||
| type ImageInfosModelArts struct { | type ImageInfosModelArts struct { | ||||
| @@ -17,7 +17,7 @@ import ( | |||||
| ) | ) | ||||
| const ( | const ( | ||||
| Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` | |||||
| //Command = `pip3 install jupyterlab==2.2.5 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;jupyter lab --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --LabApp.token="" --LabApp.allow_origin="self https://cloudbrain.pcl.ac.cn"` | |||||
| //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` | //CommandBenchmark = `echo "start benchmark";python /code/test.py;echo "end benchmark"` | ||||
| CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` | CommandBenchmark = `echo "start benchmark";cd /benchmark && bash run_bk.sh;echo "end benchmark"` | ||||
| CodeMountPath = "/code" | CodeMountPath = "/code" | ||||
| @@ -42,6 +42,7 @@ const ( | |||||
| var ( | var ( | ||||
| ResourceSpecs *models.ResourceSpecs | ResourceSpecs *models.ResourceSpecs | ||||
| TrainResourceSpecs *models.ResourceSpecs | TrainResourceSpecs *models.ResourceSpecs | ||||
| SpecialPools *models.SpecialPools | |||||
| ) | ) | ||||
| type GenerateCloudBrainTaskReq struct { | type GenerateCloudBrainTaskReq struct { | ||||
| @@ -70,6 +71,11 @@ type GenerateCloudBrainTaskReq struct { | |||||
| ResourceSpecId int | ResourceSpecId int | ||||
| } | } | ||||
| func GetCloudbrainDebugCommand() string { | |||||
| var command = `pip3 install jupyterlab==3 -i https://pypi.tuna.tsinghua.edu.cn/simple;service ssh stop;/usr/local/bin/python /usr/local/bin/jupyter-lab --ServerApp.shutdown_no_activity_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_inactive_timeout=` + setting.CullIdleTimeout + ` --TerminalManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_idle_timeout=` + setting.CullIdleTimeout + ` --MappingKernelManager.cull_interval=` + setting.CullInterval + ` --MappingKernelManager.cull_connected=True --MappingKernelManager.cull_busy=True --no-browser --ip=0.0.0.0 --allow-root --notebook-dir="/code" --port=80 --ServerApp.token="" --ServerApp.allow_origin="self https://cloudbrain.pcl.ac.cn" ` | |||||
| return command | |||||
| } | |||||
| func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { | func isAdminOrOwnerOrJobCreater(ctx *context.Context, job *models.Cloudbrain, err error) bool { | ||||
| if !ctx.IsSigned { | if !ctx.IsSigned { | ||||
| return false | return false | ||||
| @@ -222,6 +228,7 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
| for _, spec := range TrainResourceSpecs.ResourceSpec { | for _, spec := range TrainResourceSpecs.ResourceSpec { | ||||
| if req.ResourceSpecId == spec.Id { | if req.ResourceSpecId == spec.Id { | ||||
| resourceSpec = spec | resourceSpec = spec | ||||
| break | |||||
| } | } | ||||
| } | } | ||||
| } else { | } else { | ||||
| @@ -231,10 +238,29 @@ func GenerateTask(req GenerateCloudBrainTaskReq) error { | |||||
| for _, spec := range ResourceSpecs.ResourceSpec { | for _, spec := range ResourceSpecs.ResourceSpec { | ||||
| if req.ResourceSpecId == spec.Id { | if req.ResourceSpecId == spec.Id { | ||||
| resourceSpec = spec | resourceSpec = spec | ||||
| break | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
| //如果没有匹配到spec信息,尝试从专属资源池获取 | |||||
| if resourceSpec == nil && SpecialPools != nil { | |||||
| for _, specialPool := range SpecialPools.Pools { | |||||
| if resourceSpec != nil { | |||||
| break | |||||
| } | |||||
| if specialPool.ResourceSpec != nil { | |||||
| if IsElementExist(specialPool.JobType, req.JobType) && IsQueueInSpecialtPool(specialPool.Pool, req.GpuQueue) { | |||||
| for _, spec := range specialPool.ResourceSpec { | |||||
| if req.ResourceSpecId == spec.Id { | |||||
| resourceSpec = spec | |||||
| break | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| if resourceSpec == nil { | if resourceSpec == nil { | ||||
| log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) | log.Error("no such resourceSpecId(%d)", req.ResourceSpecId, req.Ctx.Data["MsgID"]) | ||||
| @@ -486,7 +512,7 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| GPUNumber: resourceSpec.GpuNum, | GPUNumber: resourceSpec.GpuNum, | ||||
| MemoryMB: resourceSpec.MemMiB, | MemoryMB: resourceSpec.MemMiB, | ||||
| ShmMB: resourceSpec.ShareMemMiB, | ShmMB: resourceSpec.ShareMemMiB, | ||||
| Command: Command, | |||||
| Command: GetCloudbrainDebugCommand(),//Command, | |||||
| NeedIBDevice: false, | NeedIBDevice: false, | ||||
| IsMainRole: false, | IsMainRole: false, | ||||
| UseNNI: false, | UseNNI: false, | ||||
| @@ -538,3 +564,39 @@ func RestartTask(ctx *context.Context, task *models.Cloudbrain, newID *string) e | |||||
| return nil | return nil | ||||
| } | } | ||||
| func InitSpecialPool() { | |||||
| if SpecialPools == nil && setting.SpecialPools != "" { | |||||
| json.Unmarshal([]byte(setting.SpecialPools), &SpecialPools) | |||||
| } | |||||
| } | |||||
| func IsResourceSpecInSpecialPool(resourceSpecs []*models.ResourceSpec, resourceSpecId int) bool { | |||||
| if resourceSpecs == nil || len(resourceSpecs) == 0 { | |||||
| return true | |||||
| } | |||||
| for _, v := range resourceSpecs { | |||||
| if v.Id == resourceSpecId { | |||||
| return true | |||||
| } | |||||
| } | |||||
| return false | |||||
| } | |||||
| func IsQueueInSpecialtPool(pool []*models.GpuInfo, queue string) bool { | |||||
| for _, v := range pool { | |||||
| if v.Queue == queue { | |||||
| return true | |||||
| } | |||||
| } | |||||
| return false | |||||
| } | |||||
// IsElementExist reports whether str occurs in s. The comparison is an exact
// string match; a nil or empty slice yields false.
func IsElementExist(s []string, str string) bool {
	for i := 0; i < len(s); i++ {
		if s[i] != str {
			continue
		}
		return true
	}
	return false
}
| @@ -460,12 +460,15 @@ var ( | |||||
| CBCodePathPrefix string | CBCodePathPrefix string | ||||
| JobType string | JobType string | ||||
| GpuTypes string | GpuTypes string | ||||
| SpecialPools string | |||||
| DebugServerHost string | DebugServerHost string | ||||
| ResourceSpecs string | ResourceSpecs string | ||||
| MaxDuration int64 | MaxDuration int64 | ||||
| TrainGpuTypes string | TrainGpuTypes string | ||||
| TrainResourceSpecs string | TrainResourceSpecs string | ||||
| MaxDatasetNum int | MaxDatasetNum int | ||||
| CullIdleTimeout string | |||||
| CullInterval string | |||||
| //benchmark config | //benchmark config | ||||
| IsBenchmarkEnabled bool | IsBenchmarkEnabled bool | ||||
| @@ -1311,7 +1314,11 @@ func NewContext() { | |||||
| MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | MaxDuration = sec.Key("MAX_DURATION").MustInt64(14400) | ||||
| TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | TrainGpuTypes = sec.Key("TRAIN_GPU_TYPES").MustString("") | ||||
| TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | TrainResourceSpecs = sec.Key("TRAIN_RESOURCE_SPECS").MustString("") | ||||
| SpecialPools = sec.Key("SPECIAL_POOL").MustString("") | |||||
| MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | MaxDatasetNum = sec.Key("MAX_DATASET_NUM").MustInt(5) | ||||
| CullIdleTimeout = sec.Key("CULL_IDLE_TIMEOUT").MustString("900") | |||||
| CullInterval = sec.Key("CULL_INTERVAL").MustString("60") | |||||
| sec = Cfg.Section("benchmark") | sec = Cfg.Section("benchmark") | ||||
| IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) | IsBenchmarkEnabled = sec.Key("ENABLED").MustBool(false) | ||||
| @@ -18,6 +18,7 @@ import ( | |||||
| "path/filepath" | "path/filepath" | ||||
| "regexp" | "regexp" | ||||
| "runtime" | "runtime" | ||||
| "strconv" | |||||
| "strings" | "strings" | ||||
| texttmpl "text/template" | texttmpl "text/template" | ||||
| "time" | "time" | ||||
| @@ -327,6 +328,7 @@ func NewFuncMap() []template.FuncMap { | |||||
| }, | }, | ||||
| "GetRefType": GetRefType, | "GetRefType": GetRefType, | ||||
| "GetRefName": GetRefName, | "GetRefName": GetRefName, | ||||
| "MB2GB": MB2GB, | |||||
| }} | }} | ||||
| } | } | ||||
| @@ -785,3 +787,14 @@ func GetRefName(ref string) string { | |||||
| reg := regexp.MustCompile(REF_TYPE_PATTERN) | reg := regexp.MustCompile(REF_TYPE_PATTERN) | ||||
| return reg.ReplaceAllString(ref, "") | return reg.ReplaceAllString(ref, "") | ||||
| } | } | ||||
// MB2GB converts size, given in MB, to GB (size / 1024) and formats it with at
// most two decimal places. Trailing zeros and a dangling decimal point are
// removed, so 1024 becomes "1" and 1536 becomes "1.5".
func MB2GB(size int64) string {
	s := strconv.FormatFloat(float64(size)/float64(1024), 'f', 2, 64)
	// The 'f' format with precision 2 always emits a decimal point, so
	// trimming zeros can never eat into the integer part.
	s = strings.TrimRight(s, "0")
	s = strings.TrimSuffix(s, ".")
	// A small negative input rounds to "-0.00"; report it as plain zero.
	if s == "-0" {
		return "0"
	}
	return s
}
| @@ -752,10 +752,26 @@ func GetCloudbrainsDetailData(ctx *context.Context) { | |||||
| taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias | taskDetail.RepoAlias = ciTasks[i].Repo.OwnerName + "/" + ciTasks[i].Repo.Alias | ||||
| } | } | ||||
| if ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) { | if ciTasks[i].Cloudbrain.Status == string(models.JobWaiting) { | ||||
| WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | |||||
| taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) | |||||
| if WaitTimeInt < 0 { | |||||
| taskDetail.WaitTime = "00:00:00" | |||||
| if ciTasks[i].Cloudbrain.DeletedAt != nilTime { | |||||
| WaitTimeInt := ciTasks[i].Cloudbrain.UpdatedUnix.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | |||||
| taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) | |||||
| if WaitTimeInt < 0 { | |||||
| taskDetail.WaitTime = "00:00:00" | |||||
| } | |||||
| } else { | |||||
| if ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { | |||||
| WaitTimeInt := time.Now().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | |||||
| taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) | |||||
| if WaitTimeInt < 0 { | |||||
| taskDetail.WaitTime = "00:00:00" | |||||
| } | |||||
| } else { | |||||
| WaitTimeInt := ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | |||||
| taskDetail.WaitTime = models.ConvertDurationToStr(WaitTimeInt) | |||||
| if WaitTimeInt < 0 { | |||||
| taskDetail.WaitTime = "00:00:00" | |||||
| } | |||||
| } | |||||
| } | } | ||||
| } else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { | } else if ciTasks[i].Cloudbrain.Status == string(models.JobStopped) && ciTasks[i].Cloudbrain.StartTime.AsTime().Unix() == 0 { | ||||
| WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | WaitTimeInt := ciTasks[i].Cloudbrain.EndTime.AsTime().Unix() - ciTasks[i].Cloudbrain.CreatedUnix.AsTime().Unix() | ||||
| @@ -7,8 +7,10 @@ package repo | |||||
| import ( | import ( | ||||
| "code.gitea.io/gitea/modules/grampus" | "code.gitea.io/gitea/modules/grampus" | ||||
| "code.gitea.io/gitea/modules/setting" | |||||
| "encoding/json" | "encoding/json" | ||||
| "net/http" | "net/http" | ||||
| "path" | |||||
| "strconv" | "strconv" | ||||
| "strings" | "strings" | ||||
| @@ -263,39 +265,49 @@ func TrainJobGetLog(ctx *context.APIContext) { | |||||
| return | return | ||||
| } | } | ||||
| resultLogFile, result, err := trainJobGetLogContent(jobID, versionName, baseLine, order, lines_int) | |||||
| task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||||
| return | |||||
| } | |||||
| resultLogFile, result, err := trainJobGetLogContent(jobID, task.VersionID, baseLine, order, lines_int) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error()) | log.Error("trainJobGetLog(%s) failed:%v", jobID, err.Error()) | ||||
| // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | // ctx.RenderWithErr(err.Error(), tplModelArtsTrainJobShow, nil) | ||||
| return | return | ||||
| } | } | ||||
| prefix := strings.TrimPrefix(path.Join(setting.TrainJobModelPath, task.JobName, modelarts.LogPath, versionName), "/") + "/job" | |||||
| _, err = storage.GetObsLogFileName(prefix) | |||||
| var canLogDownload bool | |||||
| if err != nil { | |||||
| canLogDownload = false | |||||
| } else { | |||||
| canLogDownload = true | |||||
| } | |||||
| ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] | ctx.Data["log_file_name"] = resultLogFile.LogFileList[0] | ||||
| ctx.JSON(http.StatusOK, map[string]interface{}{ | ctx.JSON(http.StatusOK, map[string]interface{}{ | ||||
| "JobID": jobID, | |||||
| "LogFileName": resultLogFile.LogFileList[0], | |||||
| "StartLine": result.StartLine, | |||||
| "EndLine": result.EndLine, | |||||
| "Content": result.Content, | |||||
| "Lines": result.Lines, | |||||
| "JobID": jobID, | |||||
| "LogFileName": resultLogFile.LogFileList[0], | |||||
| "StartLine": result.StartLine, | |||||
| "EndLine": result.EndLine, | |||||
| "Content": result.Content, | |||||
| "Lines": result.Lines, | |||||
| "CanLogDownload": canLogDownload, | |||||
| }) | }) | ||||
| } | } | ||||
| func trainJobGetLogContent(jobID string, versionName string, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { | |||||
| task, err := models.GetCloudbrainByJobIDAndVersionName(jobID, versionName) | |||||
| if err != nil { | |||||
| log.Error("GetCloudbrainByJobID(%s) failed:%v", jobID, err.Error()) | |||||
| return nil, nil, err | |||||
| } | |||||
| func trainJobGetLogContent(jobID string, versionID int64, baseLine string, order string, lines int) (*models.GetTrainJobLogFileNamesResult, *models.GetTrainJobLogResult, error) { | |||||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(task.VersionID, 10)) | |||||
| resultLogFile, err := modelarts.GetTrainJobLogFileNames(jobID, strconv.FormatInt(versionID, 10)) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | log.Error("GetTrainJobLogFileNames(%s) failed:%v", jobID, err.Error()) | ||||
| return nil, nil, err | return nil, nil, err | ||||
| } | } | ||||
| result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(task.VersionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines) | |||||
| result, err := modelarts.GetTrainJobLog(jobID, strconv.FormatInt(versionID, 10), baseLine, resultLogFile.LogFileList[0], order, lines) | |||||
| if err != nil { | if err != nil { | ||||
| log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) | log.Error("GetTrainJobLog(%s) failed:%v", jobID, err.Error()) | ||||
| return nil, nil, err | return nil, nil, err | ||||
| @@ -2,7 +2,6 @@ package repo | |||||
| import ( | import ( | ||||
| "bufio" | "bufio" | ||||
| "code.gitea.io/gitea/modules/grampus" | |||||
| "encoding/json" | "encoding/json" | ||||
| "errors" | "errors" | ||||
| "fmt" | "fmt" | ||||
| @@ -16,6 +15,8 @@ import ( | |||||
| "time" | "time" | ||||
| "unicode/utf8" | "unicode/utf8" | ||||
| "code.gitea.io/gitea/modules/grampus" | |||||
| "code.gitea.io/gitea/modules/timeutil" | "code.gitea.io/gitea/modules/timeutil" | ||||
| "github.com/unknwon/i18n" | "github.com/unknwon/i18n" | ||||
| @@ -135,7 +136,7 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
| } | } | ||||
| ctx.Data["attachments"] = attachs | ctx.Data["attachments"] = attachs | ||||
| ctx.Data["command"] = cloudbrain.Command | |||||
| ctx.Data["command"] = cloudbrain.GetCloudbrainDebugCommand() | |||||
| ctx.Data["code_path"] = cloudbrain.CodeMountPath | ctx.Data["code_path"] = cloudbrain.CodeMountPath | ||||
| ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath | ctx.Data["dataset_path"] = cloudbrain.DataSetMountPath | ||||
| ctx.Data["model_path"] = cloudbrain.ModelMountPath | ctx.Data["model_path"] = cloudbrain.ModelMountPath | ||||
| @@ -149,6 +150,8 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
| ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType | ctx.Data["benchmark_types"] = GetBenchmarkTypes(ctx).BenchmarkType | ||||
| cloudbrain.InitSpecialPool() | |||||
| if gpuInfos == nil { | if gpuInfos == nil { | ||||
| json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) | json.Unmarshal([]byte(setting.GpuTypes), &gpuInfos) | ||||
| } | } | ||||
| @@ -178,6 +181,45 @@ func cloudBrainNewDataPrepare(ctx *context.Context) error { | |||||
| json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) | json.Unmarshal([]byte(setting.TrainResourceSpecs), &cloudbrain.TrainResourceSpecs) | ||||
| } | } | ||||
| ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ctx.Data["train_resource_specs"] = cloudbrain.TrainResourceSpecs.ResourceSpec | ||||
| if cloudbrain.SpecialPools != nil { | |||||
| var debugGpuTypes []*models.GpuInfo | |||||
| var trainGpuTypes []*models.GpuInfo | |||||
| for _, pool := range cloudbrain.SpecialPools.Pools { | |||||
| org, _ := models.GetOrgByName(pool.Org) | |||||
| if org != nil { | |||||
| isOrgMember, _ := models.IsOrganizationMember(org.ID, ctx.User.ID) | |||||
| if isOrgMember { | |||||
| for _, jobType := range pool.JobType { | |||||
| if jobType == string(models.JobTypeDebug) { | |||||
| debugGpuTypes = append(debugGpuTypes, pool.Pool...) | |||||
| if pool.ResourceSpec != nil { | |||||
| ctx.Data["resource_specs"] = pool.ResourceSpec | |||||
| } | |||||
| } else if jobType == string(models.JobTypeTrain) { | |||||
| trainGpuTypes = append(trainGpuTypes, pool.Pool...) | |||||
| if pool.ResourceSpec != nil { | |||||
| ctx.Data["train_resource_specs"] = pool.ResourceSpec | |||||
| } | |||||
| } | |||||
| } | |||||
| break | |||||
| } | |||||
| } | |||||
| } | |||||
| if len(debugGpuTypes) > 0 { | |||||
| ctx.Data["gpu_types"] = debugGpuTypes | |||||
| } | |||||
| if len(trainGpuTypes) > 0 { | |||||
| ctx.Data["train_gpu_types"] = trainGpuTypes | |||||
| } | |||||
| } | |||||
| ctx.Data["params"] = "" | ctx.Data["params"] = "" | ||||
| ctx.Data["branchName"] = ctx.Repo.BranchName | ctx.Data["branchName"] = ctx.Repo.BranchName | ||||
| @@ -217,6 +259,10 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| tpl := tplCloudBrainNew | tpl := tplCloudBrainNew | ||||
| if jobType == string(models.JobTypeTrain) { | |||||
| tpl = tplCloudBrainTrainJobNew | |||||
| } | |||||
| tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) | tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) | ||||
| if err == nil { | if err == nil { | ||||
| if len(tasks) != 0 { | if len(tasks) != 0 { | ||||
| @@ -269,7 +315,7 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||||
| return | return | ||||
| } | } | ||||
| command := cloudbrain.Command | |||||
| command := cloudbrain.GetCloudbrainDebugCommand() | |||||
| if jobType == string(models.JobTypeTrain) { | if jobType == string(models.JobTypeTrain) { | ||||
| tpl = tplCloudBrainTrainJobNew | tpl = tplCloudBrainTrainJobNew | ||||
| commandTrain, err := getTrainJobCommand(form) | commandTrain, err := getTrainJobCommand(form) | ||||
| @@ -282,6 +328,14 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||||
| command = commandTrain | command = commandTrain | ||||
| } | } | ||||
| errStr := checkCloudBrainSpecialPool(ctx, jobType, gpuQueue, resourceSpecId) | |||||
| if errStr != "" { | |||||
| cloudBrainNewDataPrepare(ctx) | |||||
| ctx.RenderWithErr(errStr, tpl, &form) | |||||
| return | |||||
| } | |||||
| if branchName == "" { | if branchName == "" { | ||||
| branchName = cloudbrain.DefaultBranchName | branchName = cloudbrain.DefaultBranchName | ||||
| } | } | ||||
| @@ -334,6 +388,42 @@ func CloudBrainCreate(ctx *context.Context, form auth.CreateCloudBrainForm) { | |||||
| } | } | ||||
| } | } | ||||
| /** | |||||
| 检查用户传输的参数是否符合专属资源池 | |||||
| */ | |||||
| func checkCloudBrainSpecialPool(ctx *context.Context, jobType string, queue string, resourceSpecId int) string { | |||||
| if cloudbrain.SpecialPools != nil { | |||||
| var isInPoolOrg = false | |||||
| var matchSpecialPool = false | |||||
| for _, specialPool := range cloudbrain.SpecialPools.Pools { | |||||
| if cloudbrain.IsElementExist(specialPool.JobType, jobType) && cloudbrain.IsQueueInSpecialtPool(specialPool.Pool, queue) { | |||||
| if cloudbrain.IsResourceSpecInSpecialPool(specialPool.ResourceSpec, resourceSpecId) { | |||||
| matchSpecialPool = true | |||||
| org, _ := models.GetOrgByName(specialPool.Org) | |||||
| if org != nil { | |||||
| isInPoolOrg, _ = models.IsOrganizationMember(org.ID, ctx.User.ID) | |||||
| if isInPoolOrg { | |||||
| break //传入参数,和专属资源池匹配上了,检查通过 | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| //资源池有匹配上,但是用户不在相应的组织中,返回错误信息。界面已经过滤了选择,界面操作不会到这个逻辑 | |||||
| if matchSpecialPool && !isInPoolOrg { | |||||
| return ctx.Tr("repo.grampus.no_operate_right") | |||||
| } | |||||
| } | |||||
| //没有匹配到资源池或者没有设置专属资源池,检查通过; 获取和资源池完全匹配检查通过 | |||||
| return "" | |||||
| } | |||||
| func CloudBrainRestart(ctx *context.Context) { | func CloudBrainRestart(ctx *context.Context) { | ||||
| var ID = ctx.Params(":id") | var ID = ctx.Params(":id") | ||||
| var resultCode = "0" | var resultCode = "0" | ||||
| @@ -573,7 +663,9 @@ func cloudBrainShow(ctx *context.Context, tpName base.TplName, jobType models.Jo | |||||
| if task.TrainJobDuration == "" { | if task.TrainJobDuration == "" { | ||||
| if task.Duration == 0 { | if task.Duration == 0 { | ||||
| var duration int64 | var duration int64 | ||||
| if task.Status == string(models.JobRunning) { | |||||
| if task.Status == string(models.JobWaiting) { | |||||
| duration = 0 | |||||
| } else if task.Status == string(models.JobRunning) { | |||||
| duration = time.Now().Unix() - int64(task.CreatedUnix) | duration = time.Now().Unix() - int64(task.CreatedUnix) | ||||
| } else { | } else { | ||||
| duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) | duration = int64(task.UpdatedUnix) - int64(task.CreatedUnix) | ||||
| @@ -2094,7 +2186,7 @@ func ModelBenchmarkCreate(ctx *context.Context, form auth.CreateCloudBrainForm) | |||||
| repo := ctx.Repo.Repository | repo := ctx.Repo.Repository | ||||
| tpl := tplCloudBrainBenchmarkNew | tpl := tplCloudBrainBenchmarkNew | ||||
| command := cloudbrain.Command | |||||
| command := cloudbrain.GetCloudbrainDebugCommand() | |||||
| tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) | tasks, err := models.GetCloudbrainsByDisplayJobName(repo.ID, jobType, displayJobName) | ||||
| if err == nil { | if err == nil { | ||||
| @@ -480,8 +480,13 @@ | |||||
| </div> | </div> | ||||
| <div id="dir_list{{.VersionName}}"> | <div id="dir_list{{.VersionName}}"> | ||||
| </div> | </div> | ||||
| {{if eq .ComputeResource "CPU/GPU"}} | |||||
| <div style="display:flex;align-items: center;justify-content: end;color: #f2711c;"> | |||||
| <i class="ri-error-warning-line" style="margin-right:0.5rem;"></i> | |||||
| <span>{{$.i18n.Tr "repo.file_limit_100"}}</span> | |||||
| </div> | |||||
| {{end}} | |||||
| </div> | </div> | ||||
| </div> | </div> | ||||
| @@ -488,7 +488,7 @@ | |||||
| <div class="ui tab" data-tab="second{{$k}}"> | <div class="ui tab" data-tab="second{{$k}}"> | ||||
| <div> | <div> | ||||
| <a id="{{.VersionName}}-log-down" | <a id="{{.VersionName}}-log-down" | ||||
| class='{{if and (.CanModify) (eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED") }}ti-download-file{{else}}disabled{{end}}' | |||||
| class='{{if and ($.CanLogDownload) (eq .Status "KILLED" "FAILED" "START_FAILED" "STOPPED" "COMPLETED") }}ti-download-file{{else}}disabled{{end}}' | |||||
| href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/download_log_file?version_name={{.VersionName}}"> | href="{{$.RepoLink}}/modelarts/train-job/{{.JobID}}/download_log_file?version_name={{.VersionName}}"> | ||||
| <i class="ri-download-cloud-2-line"></i> | <i class="ri-download-cloud-2-line"></i> | ||||
| <span style="margin-left: 0.3rem;">{{$.i18n.Tr "repo.modelarts.download_log"}}</span> | <span style="margin-left: 0.3rem;">{{$.i18n.Tr "repo.modelarts.download_log"}}</span> | ||||
| @@ -446,24 +446,6 @@ | |||||
| ] | ] | ||||
| }, | }, | ||||
| work_server_number: { | |||||
| identifier : 'work_server_number', | |||||
| rules: [ | |||||
| { | |||||
| type : 'integer[1..25]', | |||||
| prompt : '计算节点需要在1-25之间,请您键入正确的值' | |||||
| } | |||||
| ] | |||||
| }, | |||||
| run_para_list:{ | |||||
| identifier : 'run_para_list', | |||||
| rules: [ | |||||
| { | |||||
| type: 'maxLength[255]', | |||||
| prompt : '所有字符最长不超过255个字符。' | |||||
| } | |||||
| ] | |||||
| }, | |||||
| }, | }, | ||||
| }) | }) | ||||
| @@ -512,24 +494,6 @@ | |||||
| ] | ] | ||||
| }, | }, | ||||
| work_server_number: { | |||||
| identifier : 'work_server_number', | |||||
| rules: [ | |||||
| { | |||||
| type : 'integer[1..25]', | |||||
| prompt : '计算节点需要在1-25之间,请您键入正确的值' | |||||
| } | |||||
| ] | |||||
| }, | |||||
| run_para_list:{ | |||||
| identifier : 'run_para_list', | |||||
| rules: [ | |||||
| { | |||||
| type: 'maxLength[255]', | |||||
| prompt : '所有字符最长不超过255个字符。' | |||||
| } | |||||
| ] | |||||
| }, | |||||
| }, | }, | ||||
| onSuccess: function(){ | onSuccess: function(){ | ||||
| // $('.ui.page.dimmer').dimmer('show') | // $('.ui.page.dimmer').dimmer('show') | ||||