From af0bada5d33d62100f549d94f14f0b5ebf24eec7 Mon Sep 17 00:00:00 2001 From: ychao_1983 Date: Thu, 1 Sep 2022 17:18:42 +0800 Subject: [PATCH] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- options/locale/locale_en-US.ini | 1 + options/locale/locale_zh-CN.ini | 1 + routers/repo/modelarts.go | 69 +++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 3 deletions(-) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index 5eac4cf2e..3453344f7 100755 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -1213,6 +1213,7 @@ modelarts.infer_job.select_model = Select Model modelarts.infer_job.boot_file_helper=The startup file is the entry file for your program execution and must end in.py.Such as inference.py, main.py, example/inference.py, case/main.py. modelarts.infer_job.tooltip = The model has been deleted and cannot be viewed. modelarts.download_log=Download log file +modelarts.no_node_right = The value of 'Amount of Compute Node' is wrong, you have no right to use the current value of 'Amount of Compute Node'. debug_task_not_created = Debug task has not been created diff --git a/options/locale/locale_zh-CN.ini b/options/locale/locale_zh-CN.ini index 2fbd3ab52..d527218d3 100755 --- a/options/locale/locale_zh-CN.ini +++ b/options/locale/locale_zh-CN.ini @@ -1226,6 +1226,7 @@ modelarts.infer_job.select_model = 选择模型 modelarts.infer_job.boot_file_helper=启动文件是您程序执行的入口文件,必须是以.py结尾的文件。比如inference.py、main.py、example/inference.py、case/main.py。 modelarts.infer_job.tooltip = 该模型已删除,无法查看。 modelarts.download_log=下载日志文件 +modelarts.no_node_right = 计算节点数的值配置错误,您没有权限使用当前配置的计算节点数。 debug_task_not_created = 未创建过调试任务 diff --git a/routers/repo/modelarts.go b/routers/repo/modelarts.go index 10843e683..424a7fe23 100755 --- a/routers/repo/modelarts.go +++ b/routers/repo/modelarts.go @@ -1130,6 +1130,13 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) VersionCount := modelarts.VersionCountOne EngineName := form.EngineName + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + trainJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1160,7 +1167,7 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { trainJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobNew, &form) @@ -1364,6 +1371,48 @@ func TrainJobCreate(ctx *context.Context, form auth.CreateModelArtsTrainJobForm) ctx.Redirect(setting.AppSubURL + ctx.Repo.RepoLink + "/modelarts/train-job") } +func checkMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + modelarts.InitMultiNode() + var isServerNumValid=false + if modelarts.MultiNodeConfig != nil { + for _, info := range modelarts.MultiNodeConfig.Info { + if isInOrg, _ := models.IsOrganizationMemberByOrgName(info.Org, userId); isInOrg { + if isInNodes(info.Node,serverNum){ + isServerNumValid=true + break + } + + } + } + } + if isServerNumValid{ + return "" + }else{ + return "repo.modelarts.no_node_right" + } +} +func checkInferenceJobMultiNode(userId int64, serverNum int) string{ + if serverNum==1{ + return "" + } + + return "repo.modelarts.no_node_right" + +} + +func isInNodes(nodes []int, num int) bool { + for _, node:=range nodes{ + if node==num{ + return true + } + } + return false + +} + func getUserCommand(engineId int, req *modelarts.GenerateTrainJobReq) (string, string) { userImageUrl := "" userCommand := "" @@ -1398,6 +1447,13 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ ctx.Data["PageIsTrainJob"] = true var jobID = ctx.Params(":jobid") + errStr:=checkMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + versionErrorDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) + return + } + count, err := models.GetCloudbrainTrainJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainTrainJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -1465,7 +1521,7 @@ func TrainJobCreateVersion(ctx *context.Context, form auth.CreateModelArtsTrainJ return } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeTrain)) if errStr != "" { versionErrorDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsTrainJobVersionNew, &form) @@ -2036,6 +2092,13 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference ckptUrl := "/" + form.TrainUrl + form.CkptName log.Info("ckpt url:" + ckptUrl) + errStr:=checkInferenceJobMultiNode(ctx.User.ID,form.WorkServerNumber) + if errStr!=""{ + inferenceJobErrorNewDataPrepare(ctx, form) + ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form) + return + } + count, err := models.GetCloudbrainInferenceJobCountByUserID(ctx.User.ID) if err != nil { log.Error("GetCloudbrainInferenceJobCountByUserID failed:%v", err, ctx.Data["MsgID"]) @@ -2084,7 +2147,7 @@ func InferenceJobCreate(ctx *context.Context, form auth.CreateModelArtsInference } } - errStr := checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) + errStr = checkModelArtsSpecialPool(ctx, flavorCode, string(models.JobTypeInference)) if errStr != "" { inferenceJobErrorNewDataPrepare(ctx, form) ctx.RenderWithErr(ctx.Tr(errStr), tplModelArtsInferenceJobNew, &form)