From 482645eae3c8865336b1f619b7076f593f059a7f Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Wed, 12 Mar 2025 19:26:28 +0800 Subject: [PATCH 1/6] fix:update createTraining in modelarts --- go.mod | 2 +- go.sum | 6 +++++ internal/storeLink/modelarts.go | 39 +++++++++++++++++---------------- 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/go.mod b/go.mod index 38297330..a1df5104 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/zeromicro/go-zero v1.7.4 gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20250107025835-8fc888b1d170 gitlink.org.cn/JointCloud/pcm-hpc v0.0.0-20241125115811-72f3568255a4 - gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250306073530-56ecf1273207 + gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 gitlink.org.cn/JointCloud/pcm-openi v0.0.0-20250102093846-164b4884c9ec gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 diff --git a/go.sum b/go.sum index 2e827dd1..d2836722 100644 --- a/go.sum +++ b/go.sum @@ -298,6 +298,8 @@ github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/memberlist v0.5.0 h1:EtYPN8DpAURiapus508I4n9CzHs2W+8NZGbmmR/prTM= github.com/hashicorp/memberlist v0.5.0/go.mod h1:yvyXLpo0QaGE59Y7hDTsTzDD25JYBZ4mHgHUZ8lrOI0= +github.com/huaweicloud/huaweicloud-sdk-go-v3 v0.1.61 h1:b203Ob+V22EyNiJlrhYQGJ0aAJ9ddFMa3neYrOZ8/tQ= +github.com/huaweicloud/huaweicloud-sdk-go-v3 v0.1.61/go.mod h1:AZT3IyeViMA1qIoo6lM2eDobcTXORpqIQzSqdodah7E= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= @@ -530,6 +532,10 @@ gitlink.org.cn/JointCloud/pcm-hpc v0.0.0-20241125115811-72f3568255a4 h1:WIs/189l gitlink.org.cn/JointCloud/pcm-hpc v0.0.0-20241125115811-72f3568255a4/go.mod h1:YbuoRgF9sEVvNJPQtGRjdocX7Du6NBOTLn+GVwqRVjo= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250306073530-56ecf1273207 h1:korhOkFl0x1wuQBKoKTsQHeFboDwLFRWwR2G9IPPfNg= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250306073530-56ecf1273207/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250311041651-d676f57aac45 h1:SoR/DoLffkzoXrcfSaOY4EitPayA3kfjTp/yTOdRlps= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250311041651-d676f57aac45/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd h1:MMF06GNHfYDKhtOneImh1mL5fcgDqOpesZF2fW9oU6A= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 h1:GaXwr5sgDh0raHjUf9IewTvnRvajYea7zbLsaerYyXo= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openi v0.0.0-20250102093846-164b4884c9ec h1:Yul2JOAIS94B+eIg0UvmBSe8JrtSrZ2OA47gAYLiBYI= diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index 5640d403..0da35ee5 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -157,8 +157,8 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri // modelArts提交任务 environments := make(map[string]string) parameters := make([]*modelarts.ParametersTrainJob, 0) - /* inputs := make([]*modelarts.InputTraining, 0) - outputs := make([]*modelarts.OutputTraining, 0)*/ + inputs := make([]*modelarts.InputTraining, 0) + outputs := make([]*modelarts.OutputTraining, 0) for _, env := range envs { s := strings.Split(env, COMMA) environments[s[0]] = s[1] @@ -170,22 +170,23 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri Value: s[1], }) } - /* inputs = append(inputs, &modelarts.InputTraining{ - Name: "data_url", - Remote: &modelarts.RemoteTra{ - Obs: &modelarts.Obs1{ - ObsUrl: "/test-wq/data/mnist.npz", - }, - }}) - - outputs = append(outputs, &modelarts.OutputTraining{ - Name: "train_url", - Remote: &modelarts.RemoteOut{ - Obs: &modelarts.ObsTra{ - ObsUrl: "/test-wq/model/", - }, + + inputs = append(inputs, &modelarts.InputTraining{ + Name: "input", + AccessMethod: "parameter", + Remote: &modelarts.RemoteTra{ + Obs: &modelarts.ObsTra{ + ObsUrl: datasetsId + "/", + }, + }}) + + outputs = append(outputs, &modelarts.OutputTraining{ + Name: "output", + Remote: &modelarts.RemoteOut{ + Obs: &modelarts.ObsTra{ + ObsUrl: "obs://test-modelarts-train/output/10v/", }, - })*/ + }}) req := &modelarts.CreateTrainingJobReq{ Kind: "job", Metadata: &modelarts.MetadataS{ @@ -200,8 +201,8 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri Command: cmd, Environments: environments, Parameters: parameters, - //Inputs: inputs, - //Outputs: outputs, + Inputs: inputs, + Outputs: outputs, }, Spec: &modelarts.SpecsC{ Resource: &modelarts.ResourceCreateTraining{ From bb7b0dcd16e8f6543e697b19760653bb63bc7091 Mon Sep 17 00:00:00 2001 From: devad Date: Thu, 13 Mar 2025 09:46:44 +0800 Subject: [PATCH 2/6] =?UTF-8?q?refactor:=20delete=20.devops/=E9=98=BF?= =?UTF-8?q?=E9=87=8C=E4=BA=91.yml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .devops/阿里云.yml | 65 ------------------------------------------- 1 file changed, 65 deletions(-) delete mode 100644 .devops/阿里云.yml diff --git a/.devops/阿里云.yml b/.devops/阿里云.yml deleted file mode 100644 index a1094917..00000000 --- a/.devops/阿里云.yml +++ /dev/null @@ -1,65 +0,0 @@ -version: 2 -name: 阿里云 -description: "" -global: - concurrent: 1 - param: - - ref: ssh_host - name: "" - value: '"47.92.39.128"' - required: false - type: STRING - hidden: true - - ref: ssh_user - name: "" - value: '"root"' - required: false - type: STRING - hidden: true -workflow: - - ref: start - name: 开始 - task: start - - ref: end - name: 结束 - task: end - needs: - - ssh_cmd_0 - - ref: git_clone_0 - name: git clone - task: git_clone@1.2.9 - input: - remote_url: '"https://gitlink.org.cn/JointCloud/pcm-coordinator.git"' - ref: '"refs/heads/master"' - commit_id: '""' - depth: 1 - needs: - - start - - ref: docker_image_build_0 - name: docker镜像构建 - task: docker_image_build@1.6.0 - input: - docker_username: ((aly.docker_user)) - docker_password: ((aly.docker_password)) - image_name: '"registry.cn-hangzhou.aliyuncs.com/jcce/pcm-core-api"' - image_tag: '"latest"' - registry_address: '"registry.cn-hangzhou.aliyuncs.com"' - docker_file: '"Dockerfile"' - docker_build_path: git_clone_0.git_path - workspace: git_clone_0.git_path - image_push: true - build_args: '""' - needs: - - git_clone_0 - - ref: ssh_cmd_0 - name: ssh执行命令 - task: ssh_cmd@1.1.1 - input: - ssh_private_key: ((aly.ssh_private_key)) - ssh_ip: global.ssh_host - ssh_port: '"22"' - ssh_user: global.ssh_user - ssh_cmd: '"kubectl rollout restart deployment pcm-core-api -n ns-admin"' - needs: - - docker_image_build_0 - From 3a6c4ae17cfc02d2b7d8d9310a469ea7b8e7edf5 Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Thu, 13 Mar 2025 17:43:19 +0800 Subject: [PATCH 3/6] fix:update createTraining in modelarts --- internal/storeLink/modelarts.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/internal/storeLink/modelarts.go b/internal/storeLink/modelarts.go index 0da35ee5..1ed8ead7 100644 --- a/internal/storeLink/modelarts.go +++ b/internal/storeLink/modelarts.go @@ -158,7 +158,7 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri environments := make(map[string]string) parameters := make([]*modelarts.ParametersTrainJob, 0) inputs := make([]*modelarts.InputTraining, 0) - outputs := make([]*modelarts.OutputTraining, 0) + //outputs := make([]*modelarts.OutputTraining, 0) for _, env := range envs { s := strings.Split(env, COMMA) environments[s[0]] = s[1] @@ -180,13 +180,13 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri }, }}) - outputs = append(outputs, &modelarts.OutputTraining{ - Name: "output", - Remote: &modelarts.RemoteOut{ - Obs: &modelarts.ObsTra{ - ObsUrl: "obs://test-modelarts-train/output/10v/", - }, - }}) + /*outputs = append(outputs, &modelarts.OutputTraining{ + Name: "output", + Remote: &modelarts.RemoteOut{ + Obs: &modelarts.ObsTra{ + ObsUrl: "obs://test-modelarts-train/output/10v/", + }, + }})*/ req := &modelarts.CreateTrainingJobReq{ Kind: "job", Metadata: &modelarts.MetadataS{ @@ -202,7 +202,7 @@ func (m *ModelArtsLink) SubmitTask(ctx context.Context, imageId string, cmd stri Environments: environments, Parameters: parameters, Inputs: inputs, - Outputs: outputs, + //Outputs: outputs, }, Spec: &modelarts.SpecsC{ Resource: &modelarts.ResourceCreateTraining{ From cc273cb5ec979c4dad0abf07a6b635f38e034d26 Mon Sep 17 00:00:00 2001 From: Jake <450705171@qq.com> Date: Fri, 14 Mar 2025 10:13:15 +0800 Subject: [PATCH 4/6] =?UTF-8?q?=E6=8F=90=E4=BA=A4=E8=B6=85=E7=AE=97?= =?UTF-8?q?=E4=BB=BB=E5=8A=A1=E6=9B=B4=E6=96=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- desc/hpc/pcm-hpc.api | 43 +++--- internal/handler/hpc/commithpctaskhandler.go | 7 +- internal/logic/hpc/commithpctasklogic.go | 132 ++++++++++--------- internal/types/types.go | 42 +++--- pkg/models/taskhpcmodel_gen.go | 3 + 5 files changed, 112 insertions(+), 115 deletions(-) diff --git a/desc/hpc/pcm-hpc.api b/desc/hpc/pcm-hpc.api index 13adfa20..37dfa23c 100644 --- a/desc/hpc/pcm-hpc.api +++ b/desc/hpc/pcm-hpc.api @@ -10,37 +10,33 @@ info( type ( commitHpcTaskReq { - ClusterId string `json:"clusterId,optional"` Name string `json:"name"` - Account string `json:"account,optional"` + Backend string `json:"backend"` + ClusterId string `json:"clusterId"` + App string `json:"app"` Description string `json:"description,optional"` - TenantId int64 `json:"tenantId,optional"` - TaskId int64 `json:"taskId,optional"` - AdapterIds []string `json:"adapterIds,optional"` - MatchLabels map[string]string `json:"matchLabels,optional"` - CardCount int64 `json:"cardCount,optional"` - WorkDir string `json:"workDir,optional"` //paratera:workingDir - WallTime string `json:"wallTime,optional"` - CmdScript string `json:"cmdScript,optional"` // paratera:bootScript - AppType string `json:"appType,optional"` - AppName string `json:"appName,optional"` // paratera:jobGroupName ac:appname - Queue string `json:"queue,optional"` - NNode string `json:"nNode,optional"` - SubmitType string `json:"submitType,optional"` - StdInput string `json:"stdInput,optional"` - ClusterType string `json:"clusterType,optional"` - Partition string `json:"partition"` - UserId int64 `json:"userId,optional"` - Token string `json:"token,optional"` - UserIp string `json:"userIp,optional"` + OperateType string `json:"operateType,optional"` + Parameters map[string]string `json:"parameters"` + CustomParams map[string]string `json:"customParams"` } +) + +type ( commitHpcTaskResp { - ClusterId int64 `json:"clusterId"` - JobId string `json:"jobId"` + Code int `json:"code"` + Data Data `json:"data"` + Msg string `json:"msg"` + TraceId string `json:"trace_id"` } + Data { + Backend string `json:"backend"` + JobInfo map[string]string `json:"jobInfo"` + } + ) + type ( hpcOverViewReq { } @@ -160,7 +156,6 @@ type ( InstanceType int32 `form:"instanceType,optional"` InstanceClass string `form:"instanceClass,optional"` InstanceName string `form:"instanceName,optional"` - PageInfo } HpcInstanceCenterResp { InstanceCenterList []HpcInstanceCenterList `json:"instanceCenterList" copier:"InstanceCenterList"` diff --git a/internal/handler/hpc/commithpctaskhandler.go b/internal/handler/hpc/commithpctaskhandler.go index e116dd10..6cfc623c 100644 --- a/internal/handler/hpc/commithpctaskhandler.go +++ b/internal/handler/hpc/commithpctaskhandler.go @@ -10,6 +10,7 @@ import ( "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils" "k8s.io/apimachinery/pkg/util/json" "net/http" + "strconv" ) func CommitHpcTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { @@ -21,15 +22,15 @@ func CommitHpcTaskHandler(svcCtx *svc.ServiceContext) http.HandlerFunc { } // 获取ip信息 ip := utils.GetClientIP(r) - req.UserIp = ip + req.Parameters["UserIp"] = ip // 获取token信息 token := r.Header.Get("Authorization") - req.Token = token + req.Parameters["Token"] = token // 获取用户信息 userStr := r.Header.Get("User") user := &models.JccUserInfo{} json.Unmarshal([]byte(userStr), user) - req.UserId = user.Id + req.Parameters["UserId"] = strconv.FormatInt(user.Id, 10) l := hpc.NewCommitHpcTaskLogic(r.Context(), svcCtx) resp, err := l.CommitHpcTask(&req) diff --git a/internal/logic/hpc/commithpctasklogic.go b/internal/logic/hpc/commithpctasklogic.go index e8364c53..64300de8 100644 --- a/internal/logic/hpc/commithpctasklogic.go +++ b/internal/logic/hpc/commithpctasklogic.go @@ -6,9 +6,6 @@ import ( "github.com/go-resty/resty/v2" clientCore "gitlink.org.cn/JointCloud/pcm-coordinator/client" "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/models" - "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/remoteUtil" - v1 "gitlink.org.cn/JointCloud/pcm-hpc/routers/v1" - "k8s.io/apimachinery/pkg/util/json" "strconv" "time" @@ -32,6 +29,27 @@ func NewCommitHpcTaskLogic(ctx context.Context, svcCtx *svc.ServiceContext) *Com } } +type JobSpec struct { + Name string // 应用名称: BWA/lammps + Backend string // 后端类型:slurm/sugonac + App string + OperateType string // 应用内操作类型: bwa:构建索引/对比序列 + Parameters map[string]string // 通用参数 + CustomParams map[string]string // 各平台自定义参数 +} +type ResultParticipant struct { + Code int `json:"code"` + Data struct { + Backend string `json:"backend"` + JobInfo struct { + JobDir string `json:"jobDir"` + JobId string `json:"jobId"` + } `json:"jobInfo"` + } `json:"data"` + Msg string `json:"msg"` + TraceId string `json:"trace_id"` +} + func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *types.CommitHpcTaskResp, err error) { var clusterInfo types.ClusterInfo @@ -42,13 +60,14 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t } // 构建主任务结构体 + userId, _ := strconv.ParseInt(req.Parameters["UserId"], 10, 64) taskModel := models.Task{ Name: req.Name, Description: req.Description, CommitTime: time.Now(), Status: "Running", AdapterTypeDict: "2", - UserId: req.UserId, + UserId: userId, } // 保存任务数据到数据库 @@ -65,6 +84,8 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t return nil, errors.New("no corresponding adapter found") } clusterId, err := strconv.ParseInt(req.ClusterId, 10, 64) + cardCount, _ := strconv.ParseInt(req.Parameters["cardCount"], 10, 64) + timelimit, _ := strconv.ParseInt(req.Parameters["timeLimit"], 10, 64) hpcInfo := models.TaskHpc{ TaskId: taskModel.Id, AdapterId: clusterInfo.AdapterId, @@ -72,24 +93,27 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t ClusterId: clusterId, ClusterName: clusterInfo.Name, Name: taskModel.Name, - CmdScript: req.CmdScript, + Backend: req.Backend, + OperateType: req.OperateType, + CmdScript: req.Parameters["cmdScript"], StartTime: time.Now().String(), - CardCount: req.CardCount, - WorkDir: req.WorkDir, - WallTime: req.WallTime, - AppType: req.AppType, - AppName: req.AppName, - Queue: req.Queue, - SubmitType: req.SubmitType, - NNode: req.NNode, + CardCount: cardCount, + WorkDir: req.Parameters["workDir"], + WallTime: req.Parameters["wallTime"], + AppType: req.Parameters["appType"], + AppName: req.Parameters["appName"], + Queue: req.Parameters["queue"], + SubmitType: req.Parameters["submitType"], + NNode: req.Parameters["nNode"], Account: clusterInfo.Username, - StdInput: req.StdInput, - Partition: req.Partition, + StdInput: req.Parameters["stdInput"], + Partition: req.Parameters["partition"], CreatedTime: time.Now(), UpdatedTime: time.Now(), Status: "Running", + TimeLimit: timelimit, } - hpcInfo.WorkDir = clusterInfo.WorkDir + req.WorkDir + hpcInfo.WorkDir = clusterInfo.WorkDir + req.Parameters["WorkDir"] tx = l.svcCtx.DbEngin.Create(&hpcInfo) if tx.Error != nil { return nil, tx.Error @@ -109,64 +133,46 @@ func (l *CommitHpcTaskLogic) CommitHpcTask(req *types.CommitHpcTaskReq) (resp *t if result.Error != nil { logx.Errorf("Task creation failure, err: %v", result.Error) } - resp = &types.CommitHpcTaskResp{ - JobId: string(""), - } // 数据上链 // 查询资源价格 var price int64 - l.svcCtx.DbEngin.Raw("select price from resource_cost where resource_id = ?", clusterId).Scan(&price) - - bytes, _ := json.Marshal(taskModel) - remoteUtil.Evidence(remoteUtil.EvidenceParam{ - UserIp: req.UserIp, - Url: l.svcCtx.Config.BlockChain.Url, - ContractAddress: l.svcCtx.Config.BlockChain.ContractAddress, - FunctionName: l.svcCtx.Config.BlockChain.FunctionName, - Type: l.svcCtx.Config.BlockChain.Type, - Token: req.Token, - Amount: price, - Args: []string{strconv.FormatInt(taskModel.Id, 10), string(bytes)}, - }) + l.svcCtx.DbEngin.Raw("select price from `resource_cost` where resource_id = ?", clusterId).Scan(&price) + + //bytes, _ := json.Marshal(taskModel) + //remoteUtil.Evidence(remoteUtil.EvidenceParam{ + // UserIp: req.Parameters["UserIp"], + // Url: l.svcCtx.Config.BlockChain.Url, + // ContractAddress: l.svcCtx.Config.BlockChain.ContractAddress, + // FunctionName: l.svcCtx.Config.BlockChain.FunctionName, + // Type: l.svcCtx.Config.BlockChain.Type, + // Token: req.Parameters["Token"], + // Amount: price, + // Args: []string{strconv.FormatInt(taskModel.Id, 10), string(bytes)}, + //}) // 提交job到指定集群 logx.Info("提交job到指定集群") - go func() { - submitJob(&hpcInfo, &clusterInfo, server) - }() + resp, _ = submitJob(req, server) + return resp, nil } -func submitJob(hpcInfo *models.TaskHpc, clusterInfo *types.ClusterInfo, adapterAddress string) (int, error) { - SubmitJobReq := v1.SubmitJobReq{ - Server: clusterInfo.Server, - Version: clusterInfo.Version, - Username: clusterInfo.Username, - Token: clusterInfo.Token, - JobOptions: v1.JobOptions{ - Script: hpcInfo.CmdScript, - Job: &v1.JobProperties{ - Account: hpcInfo.Account, - Name: hpcInfo.Name, - NTasks: 1, - CurrentWorkingDirectory: hpcInfo.WorkDir, - Partition: hpcInfo.Partition, - Environment: map[string]string{"PATH": clusterInfo.EnvPath, - "LD_LIBRARY_PATH": clusterInfo.EnvLdPath}, - StandardOutput: hpcInfo.WorkDir + "/job.out", - StandardError: hpcInfo.WorkDir + "/job.err", - }, - }, +func submitJob(req *types.CommitHpcTaskReq, adapterAddress string) (resp *types.CommitHpcTaskResp, err error) { + req.Parameters["jobName"] = req.Name + reqParticipant := JobSpec{ + Name: req.Name, + Backend: req.Backend, + App: req.App, + OperateType: req.OperateType, + Parameters: req.Parameters, + CustomParams: req.CustomParams, } - var resp v1.SubmitJobResp httpClient := resty.New().R() logx.Info("远程调用p端接口开始") - _, err := httpClient.SetHeader("Content-Type", "application/json"). - SetBody(SubmitJobReq). + httpClient.SetHeader("Content-Type", "application/json"). + SetBody(reqParticipant). SetResult(&resp). - Post(adapterAddress + "/api/v1/job/submit") + Post(adapterAddress + "/api/v1/jobs") logx.Info("远程调用p端接口完成") - if err != nil { - return 0, err - } - return resp.JobId, nil + + return resp, nil } diff --git a/internal/types/types.go b/internal/types/types.go index e633fe24..636e3731 100644 --- a/internal/types/types.go +++ b/internal/types/types.go @@ -1307,34 +1307,26 @@ type ResourceCostRecord struct { } type CommitHpcTaskReq struct { - ClusterId string `json:"clusterId,optional"` - Name string `json:"name"` - Account string `json:"account,optional"` - Description string `json:"description,optional"` - TenantId int64 `json:"tenantId,optional"` - TaskId int64 `json:"taskId,optional"` - AdapterIds []string `json:"adapterIds,optional"` - MatchLabels map[string]string `json:"matchLabels,optional"` - CardCount int64 `json:"cardCount,optional"` - WorkDir string `json:"workDir,optional"` //paratera:workingDir - WallTime string `json:"wallTime,optional"` - CmdScript string `json:"cmdScript,optional"` // paratera:bootScript - AppType string `json:"appType,optional"` - AppName string `json:"appName,optional"` // paratera:jobGroupName ac:appname - Queue string `json:"queue,optional"` - NNode string `json:"nNode,optional"` - SubmitType string `json:"submitType,optional"` - StdInput string `json:"stdInput,optional"` - ClusterType string `json:"clusterType,optional"` - Partition string `json:"partition"` - UserId int64 `json:"userId,optional"` - Token string `json:"token,optional"` - UserIp string `json:"userIp,optional"` + Name string `json:"name"` + Backend string `json:"backend"` // + ClusterId string `json:"clusterId"` + App string `json:"app"` + Description string `json:"description,optional"` + OperateType string `json:"operateType,optional"` + Parameters map[string]string `json:"parameters"` + CustomParams map[string]string `json:"customParams"` } type CommitHpcTaskResp struct { - ClusterId int64 `json:"clusterId"` - JobId string `json:"jobId"` + Code int `json:"code"` + Data Data `json:"data"` + Msg string `json:"msg"` + TraceId string `json:"trace_id"` +} + +type Data struct { + Backend string `json:"backend"` + JobInfo map[string]string `json:"jobInfo"` } type HpcOverViewReq struct { diff --git a/pkg/models/taskhpcmodel_gen.go b/pkg/models/taskhpcmodel_gen.go index 4286f4b6..7a75da86 100644 --- a/pkg/models/taskhpcmodel_gen.go +++ b/pkg/models/taskhpcmodel_gen.go @@ -44,6 +44,8 @@ type ( ClusterId int64 `db:"cluster_id"` //集群id ClusterName string `db:"cluster_name"` //集群名称 Name string `db:"name"` // 名称 + Backend string `db:"backend"` // 平台类型 + OperateType string `db:"operate_type"` // 操作类型 Status string `db:"status"` // 状态 CmdScript string `db:"cmd_script"` StartTime string `db:"start_time"` // 开始时间 @@ -78,6 +80,7 @@ type ( UpdatedBy int64 `db:"updated_by"` // 更新人 UpdatedTime time.Time `db:"updated_time"` // 更新时间 UserId int64 `db:"user_id"` + TimeLimit int64 `db:"time_limit"` } ) From 017e7ce0f3afd6dd6f9979d609e9a01e950f0ddf Mon Sep 17 00:00:00 2001 From: qiwang <1364512070@qq.com> Date: Fri, 14 Mar 2025 10:32:00 +0800 Subject: [PATCH 5/6] fix:update createTraining in modelarts --- go.mod | 2 +- go.sum | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index a1df5104..fc8b143b 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/zeromicro/go-zero v1.7.4 gitlink.org.cn/JointCloud/pcm-ac v0.0.0-20250107025835-8fc888b1d170 gitlink.org.cn/JointCloud/pcm-hpc v0.0.0-20241125115811-72f3568255a4 - gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd + gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250313064001-91fb558cfdb6 gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 gitlink.org.cn/JointCloud/pcm-openi v0.0.0-20250102093846-164b4884c9ec gitlink.org.cn/JointCloud/pcm-openstack v0.0.0-20240403033338-e7edabad4203 diff --git a/go.sum b/go.sum index d2836722..17834ca1 100644 --- a/go.sum +++ b/go.sum @@ -536,6 +536,10 @@ gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250311041651-d676f57aac45 h1:So gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250311041651-d676f57aac45/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd h1:MMF06GNHfYDKhtOneImh1mL5fcgDqOpesZF2fW9oU6A= gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250312043331-e84d101055bd/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250313020604-f0c18343ad05 h1:2YjglJQeesAd3x6Lraq/c0qA2m8kGk8v5IPsu3IfDso= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250313020604-f0c18343ad05/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250313064001-91fb558cfdb6 h1:9o0ONbSiQHTzODptzgtVZjRYFBLncZ6dpHp9YF+v73I= +gitlink.org.cn/JointCloud/pcm-modelarts v0.0.0-20250313064001-91fb558cfdb6/go.mod h1:MxtnJJcU8S4zfGKZVcg2MOXGtwucKy7MMDwA0IemBd0= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110 h1:GaXwr5sgDh0raHjUf9IewTvnRvajYea7zbLsaerYyXo= gitlink.org.cn/JointCloud/pcm-octopus v0.0.0-20240817071412-44397870b110/go.mod h1:QOD5+/l2D+AYBjF2h5T0mdJyfGAmF78QmeKdbBXbjLQ= gitlink.org.cn/JointCloud/pcm-openi v0.0.0-20250102093846-164b4884c9ec h1:Yul2JOAIS94B+eIg0UvmBSe8JrtSrZ2OA47gAYLiBYI= From 77d342ecb89cf7315fc775e350d659aa475dde93 Mon Sep 17 00:00:00 2001 From: jagger Date: Fri, 14 Mar 2025 12:17:23 +0800 Subject: [PATCH 6/6] =?UTF-8?q?=E5=88=A0=E6=8E=89=E4=B8=8D=E5=BF=85?= =?UTF-8?q?=E8=A6=81=E7=9A=84panic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: jagger --- internal/logic/adapters/updateclusterlogic.go | 2 +- .../service/inference/imageInference/imageInference.go | 2 +- internal/svc/servicecontext.go | 6 +----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/internal/logic/adapters/updateclusterlogic.go b/internal/logic/adapters/updateclusterlogic.go index 9d0bc69c..877151c0 100644 --- a/internal/logic/adapters/updateclusterlogic.go +++ b/internal/logic/adapters/updateclusterlogic.go @@ -60,7 +60,7 @@ func (l *UpdateClusterLogic) UpdateCluster(req *types.ClusterCreateReq) (resp *t }).Create(&resourceCost) if dbResult.Error != nil { - panic(dbResult.Error) + return nil, dbResult.Error } return } diff --git a/internal/scheduler/service/inference/imageInference/imageInference.go b/internal/scheduler/service/inference/imageInference/imageInference.go index 891ea225..b9cfa5e8 100644 --- a/internal/scheduler/service/inference/imageInference/imageInference.go +++ b/internal/scheduler/service/inference/imageInference/imageInference.go @@ -417,7 +417,7 @@ func (i *ImageInference) saveAiSubTasks(id int64, aiTaskList []*models.TaskAi, c } err := i.storage.SaveAiTaskImageSubTask(&taskAiSub) if err != nil { - panic(err) + return err } } } diff --git a/internal/svc/servicecontext.go b/internal/svc/servicecontext.go index 216db7fe..f79940f6 100644 --- a/internal/svc/servicecontext.go +++ b/internal/svc/servicecontext.go @@ -63,15 +63,11 @@ func NewServiceContext(c config.Config) *ServiceContext { if err != nil { logx.Errorf("InitPrometheus err: %v", err) - panic("InitSnowflake err") + panic("InitPrometheus err") } httpClient := resty.New() httpClient.SetTimeout(1 * time.Second) alertClient := tracker.NewAlertClient(c.Monitoring.AlertUrl) - if err != nil { - logx.Errorf("InitPrometheus err: %v", err) - panic("InitSnowflake err") - } //添加snowflake支持 err = utils.InitSnowflake(c.SnowflakeConf.MachineId)