From e896af8b2939e952dcaca25ad2fccda7b2d67596 Mon Sep 17 00:00:00 2001 From: tzwang Date: Fri, 27 Oct 2023 17:08:39 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AD=98=E7=AE=97=E8=81=94=E5=8A=A8=E6=8F=90?= =?UTF-8?q?=E4=BA=A4=E6=8E=A5=E5=8F=A3=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Former-commit-id: 6b5a30defe9fd5510936d8299ca9e2ff53664cfb --- api/internal/storeLink/shuguangai.go | 33 ++++++++++++++++++++++++---- api/internal/storeLink/storeLink.go | 1 + go.mod | 2 +- go.sum | 4 ++-- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/api/internal/storeLink/shuguangai.go b/api/internal/storeLink/shuguangai.go index bbe93782..7ca68343 100644 --- a/api/internal/storeLink/shuguangai.go +++ b/api/internal/storeLink/shuguangai.go @@ -2,11 +2,13 @@ package storeLink import ( "context" + "errors" "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils" "gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC" + "strings" "time" ) @@ -16,6 +18,16 @@ type ShuguangAi struct { participant *models.ScParticipantPhyInfo } +const ( + WORKER_RAM_SIZE = 10240 // 10G + WORKER_NUMBER = 1 + WORKER_CPU_NUMBER = 5 + WORKER_GPU_NUMBER = 1 + PY_PARAM_PREFIX = "--" + SPACE = " " + SHUGUANGAI_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi" +) + func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.ScParticipantPhyInfo) *ShuguangAi { return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, participant: participant} } @@ -51,6 +63,11 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) { func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) { // shuguangAi提交任务 + //判断是否resourceId匹配自定义资源Id + if resourceId != SHUGUANGAI_RESOURCE_ID { + return nil, errors.New("shuguangAi资源Id不存在") + } + //根据imageId获取imagePath, version imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId} imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq) @@ -60,6 +77,13 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res dateStr := timeutils.UnixTimeToString(time.Now().Unix()) + //python参数 + var pythonArg string + for _, param := range params { + s := strings.Split(param, COMMA) + pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE + } + req := &hpcAC.SubmitPytorchTaskReq{ Params: &hpcAC.SubmitPytorchTaskParams{ TaskName: TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr, @@ -70,13 +94,14 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res AcceleratorType: DCU, Version: imageResp.Image.Version, ImagePath: imageResp.Image.Path, - WorkerNumber: 1, - WorkerCpuNumber: "1", - WorkerGpuNumber: 1, - WorkerRamSize: 1024, + WorkerNumber: WORKER_NUMBER, + WorkerCpuNumber: WORKER_CPU_NUMBER, + WorkerGpuNumber: WORKER_GPU_NUMBER, + WorkerRamSize: WORKER_RAM_SIZE, ResourceGroup: RESOURCE_GROUP, TimeoutLimit: TimeoutLimit, PythonCodePath: PythonCodePath, + PythonArg: pythonArg, }, } resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req) diff --git a/api/internal/storeLink/storeLink.go b/api/internal/storeLink/storeLink.go index bc61bc2f..e1dc6769 100644 --- a/api/internal/storeLink/storeLink.go +++ b/api/internal/storeLink/storeLink.go @@ -315,6 +315,7 @@ func ConvertType[T any](in *T, participant *models.ScParticipantPhyInfo) (interf resp.Success = true spec.ParticipantName = participant.Name spec.ParticipantId = strconv.FormatInt(participant.Id, 10) + spec.SpecId = SHUGUANGAI_RESOURCE_ID resp.ResourceSpecs = append(resp.ResourceSpecs, &spec) } return resp, nil diff --git a/go.mod b/go.mod index 28ee5001..1ac1f7ec 100644 --- a/go.mod +++ b/go.mod @@ -21,7 +21,7 @@ require ( github.com/shopspring/decimal v1.3.1 github.com/zeromicro/go-queue v1.1.8 github.com/zeromicro/go-zero v1.5.5 - gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d + gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef gitlink.org.cn/jcce-pcm/pcm-participant-modelarts v0.0.0-20231024115530-f6fd0505d2a1 diff --git a/go.sum b/go.sum index f01419c4..59447806 100644 --- a/go.sum +++ b/go.sum @@ -1033,8 +1033,8 @@ github.com/zeromicro/go-zero v1.4.3/go.mod h1:UmDjuW7LHd9j7+nnnPBcXF0HLNmjJw6OjH github.com/zeromicro/go-zero v1.5.1/go.mod h1:bGYm4XWsGN9GhDsO2O2BngpVoWjf3Eog2a5hUOMhlXs= github.com/zeromicro/go-zero v1.5.3 h1:9poyd+raeL7gSMUu6P19N7bssTppieR2j7Oos2j1yFQ= github.com/zeromicro/go-zero v1.5.3/go.mod h1:dmoBpgJTxt9KWmgrNGpv06XxZRPXMakrxUVgROFAR3g= -gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d h1:CY4pWM8JVRXBtD5CdVZC0fe4xUxjHmQegdwpHBaOBes= -gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231026084523-f76f3da5525d/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o= +gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31 h1:SppjTZvObJgqliPk1wSeuezQu1k/tMGcyVaMVEaDIUU= +gitlink.org.cn/jcce-pcm/pcm-participant-ac v0.0.0-20231027084000-16876da5aa31/go.mod h1:DY45tXlPBWBptj9YjCHWnAK5LshvJ33PjFkE5/vtd4o= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835 h1:WDCPqD8IrepGJXankkpG14Ny6inh9AldB0RX9WWa+ck= gitlink.org.cn/jcce-pcm/pcm-participant-ceph v0.0.0-20230817103341-2459e5bfc835/go.mod h1:r/KLzUpupCV5jdxSfgDhc2pVjP0fBi3VhAWRttsBn30= gitlink.org.cn/jcce-pcm/pcm-participant-kubernetes v0.0.0-20230830120334-bf6d99c715ef h1:s7JfXjka2MhGaDjKMJ57fj0k3XuDB6w+UlYHFLyJlUY=