|
|
|
@@ -2,11 +2,13 @@ package storeLink |
|
|
|
|
|
|
|
import ( |
|
|
|
"context" |
|
|
|
"errors" |
|
|
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc" |
|
|
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models" |
|
|
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils" |
|
|
|
"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils" |
|
|
|
"gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC" |
|
|
|
"strings" |
|
|
|
"time" |
|
|
|
) |
|
|
|
|
|
|
|
@@ -16,6 +18,16 @@ type ShuguangAi struct { |
|
|
|
participant *models.ScParticipantPhyInfo |
|
|
|
} |
|
|
|
|
|
|
|
// Defaults and formatting tokens used by SubmitTask when it builds a
// ShuguangAi PyTorch task request.
const (
	// SHUGUANGAI_RESOURCE_ID is the custom resource ID that SubmitTask
	// compares the caller-supplied resourceId against; any other value is
	// rejected.
	SHUGUANGAI_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"

	// PY_PARAM_PREFIX is prepended to every python argument name.
	PY_PARAM_PREFIX = "--"
	// SPACE separates consecutive python arguments.
	SPACE = " "

	// Per-task worker resources sent in the submit request.
	WORKER_NUMBER     = 1
	WORKER_CPU_NUMBER = 5
	WORKER_GPU_NUMBER = 1
	// 10G per the original note; the unit is presumably MB — TODO confirm
	// against the hpcAC API.
	WORKER_RAM_SIZE = 10 * 1024
)
|
|
|
|
|
|
|
func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.ScParticipantPhyInfo) *ShuguangAi { |
|
|
|
return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, participant: participant} |
|
|
|
} |
|
|
|
@@ -51,6 +63,11 @@ func (s *ShuguangAi) QueryImageList() (interface{}, error) { |
|
|
|
func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) { |
|
|
|
// shuguangAi提交任务 |
|
|
|
|
|
|
|
//判断是否resourceId匹配自定义资源Id |
|
|
|
if resourceId != SHUGUANGAI_RESOURCE_ID { |
|
|
|
return nil, errors.New("shuguangAi资源Id不存在") |
|
|
|
} |
|
|
|
|
|
|
|
//根据imageId获取imagePath, version |
|
|
|
imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId} |
|
|
|
imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq) |
|
|
|
@@ -60,6 +77,13 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res |
|
|
|
|
|
|
|
dateStr := timeutils.UnixTimeToString(time.Now().Unix()) |
|
|
|
|
|
|
|
//python参数 |
|
|
|
var pythonArg string |
|
|
|
for _, param := range params { |
|
|
|
s := strings.Split(param, COMMA) |
|
|
|
pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE |
|
|
|
} |
|
|
|
|
|
|
|
req := &hpcAC.SubmitPytorchTaskReq{ |
|
|
|
Params: &hpcAC.SubmitPytorchTaskParams{ |
|
|
|
TaskName: TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr, |
|
|
|
@@ -70,13 +94,14 @@ func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, res |
|
|
|
AcceleratorType: DCU, |
|
|
|
Version: imageResp.Image.Version, |
|
|
|
ImagePath: imageResp.Image.Path, |
|
|
|
WorkerNumber: 1, |
|
|
|
WorkerCpuNumber: "1", |
|
|
|
WorkerGpuNumber: 1, |
|
|
|
WorkerRamSize: 1024, |
|
|
|
WorkerNumber: WORKER_NUMBER, |
|
|
|
WorkerCpuNumber: WORKER_CPU_NUMBER, |
|
|
|
WorkerGpuNumber: WORKER_GPU_NUMBER, |
|
|
|
WorkerRamSize: WORKER_RAM_SIZE, |
|
|
|
ResourceGroup: RESOURCE_GROUP, |
|
|
|
TimeoutLimit: TimeoutLimit, |
|
|
|
PythonCodePath: PythonCodePath, |
|
|
|
PythonArg: pythonArg, |
|
|
|
}, |
|
|
|
} |
|
|
|
resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req) |
|
|
|
|