package storeLink

import (
	"context"
	"errors"
	"strings"
	"time"

	"gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
	"gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils"
	"gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC"
)

// ShuguangAi issues image and task requests for the ShuguangAi platform
// through the ACRpc client held by the service context.
type ShuguangAi struct {
	ctx         context.Context
	svcCtx      *svc.ServiceContext
	participant *models.StorelinkCenter
}

// Fixed worker settings and custom-resource identifiers used when submitting
// a ShuguangAi task.
const (
	WORKER_RAM_SIZE   = 10240 // 10G
	WORKER_NUMBER     = 1
	WORKER_CPU_NUMBER = 5
	WORKER_GPU_NUMBER = 1

	// PY_PARAM_PREFIX is prepended to every python argument key.
	PY_PARAM_PREFIX = "--"
	// SPACE is appended after every rendered python argument.
	SPACE = " "

	// SHUGUANGAI_CUSTOM_RESOURCE_ID is the only resource id SubmitTask accepts.
	SHUGUANGAI_CUSTOM_RESOURCE_ID   = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
	SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB"
)

// NewShuguangAi returns a ShuguangAi that uses ctx for every RPC call, svcCtx
// for the ACRpc client, and participant when converting resource specs.
func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.StorelinkCenter) *ShuguangAi {
	return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, participant: participant}
}

// UploadImage is not implemented; it ignores path and always returns nil, nil.
func (s *ShuguangAi) UploadImage(path string) (interface{}, error) {
	return nil, nil
}

// DeleteImage is not implemented; it ignores imageId and always returns nil, nil.
func (s *ShuguangAi) DeleteImage(imageId string) (interface{}, error) {
	return nil, nil
}

// QueryImageList requests the DCU/PYTORCH image list via ACRpc.GetImageListAi
// and returns the response after passing it through ConvertType.
func (s *ShuguangAi) QueryImageList() (interface{}, error) {
	req := &hpcAC.GetImageListAiReq{
		AcceleratorType: DCU,
		TaskType:        PYTORCH,
	}
	resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
	if err != nil {
		return nil, err
	}

	// Convert to the unified return type.
	imgListResp, err := ConvertType[hpcAC.GetImageListAiResp](resp, nil)
	if err != nil {
		return nil, err
	}

	return imgListResp, nil
}

// SubmitTask submits a PyTorch task via ACRpc.SubmitPytorchTask.
//
// resourceId must equal SHUGUANGAI_CUSTOM_RESOURCE_ID, otherwise an error is
// returned before any RPC is made. imageId is resolved to an image path and
// version through ACRpc.GetImageAiById. Each entry of params is split on COMMA
// and rendered as "--key=value "; an entry without the separator yields an
// error instead of a panic. cmd is currently unused.
func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) {
	// Only the custom resource id is supported.
	if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
		return nil, errors.New("shuguangAi资源Id不存在")
	}

	// Look up the image path and version by imageId.
	imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
	imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
	if err != nil {
		return nil, err
	}

	dateStr := timeutils.UnixTimeToString(time.Now().Unix())

	// Render the python command-line arguments.
	pythonArg, err := shuguangAiPythonArg(params)
	if err != nil {
		return nil, err
	}

	req := &hpcAC.SubmitPytorchTaskReq{
		Params: &hpcAC.SubmitPytorchTaskParams{
			TaskName:      TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr,
			WorkPath:      WorkPath,
			IsDistributed: false,
			IsHvd:         false,
			//Env:
			AcceleratorType: DCU,
			Version:         imageResp.Image.Version,
			ImagePath:       imageResp.Image.Path,
			WorkerNumber:    WORKER_NUMBER,
			WorkerCpuNumber: WORKER_CPU_NUMBER,
			WorkerGpuNumber: WORKER_GPU_NUMBER,
			WorkerRamSize:   WORKER_RAM_SIZE,
			ResourceGroup:   RESOURCE_GROUP,
			TimeoutLimit:    TimeoutLimit,
			PythonCodePath:  PythonCodePath,
			PythonArg:       pythonArg,
		},
	}

	resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
	if err != nil {
		return nil, err
	}

	// Convert to the unified return type.
	submitResp, err := ConvertType[hpcAC.SubmitTaskAiResp](resp, nil)
	if err != nil {
		return nil, err
	}

	return submitResp, nil
}

// shuguangAiPythonArg renders params as a sequence of
// PY_PARAM_PREFIX + key + "=" + value + SPACE fragments. Each param is split
// on COMMA; the first two fields are used as key and value and any further
// fields are ignored. A param that yields fewer than two fields is reported
// as an error.
func shuguangAiPythonArg(params []string) (string, error) {
	var b strings.Builder
	for _, param := range params {
		kv := strings.Split(param, COMMA)
		if len(kv) < 2 {
			return "", errors.New("shuguangAi: invalid python param, expected key and value: " + param)
		}
		b.WriteString(PY_PARAM_PREFIX)
		b.WriteString(kv[0])
		b.WriteString("=")
		b.WriteString(kv[1])
		b.WriteString(SPACE)
	}
	return b.String(), nil
}

// QueryTask fetches the PyTorch task identified by taskId via
// ACRpc.GetPytorchTask and returns the response after passing it through
// ConvertType.
func (s *ShuguangAi) QueryTask(taskId string) (interface{}, error) {
	req := &hpcAC.GetPytorchTaskReq{
		Id: taskId,
	}
	resp, err := s.svcCtx.ACRpc.GetPytorchTask(s.ctx, req)
	if err != nil {
		return nil, err
	}

	// Convert to the unified return type.
	taskResp, err := ConvertType[hpcAC.GetPytorchTaskResp](resp, nil)
	if err != nil {
		return nil, err
	}

	return taskResp, nil
}

// DeleteTask deletes the task identified by taskId via ACRpc.DeleteTaskAi and
// returns the response after passing it through ConvertType.
func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
	req := &hpcAC.DeleteTaskAiReq{
		Ids: taskId,
	}
	resp, err := s.svcCtx.ACRpc.DeleteTaskAi(s.ctx, req)
	if err != nil {
		return nil, err
	}

	// Convert to the unified return type.
	deleteResp, err := ConvertType[hpcAC.DeleteTaskAiResp](resp, nil)
	if err != nil {
		return nil, err
	}

	return deleteResp, nil
}

// QuerySpecs queries the DCU resource specs of RESOURCE_GROUP via
// ACRpc.GetResourceSpec and returns the response after passing it, together
// with the participant, through ConvertType.
func (s *ShuguangAi) QuerySpecs() (interface{}, error) {
	req := &hpcAC.GetResourceSpecReq{
		AcceleratorType: DCU,
		ResourceGroup:   RESOURCE_GROUP,
	}
	specs, err := s.svcCtx.ACRpc.GetResourceSpec(s.ctx, req)
	if err != nil {
		return nil, err
	}

	// Convert to the unified return type.
	specsResp, err := ConvertType[hpcAC.GetResourceSpecResp](specs, s.participant)
	if err != nil {
		return nil, err
	}

	return specsResp, nil
}