- package storeLink
-
- import (
- "context"
- "errors"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils"
- "gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC"
- "strings"
- "time"
- )
-
- type ShuguangAi struct {
- ctx context.Context
- svcCtx *svc.ServiceContext
- participant *models.StorelinkCenter
- }
-
// Fixed settings used when submitting ShuguangAi PyTorch tasks.
const (
	// Per-worker resources requested for a task.
	WORKER_RAM_SIZE   = 10 * 1024 // 10G (presumably expressed in MB — confirm against the AC API)
	WORKER_NUMBER     = 1
	WORKER_CPU_NUMBER = 5
	WORKER_GPU_NUMBER = 1

	// Pieces used to assemble the python argument string.
	PY_PARAM_PREFIX = "--"
	SPACE           = " "

	// The only resource spec accepted by SubmitTask, and its display name.
	SHUGUANGAI_CUSTOM_RESOURCE_ID   = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
	SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB"
)
-
- func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.StorelinkCenter) *ShuguangAi {
- return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, participant: participant}
- }
-
- func (s *ShuguangAi) UploadImage(path string) (interface{}, error) {
- return nil, nil
- }
-
- func (s *ShuguangAi) DeleteImage(imageId string) (interface{}, error) {
- return nil, nil
- }
-
- func (s *ShuguangAi) QueryImageList() (interface{}, error) {
- // shuguangAi获取镜像列表
- req := &hpcAC.GetImageListAiReq{
- AcceleratorType: DCU,
- TaskType: PYTORCH,
- }
- resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- //转换成统一返回类型
- imgListResp, err := ConvertType[hpcAC.GetImageListAiResp](resp, nil)
- if err != nil {
- return nil, err
- }
-
- return imgListResp, nil
- }
-
- func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) {
- // shuguangAi提交任务
-
- //判断是否resourceId匹配自定义资源Id
- if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
- return nil, errors.New("shuguangAi资源Id不存在")
- }
-
- //根据imageId获取imagePath, version
- imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
- imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
- if err != nil {
- return nil, err
- }
-
- dateStr := timeutils.UnixTimeToString(time.Now().Unix())
-
- //python参数
- var pythonArg string
- for _, param := range params {
- s := strings.Split(param, COMMA)
- pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE
- }
-
- req := &hpcAC.SubmitPytorchTaskReq{
- Params: &hpcAC.SubmitPytorchTaskParams{
- TaskName: TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr,
- WorkPath: WorkPath,
- IsDistributed: false,
- IsHvd: false,
- //Env:
- AcceleratorType: DCU,
- Version: imageResp.Image.Version,
- ImagePath: imageResp.Image.Path,
- WorkerNumber: WORKER_NUMBER,
- WorkerCpuNumber: WORKER_CPU_NUMBER,
- WorkerGpuNumber: WORKER_GPU_NUMBER,
- WorkerRamSize: WORKER_RAM_SIZE,
- ResourceGroup: RESOURCE_GROUP,
- TimeoutLimit: TimeoutLimit,
- PythonCodePath: PythonCodePath,
- PythonArg: pythonArg,
- },
- }
- resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- //转换成统一返回类型
- submitResp, err := ConvertType[hpcAC.SubmitTaskAiResp](resp, nil)
- if err != nil {
- return nil, err
- }
-
- return submitResp, nil
- }
-
- func (s *ShuguangAi) QueryTask(taskId string) (interface{}, error) {
- // shuguangAi获取任务
- req := &hpcAC.GetPytorchTaskReq{
- Id: taskId,
- }
- resp, err := s.svcCtx.ACRpc.GetPytorchTask(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- //转换成统一返回类型
- taskResp, err := ConvertType[hpcAC.GetPytorchTaskResp](resp, nil)
- if err != nil {
- return nil, err
- }
-
- return taskResp, nil
- }
-
- func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
- // shuguangAi删除任务
- req := &hpcAC.DeleteTaskAiReq{
- Ids: taskId,
- }
- resp, err := s.svcCtx.ACRpc.DeleteTaskAi(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- //转换成统一返回类型
- deleteResp, err := ConvertType[hpcAC.DeleteTaskAiResp](resp, nil)
- if err != nil {
- return nil, err
- }
-
- return deleteResp, nil
- }
-
- func (o *ShuguangAi) QuerySpecs() (interface{}, error) {
- // ShuguangAi查询资源规格
- req := &hpcAC.GetResourceSpecReq{
- AcceleratorType: DCU,
- ResourceGroup: RESOURCE_GROUP,
- }
- specs, err := o.svcCtx.ACRpc.GetResourceSpec(o.ctx, req)
- if err != nil {
- return nil, err
- }
-
- //转换成统一返回类型
- specsResp, err := ConvertType[hpcAC.GetResourceSpecResp](specs, o.participant)
- if err != nil {
- return nil, err
- }
-
- return specsResp, nil
- }
|