- /*
-
- Copyright (c) [2023] [pcm]
- [pcm-coordinator] is licensed under Mulan PSL v2.
- You can use this software according to the terms and conditions of the Mulan PSL v2.
- You may obtain a copy of Mulan PSL v2 at:
- http://license.coscl.org.cn/MulanPSL2
- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
- See the Mulan PSL v2 for more details.
-
- */
-
- package storeLink
-
- import (
- "context"
- "errors"
- "gitlink.org.cn/jcce-pcm/pcm-ac/hpcAC"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/schedulers/option"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/service/collector"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
- "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
- "strings"
- )
-
// ShuguangAi holds the state its methods need to talk to the ShuguangAi
// platform through the AC RPC client exposed by svcCtx (svcCtx.ACRpc).
type ShuguangAi struct {
	ctx           context.Context     // context handed to every ACRpc call made by the methods
	svcCtx        *svc.ServiceContext // service context; its ACRpc client performs the remote calls
	platform      string              // populated from the name argument of NewShuguangAi
	participantId int64               // populated from the id argument of NewShuguangAi
}
-
// Fixed settings used by the ShuguangAi methods when building AC RPC requests.
const (
	// Worker sizing sent with every submitted PyTorch task.
	WORKER_RAM_SIZE   = 10240 // 10G per the original note; presumably expressed in MB — TODO confirm
	WORKER_NUMBER     = 1
	WORKER_CPU_NUMBER = 5
	WORKER_GPU_NUMBER = 1
	// The only resourceId SubmitTask accepts.
	SHUGUANGAI_CUSTOM_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
	// Name for the custom resource; not referenced by the code visible here.
	SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB"
	// Accelerator type sent in image-list, task-submit and resource-spec requests.
	DCU = "dcu"
	// Task type sent when listing images.
	PYTORCH = "Pytorch"
	// Prefix of the generated task name in SubmitTask.
	TASK_PYTORCH_PREFIX = "PytorchTask"
	// Not referenced by the code visible here.
	TENSORFLOW = "Tensorflow"
	// Resource group sent in task-submit and resource-spec requests.
	RESOURCE_GROUP = "wzhdtest"
	// Working directory sent with every submitted task.
	WorkPath = "/work/home/acgnnmfbwo/111111/py/"
	// Timeout sent with every submitted task; presumably HH:MM:SS — TODO confirm.
	TimeoutLimit = "10:00:00"
	// Python script path sent with every submitted task.
	PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
)
-
- func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *ShuguangAi {
- return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, platform: name, participantId: id}
- }
-
- func (s *ShuguangAi) UploadImage(path string) (interface{}, error) {
- return nil, nil
- }
-
- func (s *ShuguangAi) DeleteImage(imageId string) (interface{}, error) {
- return nil, nil
- }
-
- func (s *ShuguangAi) QueryImageList() (interface{}, error) {
- // shuguangAi获取镜像列表
- req := &hpcAC.GetImageListAiReq{
- AcceleratorType: DCU,
- TaskType: PYTORCH,
- }
- resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (s *ShuguangAi) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
- // shuguangAi提交任务
-
- //判断是否resourceId匹配自定义资源Id
- if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
- return nil, errors.New("shuguangAi资源Id不存在")
- }
-
- //根据imageId获取imagePath, version
- imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
- imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
- if err != nil {
- return nil, err
- }
-
- //python参数
- var pythonArg string
- for _, param := range params {
- s := strings.Split(param, COMMA)
- pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE
- }
-
- //环境变量
- var env string
- for _, e := range envs {
- s := strings.Split(e, COMMA)
- env += s[0] + "=" + s[1] + SPACE
- }
-
- req := &hpcAC.SubmitPytorchTaskReq{
- Params: &hpcAC.SubmitPytorchTaskParams{
- TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10),
- WorkPath: WorkPath,
- IsDistributed: false,
- IsHvd: false,
- Env: env,
- AcceleratorType: DCU,
- Version: imageResp.Image.Version,
- ImagePath: imageResp.Image.Path,
- WorkerNumber: WORKER_NUMBER,
- WorkerCpuNumber: WORKER_CPU_NUMBER,
- WorkerGpuNumber: WORKER_GPU_NUMBER,
- WorkerRamSize: WORKER_RAM_SIZE,
- ResourceGroup: RESOURCE_GROUP,
- TimeoutLimit: TimeoutLimit,
- PythonCodePath: PythonCodePath,
- PythonArg: pythonArg,
- },
- }
- resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (s *ShuguangAi) QueryTask(taskId string) (interface{}, error) {
- // shuguangAi获取任务
- req := &hpcAC.GetPytorchTaskReq{
- Id: taskId,
- }
- resp, err := s.svcCtx.ACRpc.GetPytorchTask(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
- // shuguangAi删除任务
- req := &hpcAC.DeleteTaskAiReq{
- Ids: taskId,
- }
- resp, err := s.svcCtx.ACRpc.DeleteTaskAi(s.ctx, req)
- if err != nil {
- return nil, err
- }
-
- return resp, nil
- }
-
- func (o *ShuguangAi) QuerySpecs() (interface{}, error) {
- // ShuguangAi查询资源规格
- req := &hpcAC.GetResourceSpecReq{
- AcceleratorType: DCU,
- ResourceGroup: RESOURCE_GROUP,
- }
- specs, err := o.svcCtx.ACRpc.GetResourceSpec(o.ctx, req)
- if err != nil {
- return nil, err
- }
-
- return specs, nil
- }
-
- func (o *ShuguangAi) GetResourceSpecs() (*collector.ResourceSpecs, error) {
- return nil, nil
- }
-
- func (o *ShuguangAi) Execute(option option.AiOption) (interface{}, error) {
- return nil, nil
- }
|