您最多选择25个标签 标签必须以中文、字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

shuguangai.go 5.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. /*
  2. Copyright (c) [2023] [pcm]
  3. [pcm-coordinator] is licensed under Mulan PSL v2.
  4. You can use this software according to the terms and conditions of the Mulan PSL v2.
  5. You may obtain a copy of Mulan PSL v2 at:
  6. http://license.coscl.org.cn/MulanPSL2
  7. THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
  8. EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
  9. MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
  10. See the Mulan PSL v2 for more details.
  11. */
  12. package storeLink
  13. import (
  14. "context"
  15. "errors"
  16. "gitlink.org.cn/jcce-pcm/pcm-ac/hpcAC"
  17. "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/schedulers/option"
  18. "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/scheduler/service/collector"
  19. "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
  20. "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
  21. "strings"
  22. )
  23. type ShuguangAi struct {
  24. ctx context.Context
  25. svcCtx *svc.ServiceContext
  26. platform string
  27. participantId int64
  28. }
  // Fixed worker sizing that SubmitTask sends with every Pytorch task.
  const (
  	WORKER_RAM_SIZE   = 10240 // 10G (unit presumably MB — TODO confirm)
  	WORKER_NUMBER     = 1
  	WORKER_CPU_NUMBER = 5
  	WORKER_GPU_NUMBER = 1
  )

  // Identity of the only custom resource that SubmitTask accepts.
  const (
  	SHUGUANGAI_CUSTOM_RESOURCE_ID   = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
  	SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB"
  )

  // Accelerator, framework and resource-group identifiers used in requests.
  // TENSORFLOW is not referenced by the methods visible in this file.
  const (
  	DCU                 = "dcu"
  	PYTORCH             = "Pytorch"
  	TASK_PYTORCH_PREFIX = "PytorchTask"
  	TENSORFLOW          = "Tensorflow"
  	RESOURCE_GROUP      = "wzhdtest"
  )

  // Hard-coded job settings passed through unchanged by SubmitTask.
  const (
  	WorkPath       = "/work/home/acgnnmfbwo/111111/py/"
  	TimeoutLimit   = "10:00:00"
  	PythonCodePath = "/work/home/acgnnmfbwo/111111/py/test.py"
  )
  45. func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, name string, id int64) *ShuguangAi {
  46. return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, platform: name, participantId: id}
  47. }
  48. func (s *ShuguangAi) UploadImage(path string) (interface{}, error) {
  49. return nil, nil
  50. }
  51. func (s *ShuguangAi) DeleteImage(imageId string) (interface{}, error) {
  52. return nil, nil
  53. }
  54. func (s *ShuguangAi) QueryImageList() (interface{}, error) {
  55. // shuguangAi获取镜像列表
  56. req := &hpcAC.GetImageListAiReq{
  57. AcceleratorType: DCU,
  58. TaskType: PYTORCH,
  59. }
  60. resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
  61. if err != nil {
  62. return nil, err
  63. }
  64. return resp, nil
  65. }
  66. func (s *ShuguangAi) SubmitTask(imageId string, cmd string, envs []string, params []string, resourceId string) (interface{}, error) {
  67. // shuguangAi提交任务
  68. //判断是否resourceId匹配自定义资源Id
  69. if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
  70. return nil, errors.New("shuguangAi资源Id不存在")
  71. }
  72. //根据imageId获取imagePath, version
  73. imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
  74. imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
  75. if err != nil {
  76. return nil, err
  77. }
  78. //python参数
  79. var pythonArg string
  80. for _, param := range params {
  81. s := strings.Split(param, COMMA)
  82. pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE
  83. }
  84. //环境变量
  85. var env string
  86. for _, e := range envs {
  87. s := strings.Split(e, COMMA)
  88. env += s[0] + "=" + s[1] + SPACE
  89. }
  90. req := &hpcAC.SubmitPytorchTaskReq{
  91. Params: &hpcAC.SubmitPytorchTaskParams{
  92. TaskName: TASK_PYTORCH_PREFIX + UNDERSCORE + utils.RandomString(10),
  93. WorkPath: WorkPath,
  94. IsDistributed: false,
  95. IsHvd: false,
  96. Env: env,
  97. AcceleratorType: DCU,
  98. Version: imageResp.Image.Version,
  99. ImagePath: imageResp.Image.Path,
  100. WorkerNumber: WORKER_NUMBER,
  101. WorkerCpuNumber: WORKER_CPU_NUMBER,
  102. WorkerGpuNumber: WORKER_GPU_NUMBER,
  103. WorkerRamSize: WORKER_RAM_SIZE,
  104. ResourceGroup: RESOURCE_GROUP,
  105. TimeoutLimit: TimeoutLimit,
  106. PythonCodePath: PythonCodePath,
  107. PythonArg: pythonArg,
  108. },
  109. }
  110. resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
  111. if err != nil {
  112. return nil, err
  113. }
  114. return resp, nil
  115. }
  116. func (s *ShuguangAi) QueryTask(taskId string) (interface{}, error) {
  117. // shuguangAi获取任务
  118. req := &hpcAC.GetPytorchTaskReq{
  119. Id: taskId,
  120. }
  121. resp, err := s.svcCtx.ACRpc.GetPytorchTask(s.ctx, req)
  122. if err != nil {
  123. return nil, err
  124. }
  125. return resp, nil
  126. }
  127. func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
  128. // shuguangAi删除任务
  129. req := &hpcAC.DeleteTaskAiReq{
  130. Ids: taskId,
  131. }
  132. resp, err := s.svcCtx.ACRpc.DeleteTaskAi(s.ctx, req)
  133. if err != nil {
  134. return nil, err
  135. }
  136. return resp, nil
  137. }
  138. func (o *ShuguangAi) QuerySpecs() (interface{}, error) {
  139. // ShuguangAi查询资源规格
  140. req := &hpcAC.GetResourceSpecReq{
  141. AcceleratorType: DCU,
  142. ResourceGroup: RESOURCE_GROUP,
  143. }
  144. specs, err := o.svcCtx.ACRpc.GetResourceSpec(o.ctx, req)
  145. if err != nil {
  146. return nil, err
  147. }
  148. return specs, nil
  149. }
  150. func (o *ShuguangAi) GetResourceSpecs() (*collector.ResourceSpecs, error) {
  151. return nil, nil
  152. }
  153. func (o *ShuguangAi) Execute(option option.AiOption) (interface{}, error) {
  154. return nil, nil
  155. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem for heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.