You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

shuguangai.go 4.6 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
  1. package storeLink
  2. import (
  3. "context"
  4. "errors"
  5. "gitlink.org.cn/jcce-pcm/pcm-coordinator/api/internal/svc"
  6. "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/models"
  7. "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils"
  8. "gitlink.org.cn/jcce-pcm/pcm-coordinator/pkg/utils/timeutils"
  9. "gitlink.org.cn/jcce-pcm/pcm-participant-ac/hpcAC"
  10. "strings"
  11. "time"
  12. )
// ShuguangAi groups the state shared by the ShuguangAi store-link methods in
// this file: the caller's context, the service context whose ACRpc client the
// methods call, and the participant record.
type ShuguangAi struct {
	// ctx is forwarded as the context of every ACRpc call made by the methods.
	ctx context.Context
	// svcCtx provides the ACRpc client the methods use.
	svcCtx *svc.ServiceContext
	// participant is passed to ConvertType by QuerySpecs.
	participant *models.StorelinkCenter
}
// Fixed settings used by SubmitTask when it builds a PyTorch task request.
const (
	// Worker memory size; the original note says 10G, so the unit is presumably MB — TODO confirm.
	WORKER_RAM_SIZE = 10240
	// Number of workers requested per task.
	WORKER_NUMBER = 1
	// Number of CPUs requested per worker.
	WORKER_CPU_NUMBER = 5
	// Value sent as WorkerGpuNumber (the accelerator type used with it is DCU).
	WORKER_GPU_NUMBER = 1
	// Prefix placed in front of every python argument name.
	PY_PARAM_PREFIX = "--"
	// Separator appended after every python argument.
	SPACE = " "
	// The only resource id SubmitTask accepts.
	SHUGUANGAI_CUSTOM_RESOURCE_ID = "WodTB2rJ8SobMgQ1nrtR245jxOrsovFi"
	// Display name for the custom resource; not referenced in this file — presumably shown to users, verify against callers.
	SHUGUANGAI_CUSTOM_RESOURCE_NAME = "1*DCU, CPU:5, 内存:10GB"
)
  28. func NewShuguangAi(ctx context.Context, svcCtx *svc.ServiceContext, participant *models.StorelinkCenter) *ShuguangAi {
  29. return &ShuguangAi{ctx: ctx, svcCtx: svcCtx, participant: participant}
  30. }
// UploadImage is a stub: it ignores path and always returns (nil, nil).
func (s *ShuguangAi) UploadImage(path string) (interface{}, error) {
	return nil, nil
}
// DeleteImage is a stub: it ignores imageId and always returns (nil, nil).
func (s *ShuguangAi) DeleteImage(imageId string) (interface{}, error) {
	return nil, nil
}
  37. func (s *ShuguangAi) QueryImageList() (interface{}, error) {
  38. // shuguangAi获取镜像列表
  39. req := &hpcAC.GetImageListAiReq{
  40. AcceleratorType: DCU,
  41. TaskType: PYTORCH,
  42. }
  43. resp, err := s.svcCtx.ACRpc.GetImageListAi(s.ctx, req)
  44. if err != nil {
  45. return nil, err
  46. }
  47. //转换成统一返回类型
  48. imgListResp, err := ConvertType[hpcAC.GetImageListAiResp](resp, nil)
  49. if err != nil {
  50. return nil, err
  51. }
  52. return imgListResp, nil
  53. }
  54. func (s *ShuguangAi) SubmitTask(imageId string, cmd string, params []string, resourceId string) (interface{}, error) {
  55. // shuguangAi提交任务
  56. //判断是否resourceId匹配自定义资源Id
  57. if resourceId != SHUGUANGAI_CUSTOM_RESOURCE_ID {
  58. return nil, errors.New("shuguangAi资源Id不存在")
  59. }
  60. //根据imageId获取imagePath, version
  61. imageReq := &hpcAC.GetImageAiByIdReq{ImageId: imageId}
  62. imageResp, err := s.svcCtx.ACRpc.GetImageAiById(s.ctx, imageReq)
  63. if err != nil {
  64. return nil, err
  65. }
  66. dateStr := timeutils.UnixTimeToString(time.Now().Unix())
  67. //python参数
  68. var pythonArg string
  69. for _, param := range params {
  70. s := strings.Split(param, COMMA)
  71. pythonArg += PY_PARAM_PREFIX + s[0] + "=" + s[1] + SPACE
  72. }
  73. req := &hpcAC.SubmitPytorchTaskReq{
  74. Params: &hpcAC.SubmitPytorchTaskParams{
  75. TaskName: TASK_PYTORCH_PREFIX + "_" + utils.RandomString(7) + dateStr,
  76. WorkPath: WorkPath,
  77. IsDistributed: false,
  78. IsHvd: false,
  79. //Env:
  80. AcceleratorType: DCU,
  81. Version: imageResp.Image.Version,
  82. ImagePath: imageResp.Image.Path,
  83. WorkerNumber: WORKER_NUMBER,
  84. WorkerCpuNumber: WORKER_CPU_NUMBER,
  85. WorkerGpuNumber: WORKER_GPU_NUMBER,
  86. WorkerRamSize: WORKER_RAM_SIZE,
  87. ResourceGroup: RESOURCE_GROUP,
  88. TimeoutLimit: TimeoutLimit,
  89. PythonCodePath: PythonCodePath,
  90. PythonArg: pythonArg,
  91. },
  92. }
  93. resp, err := s.svcCtx.ACRpc.SubmitPytorchTask(s.ctx, req)
  94. if err != nil {
  95. return nil, err
  96. }
  97. //转换成统一返回类型
  98. submitResp, err := ConvertType[hpcAC.SubmitTaskAiResp](resp, nil)
  99. if err != nil {
  100. return nil, err
  101. }
  102. return submitResp, nil
  103. }
  104. func (s *ShuguangAi) QueryTask(taskId string) (interface{}, error) {
  105. // shuguangAi获取任务
  106. req := &hpcAC.GetPytorchTaskReq{
  107. Id: taskId,
  108. }
  109. resp, err := s.svcCtx.ACRpc.GetPytorchTask(s.ctx, req)
  110. if err != nil {
  111. return nil, err
  112. }
  113. //转换成统一返回类型
  114. taskResp, err := ConvertType[hpcAC.GetPytorchTaskResp](resp, nil)
  115. if err != nil {
  116. return nil, err
  117. }
  118. return taskResp, nil
  119. }
  120. func (s *ShuguangAi) DeleteTask(taskId string) (interface{}, error) {
  121. // shuguangAi删除任务
  122. req := &hpcAC.DeleteTaskAiReq{
  123. Ids: taskId,
  124. }
  125. resp, err := s.svcCtx.ACRpc.DeleteTaskAi(s.ctx, req)
  126. if err != nil {
  127. return nil, err
  128. }
  129. //转换成统一返回类型
  130. deleteResp, err := ConvertType[hpcAC.DeleteTaskAiResp](resp, nil)
  131. if err != nil {
  132. return nil, err
  133. }
  134. return deleteResp, nil
  135. }
  136. func (o *ShuguangAi) QuerySpecs() (interface{}, error) {
  137. // ShuguangAi查询资源规格
  138. req := &hpcAC.GetResourceSpecReq{
  139. AcceleratorType: DCU,
  140. ResourceGroup: RESOURCE_GROUP,
  141. }
  142. specs, err := o.svcCtx.ACRpc.GetResourceSpec(o.ctx, req)
  143. if err != nil {
  144. return nil, err
  145. }
  146. //转换成统一返回类型
  147. specsResp, err := ConvertType[hpcAC.GetResourceSpecResp](specs, o.participant)
  148. if err != nil {
  149. return nil, err
  150. }
  151. return specsResp, nil
  152. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.