You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

shuguangHpc.go 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268
  1. package storeLink
  2. import (
  3. "context"
  4. "errors"
  5. "fmt"
  6. "gitlink.org.cn/JointCloud/pcm-coordinator/api/internal/types"
  7. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils"
  8. "gitlink.org.cn/jcce-pcm/pcm-ac/hpcAC"
  9. "gitlink.org.cn/jcce-pcm/pcm-ac/hpcacclient"
  10. "strconv"
  11. "strings"
  12. )
  13. type ShuguangHpc struct {
  14. aCRpc hpcacclient.HpcAC
  15. platform string
  16. participantId int64
  17. }
  // Fixed values used when building Shuguang HPC job requests.
  const (
  	// Task naming: SubmitTask prefixes generated task names with
  	// TASK_SHUGUANG_PREFIX and substitutes the task name for every JOBNAME
  	// placeholder in the path templates below.
  	TASK_SHUGUANG_PREFIX = "ShuguangHPC"
  	JOBNAME              = "JOBNAME"

  	// Path templates; "JOBNAME" is replaced per task by SubmitTask.
  	GAP_WORK_DIR     = "/work/home/acgnnmfbwo/BASE/JOBNAME"
  	GAP_STD_OUT_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.out.%j"
  	GAP_STD_ERR_FILE = "/work/home/acgnnmfbwo/BASE/JOBNAME/std.err.%j"

  	// Pieces used by SubmitTask to render environment entries.
  	EXPORT  = "export"
  	NEWLINE = "\n"

  	// Application identity sent with submit requests.
  	Apptype     = "BASIC"
  	GAP_APPNAME = "BASE"

  	// StrJobManagerID is sent with submit, history-detail and delete requests.
  	StrJobManagerID = 1637920656
  	// StrJobInfoMap is the format DeleteTask fills with StrJobManagerID,
  	// Username and the task id.
  	StrJobInfoMap = "%d,%s:%s:"
  	Username      = "acgnnmfbwo"

  	// Submit-request defaults. GAP_NNODE, GAP_NPROC and GAP_NDCU are
  	// overwritten per resource id by updateSGHpcRequestByResourceId.
  	GAP_CMD_FILE      = "cmd" // sent as GAP_SUBMIT_TYPE by SubmitTask
  	GAP_QUEUE         = "wzhdtest"
  	GAP_WALL_TIME_24H = "24:00:00"
  	GAP_NNODE         = "1" // number of nodes
  	GAP_NPROC         = "1"
  	GAP_NDCU          = "1"
  	GAP_NODE_STRING   = ""
  	GAP_EXCLUSIVE     = ""
  	GAP_PPN           = ""
  	GAP_NGPU          = ""
  	GAP_MULTI_SUB     = ""
  )
  43. var RESOURCEMAP = map[string]ResourceSpecHpc{
  44. "FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": {
  45. GAP_NNODE: "1",
  46. GAP_NPROC: "1",
  47. GAP_NDCU: "1",
  48. },
  49. "Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": {
  50. GAP_NNODE: "1",
  51. GAP_NPROC: "2",
  52. GAP_NDCU: "1",
  53. },
  54. "uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": {
  55. GAP_NNODE: "1",
  56. GAP_NPROC: "4",
  57. GAP_NDCU: "2",
  58. },
  59. "D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": {
  60. GAP_NNODE: "1",
  61. GAP_NPROC: "8",
  62. GAP_NDCU: "4",
  63. },
  64. "sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": {
  65. GAP_NNODE: "1",
  66. GAP_NPROC: "16",
  67. GAP_NDCU: "4",
  68. },
  69. "ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": {
  70. GAP_NNODE: "1",
  71. GAP_NPROC: "32",
  72. GAP_NDCU: "4",
  73. },
  74. "cfEI4ulTNo2gYUozzdG59URByUjwLl3x": {
  75. GAP_NNODE: "2",
  76. GAP_NPROC: "4",
  77. GAP_NDCU: "2",
  78. },
  79. "vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": {
  80. GAP_NNODE: "2",
  81. GAP_NPROC: "8",
  82. GAP_NDCU: "4",
  83. },
  84. "QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": {
  85. GAP_NNODE: "2",
  86. GAP_NPROC: "16",
  87. GAP_NDCU: "4",
  88. },
  89. "79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": {
  90. GAP_NNODE: "2",
  91. GAP_NPROC: "32",
  92. GAP_NDCU: "8",
  93. },
  94. }
  // RESOURCESPECSHPC maps each supported resource id to its display name.
  // SubmitTask uses it to validate resource ids and QuerySpecs lists its
  // entries as the available specs.
  var RESOURCESPECSHPC = map[string]string{
  	// Single-node specs.
  	"FPOqD5Cx8iNYqawEgDrAxLdrszp4Tmhl": "1*NODE, CPU:1, 1*DCU",
  	"Nd99eGNoBFC2ZTycKDlqD37heWTOmrMS": "1*NODE, CPU:2, 1*DCU",
  	"uAmLkz6jgSZkC6o8JywG7Yo2aiFPPOBO": "1*NODE, CPU:4, 2*DCU",
  	"D71OZQYrRabJc2nfL2GDWOdLEfbiMzYH": "1*NODE, CPU:8, 4*DCU",
  	"sXUMrGmgMDFJaLi6dPiB9LkHjFb3lvL5": "1*NODE, CPU:16, 4*DCU",
  	"ZfCKQKbNbQl9RPwlSyWLah1Gf7Ti7uJA": "1*NODE, CPU:32, 4*DCU",

  	// Two-node specs.
  	"cfEI4ulTNo2gYUozzdG59URByUjwLl3x": "2*NODE, CPU:4, 2*DCU",
  	"vtbkaks8bErhpLRkUDiPDUHq6ssotFpD": "2*NODE, CPU:8, 4*DCU",
  	"QJXZFJSReVWWQfkvQjGyEq1JpDHN55Oh": "2*NODE, CPU:16, 4*DCU",
  	"79xSdy48yLbVLl9DqEV6tQ2J6jaHe5KO": "2*NODE, CPU:32, 8*DCU",
  }
  // AcStatus maps "stat*" job status keys to the status name reported for them.
  var AcStatus = map[string]string{
  	// Keys reported as pending.
  	"statQ": "Pending",
  	"statE": "Pending",
  	"statH": "Pending",
  	"statS": "Pending",
  	"statW": "Pending",

  	// Keys with their own status.
  	"statR": "Running",
  	"statC": "Completed",
  	"statX": "Other",
  }
  // ResourceSpecHpc holds the per-resource values that
  // updateSGHpcRequestByResourceId copies into the same-named fields of a
  // submit request's MapAppJobInfo. All three are kept as strings.
  type ResourceSpecHpc struct {
  	GAP_NNODE, GAP_NPROC, GAP_NDCU string
  }
  122. func NewShuguangHpc(aCRpc hpcacclient.HpcAC, name string, id int64) *ShuguangHpc {
  123. return &ShuguangHpc{aCRpc: aCRpc, platform: name, participantId: id}
  124. }
  125. func (s ShuguangHpc) UploadImage(ctx context.Context, path string) (interface{}, error) {
  126. return nil, nil
  127. }
  128. func (s ShuguangHpc) DeleteImage(ctx context.Context, imageId string) (interface{}, error) {
  129. return nil, nil
  130. }
  131. func (s ShuguangHpc) QueryImageList(ctx context.Context) (interface{}, error) {
  132. return nil, nil
  133. }
  134. func (s ShuguangHpc) SubmitTask(ctx context.Context, imageId string, cmd string, envs []string, params []string, resourceId string, datasetsId string, algorithmId string, aiType string) (interface{}, error) {
  135. // shuguangHpc提交任务
  136. //判断是否resourceId匹配自定义资源Id
  137. _, isMapContainsKey := RESOURCESPECSHPC[resourceId]
  138. if !isMapContainsKey {
  139. return nil, errors.New("shuguangHpc资源Id不存在")
  140. }
  141. //环境变量
  142. var env string
  143. for _, e := range envs {
  144. s := strings.Split(e, COMMA)
  145. env += EXPORT + SPACE + s[0] + EQUAL + s[1] + NEWLINE
  146. }
  147. //请求
  148. taskName := TASK_SHUGUANG_PREFIX + UNDERSCORE + utils.RandomString(10)
  149. GAP_WORK_DIR := strings.Replace(GAP_WORK_DIR, JOBNAME, taskName, -1)
  150. GAP_STD_OUT_FILE := strings.Replace(GAP_STD_OUT_FILE, JOBNAME, taskName, -1)
  151. GAP_STD_ERR_FILE := strings.Replace(GAP_STD_ERR_FILE, JOBNAME, taskName, -1)
  152. req := &hpcAC.SubmitJobReq{
  153. Apptype: Apptype,
  154. Appname: GAP_APPNAME,
  155. StrJobManagerID: StrJobManagerID,
  156. MapAppJobInfo: &hpcAC.MapAppJobInfo{
  157. GAP_CMD_FILE: cmd,
  158. GAP_NNODE: GAP_NNODE,
  159. GAP_NODE_STRING: GAP_NODE_STRING,
  160. GAP_SUBMIT_TYPE: GAP_CMD_FILE,
  161. GAP_JOB_NAME: taskName,
  162. GAP_WORK_DIR: GAP_WORK_DIR,
  163. GAP_QUEUE: GAP_QUEUE,
  164. GAP_NPROC: GAP_NPROC,
  165. GAP_PPN: GAP_PPN,
  166. GAP_NGPU: GAP_NGPU,
  167. GAP_NDCU: GAP_NDCU,
  168. GAP_WALL_TIME: GAP_WALL_TIME_24H,
  169. GAP_EXCLUSIVE: GAP_EXCLUSIVE,
  170. GAP_APPNAME: GAP_APPNAME,
  171. GAP_MULTI_SUB: GAP_MULTI_SUB,
  172. GAP_STD_OUT_FILE: GAP_STD_OUT_FILE,
  173. GAP_STD_ERR_FILE: GAP_STD_ERR_FILE,
  174. GAP_SCHEDULER_OPT_WEB: env,
  175. },
  176. }
  177. updateSGHpcRequestByResourceId(resourceId, req)
  178. resp, err := s.aCRpc.SubmitJob(ctx, req)
  179. if err != nil {
  180. return nil, err
  181. }
  182. return resp, nil
  183. }
  184. func (s ShuguangHpc) QueryTask(ctx context.Context, taskId string) (interface{}, error) {
  185. //实时作业
  186. reqC := &hpcAC.JobDetailReq{
  187. JobId: taskId,
  188. }
  189. respC, err := s.aCRpc.GetJobDetail(ctx, reqC)
  190. if err != nil {
  191. return nil, err
  192. }
  193. //实时作业检查是否成功
  194. if respC.Data != nil && respC.Data.JobEndTime != "" {
  195. return respC, nil
  196. } else {
  197. //历史作业
  198. reqH := &hpcAC.HistoryJobDetailReq{
  199. JobId: taskId,
  200. JobmanagerId: strconv.Itoa(StrJobManagerID),
  201. }
  202. respH, err := s.aCRpc.HistoryJobDetail(ctx, reqH)
  203. if err != nil {
  204. return nil, err
  205. }
  206. return respH, nil
  207. }
  208. }
  209. func (s ShuguangHpc) QuerySpecs(ctx context.Context) (interface{}, error) {
  210. resp := &types.GetResourceSpecsResp{}
  211. for k, v := range RESOURCESPECSHPC {
  212. var respec types.ResourceSpecSl
  213. respec.SpecId = k
  214. respec.SpecName = v
  215. respec.ParticipantId = s.participantId
  216. respec.ParticipantName = s.platform
  217. resp.ResourceSpecs = append(resp.ResourceSpecs, &respec)
  218. }
  219. resp.Success = true
  220. return resp, nil
  221. }
  222. func (s ShuguangHpc) DeleteTask(ctx context.Context, taskId string) (interface{}, error) {
  223. strJobInfoMap := fmt.Sprintf(StrJobInfoMap, StrJobManagerID, Username, taskId)
  224. req := &hpcAC.DeleteJobReq{
  225. StrJobInfoMap: strJobInfoMap,
  226. }
  227. resp, err := s.aCRpc.DeleteJob(ctx, req)
  228. if err != nil {
  229. return nil, err
  230. }
  231. return resp, nil
  232. }
  233. func updateSGHpcRequestByResourceId(resourceId string, req *hpcAC.SubmitJobReq) {
  234. spec := RESOURCEMAP[resourceId]
  235. req.MapAppJobInfo.GAP_NNODE = spec.GAP_NNODE
  236. req.MapAppJobInfo.GAP_NPROC = spec.GAP_NPROC
  237. req.MapAppJobInfo.GAP_NDCU = spec.GAP_NDCU
  238. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive and autonomous peer-to-peer manner.