You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

slurm.go 6.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. package hpcservice
  2. import (
  3. "context"
  4. "fmt"
  5. "net/http"
  6. "github.com/go-resty/resty/v2"
  7. "github.com/zeromicro/go-zero/core/logx"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/restyclient"
  13. )
  14. type ParticipantHpc struct {
  15. participantId int64
  16. platform string
  17. host string
  18. userName string
  19. accessToken string
  20. *restyclient.RestyClient
  21. }
  // Endpoint paths of the HPC job API. The {clusterId} and {jobId} segments
  // are placeholders that callers fill in through resty path parameters.
  const (
  	// SubmitTaskUrl is the path used to submit a new job.
  	SubmitTaskUrl = "/api/v1/hpc/jobs"

  	// JobStatus is the path used to query the status of a job.
  	JobStatus = "/api/v1/hpc/jobs/status/{clusterId}/{jobId}"

  	// JobDetailUrl is the path used to query the details of a job.
  	JobDetailUrl = "/api/v1/hpc/jobs/detail/{clusterId}/{jobId}"

  	// JobLogUrl is the path used to fetch the logs of a job.
  	JobLogUrl = "/api/v1/hpc/jobs/logs/{clusterId}/{jobId}"

  	// CancelTaskUrl is the path used to cancel a job.
  	CancelTaskUrl = "/api/v1/hpc/jobs/cancel/{clusterId}/{jobId}"

  	// JobResourceUsageUrl is the path used to query the resource usage of a job.
  	JobResourceUsageUrl = "/api/v1/hpc/jobs/resource/usage/{clusterId}/{jobId}"
  )
  30. func NewHpc(host string, id int64, platform string) *ParticipantHpc {
  31. return &ParticipantHpc{
  32. host: host,
  33. participantId: id,
  34. platform: platform,
  35. RestyClient: restyclient.InitClient(host, ""),
  36. }
  37. }
  38. func (c *ParticipantHpc) GetTaskDetail(ctx context.Context, taskId string, clusterId string) (*collector.Task, error) {
  39. reqUrl := c.host + JobDetailUrl
  40. hpcResp := &collector.HpcJobDetailResp{}
  41. httpClient := resty.New().R()
  42. _, err := httpClient.SetHeaders(
  43. map[string]string{
  44. "Content-Type": "application/json",
  45. "traceId": result.TraceIDFromContext(ctx),
  46. }).
  47. SetPathParams(map[string]string{
  48. "clusterId": clusterId,
  49. "jobId": taskId,
  50. }).
  51. SetResult(&hpcResp).
  52. Get(reqUrl)
  53. if err != nil {
  54. return nil, err
  55. }
  56. var resp collector.Task
  57. resp.Id = hpcResp.Data.ID
  58. if !hpcResp.Data.StartTime.IsZero() {
  59. resp.Start = hpcResp.Data.StartTime.Format(constants.Layout)
  60. }
  61. if !hpcResp.Data.EndTime.IsZero() {
  62. resp.End = hpcResp.Data.EndTime.Format(constants.Layout)
  63. }
  64. switch hpcResp.Data.StatusText {
  65. case "COMPLETED":
  66. resp.Status = constants.Completed
  67. case "FAILED":
  68. resp.Status = constants.Failed
  69. case "CREATED_FAILED":
  70. resp.Status = constants.Failed
  71. case "RUNNING":
  72. resp.Status = constants.Running
  73. case "STOPPED":
  74. resp.Status = constants.Stopped
  75. case "PENDING":
  76. resp.Status = constants.Pending
  77. case "WAITING":
  78. resp.Status = constants.Waiting
  79. case "CANCELLED":
  80. resp.Status = constants.Cancelled
  81. default:
  82. resp.Status = "undefined"
  83. }
  84. return &resp, nil
  85. }
  86. func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.SubmitHpcTaskReq) (*types.CommitHpcTaskResp, error) {
  87. reqUrl := c.host + SubmitTaskUrl
  88. resp := types.CommitHpcTaskResp{}
  89. logx.WithContext(ctx).Infof("提交任务到超算集群, url: %s, req: %+v", reqUrl, req)
  90. httpClient := resty.New().R()
  91. _, err := httpClient.SetHeaders(
  92. map[string]string{
  93. "Content-Type": "application/json",
  94. "traceId": result.TraceIDFromContext(ctx),
  95. }).SetBody(req).
  96. SetResult(&resp).
  97. Post(reqUrl)
  98. if err != nil {
  99. return nil, err
  100. }
  101. if resp.Code != http.StatusOK {
  102. return nil, fmt.Errorf(resp.Msg)
  103. }
  104. return &resp, nil
  105. }
  106. func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string, clusterId string) error {
  107. reqUrl := c.host + CancelTaskUrl
  108. resp := types.CommonResp{}
  109. logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId)
  110. httpClient := resty.New().R()
  111. _, err := httpClient.SetHeaders(
  112. map[string]string{
  113. "Content-Type": "application/json",
  114. "traceId": result.TraceIDFromContext(ctx),
  115. }).SetPathParams(map[string]string{
  116. "clusterId": clusterId,
  117. "jobId": jobId,
  118. }).SetResult(&resp).Delete(reqUrl)
  119. if err != nil {
  120. return err
  121. }
  122. if resp.Code != http.StatusOK {
  123. return fmt.Errorf(resp.Msg)
  124. }
  125. return nil
  126. }
  127. func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  128. logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
  129. if jobId == "" {
  130. return nil, fmt.Errorf("jobId is empty")
  131. }
  132. resp := types.CommonResp{}
  133. _, err := c.Request(JobLogUrl, http.MethodGet, func(req *resty.Request) {
  134. req.SetHeaders(map[string]string{
  135. "Content-Type": "application/json",
  136. "traceId": result.TraceIDFromContext(ctx),
  137. }).SetPathParams(map[string]string{
  138. "clusterId": clusterId,
  139. "jobId": jobId,
  140. }).SetResult(&resp)
  141. })
  142. if err != nil {
  143. return nil, err
  144. }
  145. if resp.Code != http.StatusOK {
  146. return nil, fmt.Errorf(resp.Msg)
  147. }
  148. return resp, nil
  149. }
  150. func (c *ParticipantHpc) GetTaskStatus(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  151. logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
  152. if jobId == "" {
  153. return nil, fmt.Errorf("jobId is empty")
  154. }
  155. resp := types.CommonResp{}
  156. _, err := c.Request(JobStatus, http.MethodGet, func(req *resty.Request) {
  157. req.SetHeaders(map[string]string{
  158. "Content-Type": "application/json",
  159. "traceId": result.TraceIDFromContext(ctx),
  160. }).SetPathParams(map[string]string{
  161. "clusterId": clusterId,
  162. "jobId": jobId,
  163. }).SetResult(&resp)
  164. })
  165. if err != nil {
  166. return nil, err
  167. }
  168. if resp.Code != http.StatusOK {
  169. return nil, fmt.Errorf(resp.Msg)
  170. }
  171. return resp, nil
  172. }
  173. func (c *ParticipantHpc) GetTaskResourceUsage(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  174. logx.WithContext(ctx).Infof("获取超算集群任务资源使用情况, url: %s, jobId: %s", JobResourceUsageUrl, jobId)
  175. if jobId == "" {
  176. return nil, fmt.Errorf("jobId is empty")
  177. }
  178. resp := types.CommonResp{}
  179. _, err := c.Request(JobResourceUsageUrl, http.MethodGet, func(req *resty.Request) {
  180. req.SetHeaders(map[string]string{
  181. "Content-Type": "application/json",
  182. "traceId": result.TraceIDFromContext(ctx),
  183. }).SetPathParams(map[string]string{
  184. "clusterId": clusterId,
  185. "jobId": jobId,
  186. }).SetResult(&resp)
  187. })
  188. if err != nil {
  189. return nil, err
  190. }
  191. if resp.Code != http.StatusOK {
  192. return nil, fmt.Errorf(resp.Msg)
  193. }
  194. return resp, nil
  195. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.