You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

slurm.go 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. package hpcservice
  2. import (
  3. "context"
  4. "fmt"
  5. "net/http"
  6. "github.com/go-resty/resty/v2"
  7. "github.com/zeromicro/go-zero/core/logx"
  8. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
  9. "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
  10. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
  11. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
  12. "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/restyclient"
  13. )
  14. type ParticipantHpc struct {
  15. participantId int64
  16. platform string
  17. host string
  18. userName string
  19. accessToken string
  20. *restyclient.RestyClient
  21. }
  // Endpoint paths of the HPC participant API. The {clusterId} and {jobId}
  // placeholders are filled in through resty path parameters.
  const (
  	// SubmitTaskUrl is the job submission endpoint.
  	SubmitTaskUrl = "/api/v1/hpc/jobs"
  	// JobStatus is the job status endpoint.
  	JobStatus = "/api/v1/hpc/jobs/status/{clusterId}/{jobId}"
  	// JobDetailUrl is the job detail endpoint.
  	JobDetailUrl = "/api/v1/hpc/jobs/detail/{clusterId}/{jobId}"
  	// JobLogUrl is the job log endpoint.
  	JobLogUrl = "/api/v1/hpc/jobs/logs/{clusterId}/{jobId}"
  	// CancelTaskUrl is the job cancellation endpoint.
  	CancelTaskUrl = "/api/v1/hpc/jobs/cancel/{clusterId}/{jobId}"
  	// JobResourceUsageUrl is the job resource usage endpoint.
  	JobResourceUsageUrl = "/api/v1/hpc/jobs/resource/usage/{clusterId}/{jobId}"
  	// JobFailureAnalyze is the task failure analysis endpoint.
  	JobFailureAnalyze = "/api/v1/hpc/task/analyze"
  )
  31. func NewHpc(host string, id int64, platform string) *ParticipantHpc {
  32. return &ParticipantHpc{
  33. host: host,
  34. participantId: id,
  35. platform: platform,
  36. RestyClient: restyclient.InitClient(host, ""),
  37. }
  38. }
  39. func (c *ParticipantHpc) GetTaskDetail(ctx context.Context, taskId string, clusterId string) (*collector.Task, error) {
  40. reqUrl := c.host + JobDetailUrl
  41. hpcResp := &collector.HpcJobDetailResp{}
  42. httpClient := resty.New().R()
  43. _, err := httpClient.SetHeaders(
  44. map[string]string{
  45. "Content-Type": "application/json",
  46. "traceId": result.TraceIDFromContext(ctx),
  47. }).
  48. SetPathParams(map[string]string{
  49. "clusterId": clusterId,
  50. "jobId": taskId,
  51. }).
  52. SetResult(&hpcResp).
  53. Get(reqUrl)
  54. if err != nil {
  55. return nil, err
  56. }
  57. var resp collector.Task
  58. resp.Id = hpcResp.Data.ID
  59. if !hpcResp.Data.StartTime.IsZero() {
  60. resp.Start = hpcResp.Data.StartTime.Format(constants.Layout)
  61. }
  62. if !hpcResp.Data.EndTime.IsZero() {
  63. resp.End = hpcResp.Data.EndTime.Format(constants.Layout)
  64. }
  65. switch hpcResp.Data.StatusText {
  66. case "COMPLETED":
  67. resp.Status = constants.Completed
  68. case "FAILED":
  69. resp.Status = constants.Failed
  70. case "CREATED_FAILED":
  71. resp.Status = constants.Failed
  72. case "RUNNING":
  73. resp.Status = constants.Running
  74. case "STOPPED":
  75. resp.Status = constants.Stopped
  76. case "PENDING":
  77. resp.Status = constants.Pending
  78. case "WAITING":
  79. resp.Status = constants.Waiting
  80. case "CANCELLED":
  81. resp.Status = constants.Cancelled
  82. default:
  83. resp.Status = "undefined"
  84. }
  85. return &resp, nil
  86. }
  87. func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.SubmitHpcTaskReq) (*types.CommitHpcTaskResp, error) {
  88. reqUrl := c.host + SubmitTaskUrl
  89. resp := types.CommitHpcTaskResp{}
  90. logx.WithContext(ctx).Infof("提交任务到超算集群, url: %s, req: %+v", reqUrl, req)
  91. httpClient := resty.New().R()
  92. _, err := httpClient.SetHeaders(
  93. map[string]string{
  94. "Content-Type": "application/json",
  95. "traceId": result.TraceIDFromContext(ctx),
  96. }).SetBody(req).
  97. SetResult(&resp).
  98. Post(reqUrl)
  99. if err != nil {
  100. return nil, err
  101. }
  102. if resp.Code != http.StatusOK {
  103. return nil, fmt.Errorf(resp.Msg)
  104. }
  105. return &resp, nil
  106. }
  107. func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string, clusterId string) error {
  108. reqUrl := c.host + CancelTaskUrl
  109. resp := types.CommonResp{}
  110. logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId)
  111. httpClient := resty.New().R()
  112. _, err := httpClient.SetHeaders(
  113. map[string]string{
  114. "Content-Type": "application/json",
  115. "traceId": result.TraceIDFromContext(ctx),
  116. }).SetPathParams(map[string]string{
  117. "clusterId": clusterId,
  118. "jobId": jobId,
  119. }).SetResult(&resp).Delete(reqUrl)
  120. if err != nil {
  121. return err
  122. }
  123. if resp.Code != http.StatusOK {
  124. return fmt.Errorf(resp.Msg)
  125. }
  126. return nil
  127. }
  128. func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  129. logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
  130. if jobId == "" {
  131. return nil, fmt.Errorf("jobId is empty")
  132. }
  133. resp := types.CommonResp{}
  134. _, err := c.Request(JobLogUrl, http.MethodGet, func(req *resty.Request) {
  135. req.SetHeaders(map[string]string{
  136. "Content-Type": "application/json",
  137. "traceId": result.TraceIDFromContext(ctx),
  138. }).SetPathParams(map[string]string{
  139. "clusterId": clusterId,
  140. "jobId": jobId,
  141. }).SetResult(&resp)
  142. })
  143. if err != nil {
  144. return nil, err
  145. }
  146. if resp.Code != http.StatusOK {
  147. return nil, fmt.Errorf(resp.Msg)
  148. }
  149. return resp, nil
  150. }
  151. func (c *ParticipantHpc) GetTaskStatus(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  152. logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
  153. if jobId == "" {
  154. return nil, fmt.Errorf("jobId is empty")
  155. }
  156. resp := types.CommonResp{}
  157. _, err := c.Request(JobStatus, http.MethodGet, func(req *resty.Request) {
  158. req.SetHeaders(map[string]string{
  159. "Content-Type": "application/json",
  160. "traceId": result.TraceIDFromContext(ctx),
  161. }).SetPathParams(map[string]string{
  162. "clusterId": clusterId,
  163. "jobId": jobId,
  164. }).SetResult(&resp)
  165. })
  166. if err != nil {
  167. return nil, err
  168. }
  169. if resp.Code != http.StatusOK {
  170. return nil, fmt.Errorf(resp.Msg)
  171. }
  172. return resp, nil
  173. }
  174. func (c *ParticipantHpc) GetTaskResourceUsage(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  175. logx.WithContext(ctx).Infof("获取超算集群任务资源使用情况, url: %s, jobId: %s", JobResourceUsageUrl, jobId)
  176. if jobId == "" {
  177. return nil, fmt.Errorf("jobId is empty")
  178. }
  179. resp := types.CommonResp{}
  180. _, err := c.Request(JobResourceUsageUrl, http.MethodGet, func(req *resty.Request) {
  181. req.SetHeaders(map[string]string{
  182. "Content-Type": "application/json",
  183. "traceId": result.TraceIDFromContext(ctx),
  184. }).SetPathParams(map[string]string{
  185. "clusterId": clusterId,
  186. "jobId": jobId,
  187. }).SetResult(&resp)
  188. })
  189. if err != nil {
  190. return nil, err
  191. }
  192. if resp.Code != http.StatusOK {
  193. return nil, fmt.Errorf(resp.Msg)
  194. }
  195. return resp, nil
  196. }
  197. func (c *ParticipantHpc) GetHpcTaskFailureAnalyze(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
  198. logx.WithContext(ctx).Infof("获取超算集群任务失败分析, url: %s, jobId: %s", JobFailureAnalyze, jobId)
  199. if jobId == "" {
  200. return nil, fmt.Errorf("jobId is empty")
  201. }
  202. resp := types.CommonResp{}
  203. _, err := c.Request(JobFailureAnalyze, http.MethodPost, func(req *resty.Request) {
  204. req.SetHeaders(map[string]string{
  205. "Content-Type": "application/json",
  206. "traceId": result.TraceIDFromContext(ctx),
  207. }).SetBody(map[string]string{
  208. "JobId": jobId,
  209. "clusterId": clusterId,
  210. "clusterType": "hpc",
  211. }).SetResult(&resp)
  212. })
  213. if err != nil {
  214. return nil, err
  215. }
  216. if resp.Code != http.StatusOK {
  217. return nil, fmt.Errorf(resp.Msg)
  218. }
  219. return resp, nil
  220. }

PCM is positioned as a software stack over the cloud, aiming to build the standards and ecosystem of heterogeneous cloud collaboration for JCC in a non-intrusive, autonomous, peer-to-peer manner.