|
- package hpcservice
-
- import (
- "context"
- "fmt"
- "net/http"
-
- "github.com/go-resty/resty/v2"
- "github.com/zeromicro/go-zero/core/logx"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/scheduler/service/collector"
- "gitlink.org.cn/JointCloud/pcm-coordinator/internal/types"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/constants"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/repository/result"
- "gitlink.org.cn/JointCloud/pcm-coordinator/pkg/utils/restyclient"
- )
-
- type ParticipantHpc struct {
- participantId int64
- platform string
- host string
- userName string
- accessToken string
- *restyclient.RestyClient
- }
-
// Endpoint paths of the HPC participant API. Paths containing {clusterId} and
// {jobId} are templates whose placeholders are filled through resty path
// parameters by the methods in this file.
const (
	// Job lifecycle.
	SubmitTaskUrl = "/api/v1/hpc/jobs"
	CancelTaskUrl = "/api/v1/hpc/jobs/cancel/{clusterId}/{jobId}"

	// Job inspection.
	JobStatus           = "/api/v1/hpc/jobs/status/{clusterId}/{jobId}"
	JobDetailUrl        = "/api/v1/hpc/jobs/detail/{clusterId}/{jobId}"
	JobLogUrl           = "/api/v1/hpc/jobs/logs/{clusterId}/{jobId}"
	JobResourceUsageUrl = "/api/v1/hpc/jobs/resource/usage/{clusterId}/{jobId}"

	// Failure analysis; takes its arguments in the request body rather than
	// in the path.
	JobFailureAnalyze = "/api/v1/hpc/task/analyze"
)
-
- func NewHpc(host string, id int64, platform string) *ParticipantHpc {
- return &ParticipantHpc{
- host: host,
- participantId: id,
- platform: platform,
- RestyClient: restyclient.InitClient(host, ""),
- }
- }
-
- func (c *ParticipantHpc) GetTaskDetail(ctx context.Context, taskId string, clusterId string) (*collector.Task, error) {
- reqUrl := c.host + JobDetailUrl
- hpcResp := &collector.HpcJobDetailResp{}
- httpClient := resty.New().R()
- _, err := httpClient.SetHeaders(
- map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).
- SetPathParams(map[string]string{
- "clusterId": clusterId,
- "jobId": taskId,
- }).
- SetResult(&hpcResp).
- Get(reqUrl)
- if err != nil {
- return nil, err
- }
- var resp collector.Task
- resp.Id = hpcResp.Data.ID
- if !hpcResp.Data.StartTime.IsZero() {
- resp.Start = hpcResp.Data.StartTime.Format(constants.Layout)
- }
- if !hpcResp.Data.EndTime.IsZero() {
- resp.End = hpcResp.Data.EndTime.Format(constants.Layout)
- }
- switch hpcResp.Data.StatusText {
- case "COMPLETED":
- resp.Status = constants.Completed
- case "FAILED":
- resp.Status = constants.Failed
- case "CREATED_FAILED":
- resp.Status = constants.Failed
- case "RUNNING":
- resp.Status = constants.Running
- case "STOPPED":
- resp.Status = constants.Stopped
- case "PENDING":
- resp.Status = constants.Pending
- case "WAITING":
- resp.Status = constants.Waiting
- case "CANCELLED":
- resp.Status = constants.Cancelled
- default:
- resp.Status = "undefined"
- }
-
- return &resp, nil
- }
-
- func (c *ParticipantHpc) SubmitTask(ctx context.Context, req types.SubmitHpcTaskReq) (*types.CommitHpcTaskResp, error) {
- reqUrl := c.host + SubmitTaskUrl
- resp := types.CommitHpcTaskResp{}
- logx.WithContext(ctx).Infof("提交任务到超算集群, url: %s, req: %+v", reqUrl, req)
- httpClient := resty.New().R()
- _, err := httpClient.SetHeaders(
- map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetBody(req).
- SetResult(&resp).
- Post(reqUrl)
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- return nil, fmt.Errorf(resp.Msg)
- }
- return &resp, nil
- }
-
- func (c *ParticipantHpc) CancelTask(ctx context.Context, jobId string, clusterId string) error {
- reqUrl := c.host + CancelTaskUrl
- resp := types.CommonResp{}
- logx.WithContext(ctx).Infof("取消超算集群任务, url: %s, jobId: %s", reqUrl, jobId)
- httpClient := resty.New().R()
- _, err := httpClient.SetHeaders(
- map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetPathParams(map[string]string{
- "clusterId": clusterId,
- "jobId": jobId,
- }).SetResult(&resp).Delete(reqUrl)
- if err != nil {
- return err
- }
- if resp.Code != http.StatusOK {
- return fmt.Errorf(resp.Msg)
- }
- return nil
- }
-
- func (c *ParticipantHpc) GetTaskLogs(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
- logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
- if jobId == "" {
- return nil, fmt.Errorf("jobId is empty")
- }
- resp := types.CommonResp{}
- _, err := c.Request(JobLogUrl, http.MethodGet, func(req *resty.Request) {
- req.SetHeaders(map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetPathParams(map[string]string{
- "clusterId": clusterId,
- "jobId": jobId,
- }).SetResult(&resp)
- })
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- return nil, fmt.Errorf(resp.Msg)
- }
- return resp, nil
- }
-
- func (c *ParticipantHpc) GetTaskStatus(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
- logx.WithContext(ctx).Infof("获取超算集群任务日志, url: %s, jobId: %s", JobLogUrl, jobId)
- if jobId == "" {
- return nil, fmt.Errorf("jobId is empty")
- }
- resp := types.CommonResp{}
- _, err := c.Request(JobStatus, http.MethodGet, func(req *resty.Request) {
- req.SetHeaders(map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetPathParams(map[string]string{
- "clusterId": clusterId,
- "jobId": jobId,
- }).SetResult(&resp)
- })
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- return nil, fmt.Errorf(resp.Msg)
- }
- return resp, nil
- }
-
- func (c *ParticipantHpc) GetTaskResourceUsage(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
- logx.WithContext(ctx).Infof("获取超算集群任务资源使用情况, url: %s, jobId: %s", JobResourceUsageUrl, jobId)
- if jobId == "" {
- return nil, fmt.Errorf("jobId is empty")
- }
- resp := types.CommonResp{}
- _, err := c.Request(JobResourceUsageUrl, http.MethodGet, func(req *resty.Request) {
- req.SetHeaders(map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetPathParams(map[string]string{
- "clusterId": clusterId,
- "jobId": jobId,
- }).SetResult(&resp)
- })
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- return nil, fmt.Errorf(resp.Msg)
- }
- return resp, nil
- }
-
- func (c *ParticipantHpc) GetHpcTaskFailureAnalyze(ctx context.Context, jobId string, clusterId string) (interface{}, error) {
- logx.WithContext(ctx).Infof("获取超算集群任务失败分析, url: %s, jobId: %s", JobFailureAnalyze, jobId)
- if jobId == "" {
- return nil, fmt.Errorf("jobId is empty")
- }
- resp := types.CommonResp{}
- _, err := c.Request(JobFailureAnalyze, http.MethodPost, func(req *resty.Request) {
- req.SetHeaders(map[string]string{
- "Content-Type": "application/json",
- "traceId": result.TraceIDFromContext(ctx),
- }).SetBody(map[string]string{
- "JobId": jobId,
- "clusterId": clusterId,
- "clusterType": "hpc",
- }).SetResult(&resp)
- })
- if err != nil {
- return nil, err
- }
- if resp.Code != http.StatusOK {
- return nil, fmt.Errorf(resp.Msg)
- }
- return resp, nil
- }
|